#include <array>
const char* rocfft_complex_h {
R"_PY_EMBED_(
// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_COMPLEX_H
#define ROCFFT_COMPLEX_H

#if !defined(__HIPCC_RTC__)
#endif

#ifdef __HIP_PLATFORM_NVIDIA__
typedef __half rocfft_fp16;
#else
typedef _Float16 rocfft_fp16;
#endif

// Complex-number value type parameterized on the real scalar type Treal.
//
// The object holds exactly two Treal members: x (real part) followed by
// y (imaginary part). All special member functions are defaulted so the
// members are left uninitialized by the default constructor.
//
// Every arithmetic operator that does not modify the left operand is
// const-qualified, so it can be applied to const objects and temporaries.
template <typename Treal>
struct rocfft_complex
{
    Treal x; // Real part
    Treal y; // Imaginary part

    // Constructors
    // Do not initialize the members x or y by default, to ensure that it can
    // be used in __shared__ and that it is a trivial class compatible with C.
    __device__ __host__ rocfft_complex()                      = default;
    __device__ __host__ rocfft_complex(const rocfft_complex&) = default;
    __device__ __host__ rocfft_complex(rocfft_complex&&)      = default;
    __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default;
    __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default;
    __device__                          __host__ ~rocfft_complex()        = default;

    // Constructor from real and imaginary parts
    __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag)
        : x{real}
        , y{imag}
    {
    }

    // Conversion from different precision; explicit, so a precision change
    // never happens silently.
    template <typename U>
    __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z)
        : x(z.x)
        , y(z.y)
    {
    }

    // Accessors
    __device__ __host__ constexpr Treal real() const
    {
        return x;
    }

    __device__ __host__ constexpr Treal imag() const
    {
        return y;
    }

    // Mutators
    __device__ __host__ void real(const Treal new_x)
    {
        x = new_x;
    }

    __device__ __host__ void imag(const Treal new_y)
    {
        y = new_y;
    }

    // Unary operations
    __forceinline__ __device__ __host__ rocfft_complex operator-() const
    {
        return {-x, -y};
    }

    __forceinline__ __device__ __host__ rocfft_complex operator+() const
    {
        return *this;
    }

    // Returns |z.x| + |z.y| for the argument z. The object the function is
    // called on is not read, hence the const qualifier.
    __device__ __host__ Treal asum(const rocfft_complex& z) const
    {
        return abs(z.x) + abs(z.y);
    }

    // Internal real functions
    // Absolute value of a real scalar.
    static __forceinline__ __device__ __host__ Treal abs(Treal x)
    {
        return x < 0 ? -x : x;
    }

    // Single-precision square root.
    static __forceinline__ __device__ __host__ float sqrt(float x)
    {
        return ::sqrtf(x);
    }

    // Double-precision square root.
    static __forceinline__ __device__ __host__ double sqrt(double x)
    {
        return ::sqrt(x);
    }

    // Addition operators
    __device__ __host__ auto& operator+=(const rocfft_complex& rhs)
    {
        return *this = {x + rhs.x, y + rhs.y};
    }

    __device__ __host__ auto operator+(const rocfft_complex& rhs) const
    {
        auto lhs = *this;
        return lhs += rhs;
    }

    // Subtraction operators
    __device__ __host__ auto& operator-=(const rocfft_complex& rhs)
    {
        return *this = {x - rhs.x, y - rhs.y};
    }

    __device__ __host__ auto operator-(const rocfft_complex& rhs) const
    {
        auto lhs = *this;
        return lhs -= rhs;
    }

    // Multiplication operators
    // The product is built as a temporary before being assigned, so the
    // result is correct even when rhs refers to *this.
    __device__ __host__ auto& operator*=(const rocfft_complex& rhs)
    {
        return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y};
    }

    __device__ __host__ auto operator*(const rocfft_complex& rhs) const
    {
        auto lhs = *this;
        return lhs *= rhs;
    }

    // Division operators
    __device__ __host__ auto& operator/=(const rocfft_complex& rhs)
    {
        // Form of Robert L. Smith's Algorithm 116: the divisor component with
        // the larger magnitude is used as the denominator of the ratio.
        if(abs(rhs.x) > abs(rhs.y))
        {
            Treal ratio = rhs.y / rhs.x;
            Treal scale = 1 / (rhs.x + rhs.y * ratio);
            *this       = {(x + y * ratio) * scale, (y - x * ratio) * scale};
        }
        else
        {
            Treal ratio = rhs.x / rhs.y;
            Treal scale = 1 / (rhs.x * ratio + rhs.y);
            *this       = {(y + x * ratio) * scale, (y * ratio - x) * scale};
        }
        return *this;
    }

    __device__ __host__ auto operator/(const rocfft_complex& rhs) const
    {
        auto lhs = *this;
        return lhs /= rhs;
    }

    // Comparison operators
    __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const
    {
        return x == rhs.x && y == rhs.y;
    }

    __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const
    {
        return !(*this == rhs);
    }

    // Operators for complex-real computations
    // A real operand is converted to Treal; addition and subtraction only
    // touch the real part, multiplication and division scale both parts.
    template <typename U>
    __device__ __host__ auto& operator+=(const U& rhs)
    {
        return (x += Treal(rhs)), *this;
    }

    template <typename U>
    __device__ __host__ auto& operator-=(const U& rhs)
    {
        return (x -= Treal(rhs)), *this;
    }

    // const-qualified like the scalar operator* and operator/ below, so that
    // "const complex + real" is accepted.
    __device__ __host__ auto operator+(const Treal& rhs) const
    {
        auto lhs = *this;
        return lhs += rhs;
    }

    // const-qualified for the same reason as operator+(const Treal&).
    __device__ __host__ auto operator-(const Treal& rhs) const
    {
        auto lhs = *this;
        return lhs -= rhs;
    }

    template <typename U>
    __device__ __host__ auto& operator*=(const U& rhs)
    {
        return (x *= Treal(rhs)), (y *= Treal(rhs)), *this;
    }

    template <typename U>
    __device__ __host__ auto operator*(const U& rhs) const
    {
        auto lhs = *this;
        return lhs *= Treal(rhs);
    }

    template <typename U>
    __device__ __host__ auto& operator/=(const U& rhs)
    {
        return (x /= Treal(rhs)), (y /= Treal(rhs)), *this;
    }

    template <typename U>
    __device__ __host__ auto operator/(const U& rhs) const
    {
        auto lhs = *this;
        return lhs /= Treal(rhs);
    }

    // A complex value equals a real value when the real parts match and the
    // imaginary part is zero.
    template <typename U>
    __device__ __host__ constexpr bool operator==(const U& rhs) const
    {
        return x == Treal(rhs) && y == 0;
    }

    template <typename U>
    __device__ __host__ constexpr bool operator!=(const U& rhs) const
    {
        return !(*this == rhs);
    }
};

// Stream operators
#if !defined(__HIPCC_RTC__)
// Writes a half-precision value to the stream after widening it to double.
static std::ostream& operator<<(std::ostream& stream, const rocfft_fp16& f)
{
    const double widened = static_cast<double>(f);
    stream << widened;
    return stream;
}

// Writes a complex value as "(real,imag)"; both parts are widened to double.
template <typename Treal>
std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z)
{
    out << '(' << static_cast<double>(z.x);
    out << ',' << static_cast<double>(z.y);
    out << ')';
    return out;
}
#endif

// Operators for real-complex computations
// real + complex: the real operand (converted to Treal) is added to the real
// part; the imaginary part is passed through unchanged.
template <typename U, typename Treal>
__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs)
{
    const Treal real_sum = Treal(lhs) + rhs.x;
    return rocfft_complex<Treal>(real_sum, rhs.y);
}

// real - complex: subtracts the real part from the real operand (converted to
// Treal) and negates the imaginary part.
template <typename U, typename Treal>
__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs)
{
    const Treal real_diff = Treal(lhs) - rhs.x;
    return rocfft_complex<Treal>(real_diff, -rhs.y);
}

// real * complex: scales both parts by the real operand converted to Treal.
template <typename U, typename Treal>
__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs)
{
    const Treal factor = Treal(lhs);
    return rocfft_complex<Treal>(factor * rhs.x, factor * rhs.y);
}

// real / complex, using a form of Robert L. Smith's Algorithm 116: the ratio
// is formed with the larger-magnitude divisor component as its denominator.
template <typename U, typename Treal>
__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs)
{
    using complex_t = rocfft_complex<Treal>;

    const bool real_part_dominates = complex_t::abs(rhs.x) > complex_t::abs(rhs.y);
    if(!real_part_dominates)
    {
        // |rhs.x| is not greater than |rhs.y|: normalize by the imaginary part.
        const Treal ratio = rhs.x / rhs.y;
        const Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y);
        return {ratio * scale, -scale};
    }

    // |rhs.x| > |rhs.y|: normalize by the real part.
    const Treal ratio = rhs.y / rhs.x;
    const Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio);
    return {scale, -scale * ratio};
}

// real == complex: true when the real parts compare equal and the imaginary
// part compares equal to zero.
template <typename U, typename Treal>
__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs)
{
    return (Treal(lhs) == rhs.x) ? (0 == rhs.y) : false;
}

// real != complex: logical negation of the matching equality comparison.
template <typename U, typename Treal>
__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs)
{
    return (lhs == rhs) ? false : true;
}

// Extending std namespace to handle rocfft_complex datatype
namespace std
{
    // Real part of z.
    template <typename Treal>
    __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z)
    {
        return z.real();
    }

    // Imaginary part of z.
    template <typename Treal>
    __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z)
    {
        return z.imag();
    }

    // Complex conjugate: same real part, negated imaginary part.
    template <typename Treal>
    __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z)
    {
        return rocfft_complex<Treal>(z.x, -z.y);
    }

    // Squared magnitude: x*x + y*y.
    template <typename Treal>
    __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z)
    {
        return z.x * z.x + z.y * z.y;
    }

    // Magnitude of z. The smaller absolute component is divided by the larger
    // one before squaring; a value whose components are both zero yields 0.
    template <typename Treal>
    __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z)
    {
        using complex_t = rocfft_complex<Treal>;

        Treal mag_re = complex_t::abs(z.x);
        Treal mag_im = complex_t::abs(z.y);

        if(mag_re > mag_im)
        {
            mag_im /= mag_re;
            return mag_re * complex_t::sqrt(mag_im * mag_im + 1);
        }
        if(mag_im)
        {
            mag_re /= mag_im;
            return mag_im * complex_t::sqrt(mag_re * mag_re + 1);
        }
        return 0;
    }
}

#endif // ROCFFT_COMPLEX_H
)_PY_EMBED_"};
const char* common_h {
R"_PY_EMBED_(
// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef COMMON_H
#define COMMON_H

// Fixed-width 32-bit integer aliases, declared only for runtime-compiled
// builds (hipRTC or NVRTC).
// NOTE(review): presumably needed because <cstdint> is not available to the
// runtime compiler - confirm.
#if defined(__HIPCC_RTC__) || defined(__CUDACC_RTC__)
typedef signed int   int32_t;
typedef unsigned int uint32_t;
#endif

#ifdef WIN32
#define ROCFFT_DEVICE_EXPORT __declspec(dllexport)
#else
#define ROCFFT_DEVICE_EXPORT
#endif

// NB:
//   All kernels were compiled based on the assumption that the default max
//   work group size is 256. This default value in compiler might change in
//   future. Each kernel has to explicitly set proper sizes through
//   __launch_bounds__ or __attribute__.
//   Further performance tuning might be done later.
static const unsigned int LAUNCH_BOUNDS_R2C_C2R_KERNEL = 256;

#ifdef __HIP_PLATFORM_NVIDIA__

// Arithmetic operators for rocfft_complex<float> on the NVIDIA platform.
// Every result is computed component-wise from the x/y members, so no
// cuComplex helper (and no conversion to cuFloatComplex) is involved.

// Complex subtraction: a - b.
__device__ inline rocfft_complex<float> operator-(const rocfft_complex<float>& a,
                                                  const rocfft_complex<float>& b)
{
    return rocfft_complex<float>(a.x - b.x, a.y - b.y);
}
// Complex addition: a + b.
__device__ inline rocfft_complex<float> operator+(const rocfft_complex<float>& a,
                                                  const rocfft_complex<float>& b)
{
    return rocfft_complex<float>(a.x + b.x, a.y + b.y);
}
// Real-by-complex product: scales both parts of b by a.
__device__ inline rocfft_complex<float> operator*(const float& a, const rocfft_complex<float>& b)
{
    return rocfft_complex<float>(a * b.x, a * b.y);
}
// In-place complex product: a = a * b. The product is formed as a temporary
// first, so it is correct even when a and b are the same object.
__device__ inline rocfft_complex<float> operator*=(rocfft_complex<float>&       a,
                                                   const rocfft_complex<float>& b)
{
    a = rocfft_complex<float>((a.x * b.x) - (a.y * b.y), (a.x * b.y) + (a.y * b.x));
    return a;
}
// In-place real scaling: multiplies both parts of a by the real value b,
// i.e. a complex multiplication by (b, 0).
__device__ inline rocfft_complex<float> operator*=(rocfft_complex<float>& a, const float& b)
{
    a = rocfft_complex<float>(a.x * b, a.y * b);
    return a;
}
// Negation: flips the sign of both parts.
__device__ inline rocfft_complex<float> operator-(const rocfft_complex<float>& a)
{
    return rocfft_complex<float>(-a.x, -a.y);
}

// Arithmetic operators for rocfft_complex<double> on the NVIDIA platform.
// Every result is computed component-wise from the x/y members, so no
// cuComplex helper (and no conversion to cuDoubleComplex) is involved.

// Complex subtraction: a - b.
__device__ inline rocfft_complex<double> operator-(const rocfft_complex<double>& a,
                                                   const rocfft_complex<double>& b)
{
    return rocfft_complex<double>(a.x - b.x, a.y - b.y);
}
// Complex addition: a + b.
__device__ inline rocfft_complex<double> operator+(const rocfft_complex<double>& a,
                                                   const rocfft_complex<double>& b)
{
    return rocfft_complex<double>(a.x + b.x, a.y + b.y);
}
// Real-by-complex product: scales both parts of b by a.
__device__ inline rocfft_complex<double> operator*(const double& a, const rocfft_complex<double>& b)
{
    return rocfft_complex<double>(a * b.x, a * b.y);
}
// In-place complex product: a = a * b. The product is formed as a temporary
// first, so it is correct even when a and b are the same object.
__device__ inline rocfft_complex<double> operator*=(rocfft_complex<double>&       a,
                                                    const rocfft_complex<double>& b)
{
    a = rocfft_complex<double>((a.x * b.x) - (a.y * b.y), (a.x * b.y) + (a.y * b.x));
    return a;
}
// In-place real scaling: multiplies both parts of a by the real value b,
// i.e. a complex multiplication by (b, 0).
__device__ inline rocfft_complex<double> operator*=(rocfft_complex<double>& a, const double& b)
{
    a = rocfft_complex<double>(a.x * b, a.y * b);
    return a;
}
// Negation: flips the sign of both parts.
__device__ inline rocfft_complex<double> operator-(const rocfft_complex<double>& a)
{
    return rocfft_complex<double>(-a.x, -a.y);
}

#endif

// real_type<C>::type is the scalar type of the complex type C.
// Only the specializations below are defined; the primary template is left
// incomplete, so any other argument is a compile-time error.
template <typename T>
struct real_type;

template <>
struct real_type<rocfft_complex<float>>
{
    using type = float;
};

template <>
struct real_type<rocfft_complex<double>>
{
    using type = double;
};

template <>
struct real_type<rocfft_complex<rocfft_fp16>>
{
    using type = rocfft_fp16;
};

// Shorthand for typename real_type<T>::type.
template <typename T>
using real_type_t = typename real_type<T>::type;

// complex_type<R>::type is the rocfft_complex instantiation whose scalar type
// is R. Only float and double are specialized; the primary template is left
// incomplete.
template <typename T>
struct complex_type;

template <>
struct complex_type<float>
{
    using type = rocfft_complex<float>;
};

template <>
struct complex_type<double>
{
    using type = rocfft_complex<double>;
};

// Shorthand for typename complex_type<T>::type.
template <typename T>
using complex_type_t = typename complex_type<T>::type;

/// example of using complex_type_t:
// complex_type_t<float> float_complex_val;
// complex_type_t<double> double_complex_val;

// One-level twiddle lookup: returns the table entry selected by the low
// 8 bits of u.
template <typename T>
__device__ T TWLstep1(const T* twiddles, size_t u)
{
    return twiddles[u & 255];
}

// Two-level twiddle lookup: the complex product of
//   twiddles[byte 0 of u] and twiddles[256 + byte 1 of u].
template <typename T>
__device__ T TWLstep2(const T* twiddles, size_t u)
{
    const T  base = twiddles[u & 255];
    const T& w    = twiddles[256 + ((u >> 8) & 255)];
    return T((base.x * w.x - base.y * w.y), (base.y * w.x + base.x * w.y));
}

// Three-level twiddle lookup: starts from twiddles[byte 0 of u] and multiplies
// in, one after the other, twiddles[256 + byte 1] and twiddles[512 + byte 2].
template <typename T>
__device__ T TWLstep3(const T* twiddles, size_t u)
{
    T acc = twiddles[u & 255];
    for(size_t level = 1; level < 3; ++level)
    {
        // Each further level reads from its own 256-entry segment of the table.
        const T& w = twiddles[256 * level + ((u >> (8 * level)) & 255)];
        acc        = T((acc.x * w.x - acc.y * w.y), (acc.y * w.x + acc.x * w.y));
    }
    return acc;
}

// Four-level twiddle lookup: starts from twiddles[byte 0 of u] and multiplies
// in, one after the other, twiddles[256 + byte 1], twiddles[512 + byte 2] and
// twiddles[768 + byte 3].
template <typename T>
__device__ T TWLstep4(const T* twiddles, size_t u)
{
    T acc = twiddles[u & 255];
    for(size_t level = 1; level < 4; ++level)
    {
        // Each further level reads from its own 256-entry segment of the table.
        const T& w = twiddles[256 * level + ((u >> (8 * level)) & 255)];
        acc        = T((acc.x * w.x - acc.y * w.y), (acc.y * w.x + acc.x * w.y));
    }
    return acc;
}

// Forward twiddle multiply: REG = W * REG, where W = TWFUNC(TWIDDLES, INDEX).
//   TWFUNC   - callable returning a value with .x/.y members (e.g. TWLstepN)
//   TWIDDLES - table argument forwarded to TWFUNC
//   INDEX    - index argument forwarded to TWFUNC
//   REG      - lvalue with .x/.y members, updated in place
// A type named T must be in scope at the expansion site; both products are
// computed into temporaries before REG is written, so REG is read consistently.
#define TWIDDLE_STEP_MUL_FWD(TWFUNC, TWIDDLES, INDEX, REG) \
    {                                                      \
        T              W = TWFUNC(TWIDDLES, INDEX);        \
        real_type_t<T> TR, TI;                             \
        TR    = (W.x * REG.x) - (W.y * REG.y);             \
        TI    = (W.y * REG.x) + (W.x * REG.y);             \
        REG.x = TR;                                        \
        REG.y = TI;                                        \
    }

// Inverse twiddle multiply: REG = conj(W) * REG, where
// W = TWFUNC(TWIDDLES, INDEX).
//   TWFUNC   - callable returning a value with .x/.y members (e.g. TWLstepN)
//   TWIDDLES - table argument forwarded to TWFUNC
//   INDEX    - index argument forwarded to TWFUNC
//   REG      - lvalue with .x/.y members, updated in place
// A type named T must be in scope at the expansion site; both products are
// computed into temporaries before REG is written, so REG is read consistently.
#define TWIDDLE_STEP_MUL_INV(TWFUNC, TWIDDLES, INDEX, REG) \
    {                                                      \
        T              W = TWFUNC(TWIDDLES, INDEX);        \
        real_type_t<T> TR, TI;                             \
        TR    = (W.x * REG.x) + (W.y * REG.y);             \
        TI    = -(W.y * REG.x) + (W.x * REG.y);            \
        REG.x = TR;                                        \
        REG.y = TI;                                        \
    }

#endif // COMMON_H
)_PY_EMBED_"};
const char* device_enum_h {
R"_PY_EMBED_(
// Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef DEVICE_ENUM_H
#define DEVICE_ENUM_H

// Two-valued stride selector (the names suggest unit vs. non-unit stride).
// Numeric values are spelled out; they match the implicit numbering.
enum StrideBin
{
    SB_UNIT    = 0,
    SB_NONUNIT = 1,
};

// Scoped enumeration with an int underlying type and explicit values 0..2.
// The per-enumerator comments describe which real/complex processing step,
// if any, the kernel works with.
enum class EmbeddedType : int
{
    NONE        = 0, // Works as the regular complex to complex FFT kernel
    Real2C_POST = 1, // Works with even-length real2complex post-processing
    C2Real_PRE  = 2, // Works with even-length complex2real pre-processing
};

// TODO: rework this
//
//
// NB:
// SBRC kernels can be used in various scenarios. Instead of templating all
// combinations, we define/enable only the cases in use. In this way,
// the logic in POWX_LARGE_SBRC_GENERATOR() would be simple. People could
// add more later or find a way to simplify POWX_LARGE_SBRC_GENERATOR().
// Enumerates the SBRC kernel variants. Numeric values are assigned
// explicitly and run from 2 to 7; there are no enumerators for 0 or 1.
enum SBRC_TYPE
{
    SBRC_2D = 2, // for one step in 1D middle size decomposition

    SBRC_3D_FFT_TRANS_XY_Z = 3, // for 3D C2C middle size fused kernel
    SBRC_3D_FFT_TRANS_Z_XY = 4, // for 3D R2C middle size fused kernel
    SBRC_3D_TRANS_XY_Z_FFT = 5, // for 3D C2R middle size fused kernel

    // for 3D R2C middle size, to fuse FFT, Even-length real2complex, and Transpose_Z_XY
    SBRC_3D_FFT_ERC_TRANS_Z_XY = 6,

    // for 3D C2R middle size, to fuse Transpose_XY_Z, Even-length complex2real, and FFT
    SBRC_3D_TRANS_XY_Z_ECR_FFT = 7,
};

// Transpose strategy of an SBRC kernel. Numeric values are spelled out; they
// match the implicit numbering.
enum SBRC_TRANSPOSE_TYPE
{
    // marks a non-SBRC kernel; an SBRC kernel shouldn't carry this value
    NONE = 0,
    // best, but requires cube sizes
    DIAGONAL = 1,
    // OK, doesn't require handling the unaligned corner case
    TILE_ALIGNED = 2,
    TILE_UNALIGNED = 3,
};

// Switch for the direct-to-from-reg code path. Numeric values are spelled
// out; they match the implicit numbering.
enum DirectRegType
{
    // the direct-to-from-reg code is either not generated by the generator at
    // all, or is generated but we don't want to use it on some arch
    FORCE_OFF_OR_NOT_SUPPORT = 0,
    // use the direct-to-from-reg function
    TRY_ENABLE_IF_SUPPORT = 1,
};

// Switch for intrinsic buffer load/store usage. Numeric values are spelled
// out; they match the implicit numbering.
enum IntrinsicAccessType
{
    // intrinsic buffer load and store are both turned off
    DISABLE_BOTH = 0,
    // only intrinsic buffer load is turned on
    ENABLE_LOAD_ONLY = 1,
    // intrinsic buffer load and store are both turned on
    ENABLE_BOTH = 2,
};

// Bluestein implementation variant. Numeric values are spelled out; they
// match the implicit numbering.
enum BluesteinType
{
    BT_NONE = 0,
    // implementation for small lengths (that fit in LDS)
    BT_SINGLE_KERNEL = 1,
    // large lengths
    BT_MULTI_KERNEL = 2,
    // large lengths with fused intermediate Bluestein operations
    BT_MULTI_KERNEL_FUSED = 3,
};

// Fused operation types for multi-kernel Bluestein. Numeric values are
// spelled out; they match the implicit numbering.
enum BluesteinFuseType
{
    BFT_NONE = 0,
    // fused chirp + padding + forward fft
    BFT_FWD_CHIRP = 1,
    // fused chirp / input Hadamard product + padding + forward fft
    BFT_FWD_CHIRP_MUL = 2,
    // fused convolution Hadamard product + inverse fft + chirp Hadamard product
    BFT_INV_CHIRP_MUL = 3,
};

// Partial-pass variant selector. Numeric values are spelled out; they match
// the implicit numbering.
// NOTE(review): the meaning of SBCC / SBRR is not described in this header.
enum PartialPassType
{
    PPT_NONE = 0,
    PPT_SBCC = 1,
    PPT_SBRR = 2,
};

#endif
)_PY_EMBED_"};
const char* memory_gfx_h {
R"_PY_EMBED_(
/******************************************************************************
 * Copyright 2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *****************************************************************************/
/*! \file
    \brief Architecture-specific operators on memory added for GFX9
*/
// reference:
//   https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll

#ifndef INTRINSIC_MEM_ACCESS_H
#define INTRINSIC_MEM_ACCESS_H

#if defined(__clang__) && defined(__HIP__)

// ROCFFT_DEVICE: qualifier placed in front of the device helpers in this
// header. It expands to "__forceinline__ __device__" for the compilers
// detected below and to plain "inline" otherwise.
// NOTE(review): in the second operand, && binds tighter than ||, so it reads
// as (__clang__ && __CUDA__) || __HIP__ rather than
// __clang__ && (__CUDA__ || __HIP__) - confirm which grouping is intended.
// Inside the enclosing "#if defined(__clang__) && defined(__HIP__)" both
// readings evaluate to true.
#if(defined(__NVCC__) || defined(__HIPCC__)) \
    || (defined(__clang__) && (defined(__CUDA__)) || defined(__HIP__))
#define ROCFFT_DEVICE __forceinline__ __device__
#elif defined(__CUDACC_RTC__)
#define ROCFFT_DEVICE __forceinline__ __device__
#else
#define ROCFFT_DEVICE inline
#endif

// Per-architecture configuration of the buffer intrinsics.
// For the gfx targets listed here, USE_GFX_BUFFER_INTRINSIC is defined and
// BUFFER_RESOURCE_3RD_DWORD is set to the value written into word 3 of the
// buffer resource descriptor (see BufferResource). For every other target
// this block leaves USE_GFX_BUFFER_INTRINSIC undefined and sets the word to -1.
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) \
    || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__)                     \
    || defined(__gfx942__) // test device
#define USE_GFX_BUFFER_INTRINSIC
#define BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(__gfx1030__) // special device
#define USE_GFX_BUFFER_INTRINSIC
#define BUFFER_RESOURCE_3RD_DWORD 0x31014000
#else // not support
#define BUFFER_RESOURCE_3RD_DWORD -1
#endif

/// Cache-operation selector for AMD gfx buffer accesses.
/// Numeric values are spelled out; they match the implicit numbering.
struct CacheOperation
{
    enum Kind
    {
        /// Cache at all levels - accessed again
        Always = 0,
        /// Cache at global level; glc = 1
        Global = 1,
        /// Streaming - likely to be accessed once; slc = 1
        Streaming = 2,
        /// Indicates the line will not be used again, glc = 1; slc = 1
        LastUse = 3
    };
};

using float16_t = rocfft_fp16;
using float32_t = float;

// Maps an element type T and a lane count N to a compiler vector type:
// NativeVector<T, N>::type is T with the ext_vector_type(N) attribute applied.
template <typename T, int N>
struct NativeVector
{
    using type = T __attribute__((ext_vector_type(N)));
};

// template <int N>
// struct NativeVector<cutlass::half_t, N>
// {
//   using type = typename NativeVector<float16_t, N>::type;
// };

// template <int N>
// struct NativeVector<cutlass::bfloat16_t, N>
// {
//   using type = typename NativeVector<float16_t, N>::type;
// };

using float32x2_t = NativeVector<float, 2>::type;
using float32x4_t = NativeVector<float, 4>::type;

using int32x4_t = NativeVector<int, 4>::type;

////////////////////////////////////////////////////////////////////////////////////////////////////

// 16-byte-aligned holder of a four-dword buffer resource descriptor, built
// from a base address and a record count. It converts implicitly to
// int32x4_t, the type the raw buffer load/store declarations take as their
// buffer_resource argument.
struct alignas(16) BufferResource
{
    // Overlapping views of the descriptor: one 4 x int32 vector, two pointers,
    // or four 32-bit words.
    // NOTE(review): the views only line up if pointers are 8 bytes - confirm.
    union Desc
    {
        int32x4_t d128;
        void*     d64[2];
        uint32_t  d32[4];
    };

    // base_addr   - start address of the buffer
    // num_records - stored in word 2; defaults to 0xFFFFFFFE
    ROCFFT_DEVICE
    BufferResource(void const* base_addr, uint32_t num_records = (0xFFFFFFFF - 1))
    {
        // Reference:
        //   For CDNA: see section 9.1.8 in the AMD resources
        //   https://developer.amd.com/wp-content/resources/CDNA1_Shader_ISA_14December2020.pdf
        //   For RDNA: see section 8.1.8 in the AMD resources
        //   https://developer.amd.com/wp-content/resources/RDNA2_Shader_ISA_November2020.pdf
        //   The d32[3] field represents the 0x[127] ~ [96]

        // 64-bit base address
        desc_.d64[0] = const_cast<void*>(base_addr);
        // 32-bit number of records in bytes which is used to guard against out-of-range access
        desc_.d32[2] = num_records;
        // 32-bit buffer resource descriptor
        desc_.d32[3] = BUFFER_RESOURCE_3RD_DWORD;
    }

    // Returns the descriptor as an int32x4_t. Words 0 and 1 (the base address)
    // are passed through readfirstlane; the upper half is copied unchanged.
    ROCFFT_DEVICE
    operator int32x4_t()
    {
        // return desc_.d128; // NOTE HIP: Crashes compiler; see below

        /// This hack is to enforce scalarization of the variable "base_addr", where in some
        /// circumstances it becomes vectorized and then in turn causes illegal lowering to GCN ISA
        /// since compiler effectively tries to stuff VGPRs in slots where it only accepts SGPRs
        Desc ret;
        ret.d32[0] = __builtin_amdgcn_readfirstlane(desc_.d32[0]);
        ret.d32[1] = __builtin_amdgcn_readfirstlane(desc_.d32[1]);
        ret.d64[1] = desc_.d64[1];
        return ret.d128;
        ///
    }

    // The descriptor words themselves.
    Desc desc_;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

///
/// Load
///
// Declarations of the raw buffer load intrinsics, one per access width.
// Each declaration carries an asm label naming the LLVM intrinsic
// "llvm.amdgcn.raw.buffer.load.<type>" that it stands for.
// Common parameters:
//   buffer_resource - descriptor as produced by BufferResource
//   voffset         - first offset argument
//   soffset         - second offset argument (callers in this header pass it
//                     through __builtin_amdgcn_readfirstlane)
//   cache_op        - cache operation (a CacheOperation::Kind value at the
//                     call sites in this header)

// 1 byte
__device__ char
    llvm_amdgcn_raw_buffer_load_i8(int32x4_t buffer_resource,
                                   uint32_t  voffset,
                                   uint32_t  soffset,
                                   int32_t   cache_op) __asm("llvm.amdgcn.raw.buffer.load.i8");

// 2 bytes
__device__ float16_t
    llvm_amdgcn_raw_buffer_load_f16(int32x4_t buffer_resource,
                                    uint32_t  voffset,
                                    uint32_t  soffset,
                                    int32_t   cache_op) __asm("llvm.amdgcn.raw.buffer.load.f16");

// 4 bytes
__device__ float32_t
    llvm_amdgcn_raw_buffer_load_f32(int32x4_t buffer_resource,
                                    uint32_t  voffset,
                                    uint32_t  soffset,
                                    int32_t   cache_op) __asm("llvm.amdgcn.raw.buffer.load.f32");

// 8 bytes
__device__ float32x2_t
    llvm_amdgcn_raw_buffer_load_f32x2(int32x4_t buffer_resource,
                                      uint32_t  voffset,
                                      uint32_t  soffset,
                                      int32_t cache_op) __asm("llvm.amdgcn.raw.buffer.load.v2f32");

// 16 bytes
__device__ float32x4_t
    llvm_amdgcn_raw_buffer_load_f32x4(int32x4_t buffer_resource,
                                      uint32_t  voffset,
                                      uint32_t  soffset,
                                      int32_t cache_op) __asm("llvm.amdgcn.raw.buffer.load.v4f32");

///
/// Store
///
// Declarations of the raw buffer store intrinsics, one per access width.
// Each declaration carries an asm label naming the LLVM intrinsic
// "llvm.amdgcn.raw.buffer.store.<type>" that it stands for.
// Common parameters:
//   data            - value to store
//   buffer_resource - descriptor as produced by BufferResource
//   voffset         - first offset argument
//   soffset         - second offset argument
//   cache_op        - cache operation argument

// 1 byte
__device__ void
    llvm_amdgcn_raw_buffer_store_i8(char      data,
                                    int32x4_t buffer_resource,
                                    uint32_t  voffset,
                                    uint32_t  soffset,
                                    int32_t   cache_op) __asm("llvm.amdgcn.raw.buffer.store.i8");

// 2 bytes
__device__ void
    llvm_amdgcn_raw_buffer_store_f16(float16_t data,
                                     int32x4_t buffer_resource,
                                     uint32_t  voffset,
                                     uint32_t  soffset,
                                     int32_t   cache_op) __asm("llvm.amdgcn.raw.buffer.store.f16");

// 4 bytes
__device__ void
    llvm_amdgcn_raw_buffer_store_f32(float32_t data,
                                     int32x4_t buffer_resource,
                                     uint32_t  voffset,
                                     uint32_t  soffset,
                                     int32_t   cache_op) __asm("llvm.amdgcn.raw.buffer.store.f32");

// 8 bytes
__device__ void llvm_amdgcn_raw_buffer_store_f32x2(
    float32x2_t data,
    int32x4_t   buffer_resource,
    uint32_t    voffset,
    uint32_t    soffset,
    int32_t     cache_op) __asm("llvm.amdgcn.raw.buffer.store.v2f32");

// 16 bytes
__device__ void llvm_amdgcn_raw_buffer_store_f32x4(
    float32x4_t data,
    int32x4_t   buffer_resource,
    uint32_t    voffset,
    uint32_t    soffset,
    int32_t     cache_op) __asm("llvm.amdgcn.raw.buffer.store.v4f32");

////////////////////////////////////////////////////////////////////////////////////////////////////

template <
    /// Fragment type to store loaded data
    typename AccessType,
    /// The bytes of loading
    int LoadBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always>
struct buffer_load;

// 1-byte specialization: performs a raw buffer load of one byte and
// reinterprets the loaded bits as AccessType.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_load<AccessType, 1, cache_op>
{
    ROCFFT_DEVICE
    buffer_load() {}

    // Loads one element and stores it into D; same arguments as load().
    ROCFFT_DEVICE
    buffer_load(
        AccessType& D, void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        D = load(base_ptr, voffset, soffset, pred_guard);
    }

    // Loads one element from base_ptr at the given offsets and returns it.
    // When pred_guard is false the offset is replaced by 0xFFFFFFFF
    // (presumably so the access falls outside the descriptor's range - confirm).
    ROCFFT_DEVICE
    AccessType load(void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        BufferResource rsrc(base_ptr);
        const uint32_t guarded_offset  = pred_guard ? voffset : 0xFFFFFFFFu;
        const uint32_t uniform_soffset = __builtin_amdgcn_readfirstlane(soffset);
        char           raw
            = llvm_amdgcn_raw_buffer_load_i8(rsrc, guarded_offset, uniform_soffset, cache_op);
        return *reinterpret_cast<AccessType*>(&raw);
    }
};

// 2-byte specialization: performs a raw buffer load of one half-precision
// value and reinterprets the loaded bits as AccessType.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_load<AccessType, 2, cache_op>
{
    ROCFFT_DEVICE
    buffer_load() {}

    // Loads one element and stores it into D; same arguments as load().
    ROCFFT_DEVICE
    buffer_load(
        AccessType& D, void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        D = load(base_ptr, voffset, soffset, pred_guard);
    }

    // Loads one element from base_ptr at the given offsets and returns it.
    // When pred_guard is false the offset is replaced by 0xFFFFFFFF
    // (presumably so the access falls outside the descriptor's range - confirm).
    ROCFFT_DEVICE
    AccessType load(void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        BufferResource rsrc(base_ptr);
        const uint32_t guarded_offset  = pred_guard ? voffset : 0xFFFFFFFFu;
        const uint32_t uniform_soffset = __builtin_amdgcn_readfirstlane(soffset);
        float16_t      raw
            = llvm_amdgcn_raw_buffer_load_f16(rsrc, guarded_offset, uniform_soffset, cache_op);
        return *reinterpret_cast<AccessType*>(&raw);
    }
};

/// Guarded 4-byte buffer load.
///
/// Issues llvm_amdgcn_raw_buffer_load_f32 and reinterprets the returned
/// float32_t as AccessType. The offsets are forwarded to the intrinsic
/// unchanged; the callers in this file pass them scaled by sizeof(T).
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_load<AccessType, 4, cache_op>
{
    /// Builds a loader without touching memory; call load() afterwards.
    ROCFFT_DEVICE
    buffer_load() {}

    /// Performs the load immediately and assigns the result to D.
    ROCFFT_DEVICE
    buffer_load(
        AccessType& D, void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        D = load(base_ptr, voffset, soffset, pred_guard);
    }

    /// Loads one value and returns it.
    ///
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    AccessType load(void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the access is suppressed - confirm.
        const uint32_t lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        BufferResource rsc(base_ptr);
        float32_t      raw = llvm_amdgcn_raw_buffer_load_f32(
            rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
        return *reinterpret_cast<AccessType*>(&raw);
    }
};

/// Guarded 8-byte buffer load.
///
/// Issues llvm_amdgcn_raw_buffer_load_f32x2 and reinterprets the returned
/// float32x2_t as AccessType. The offsets are forwarded to the intrinsic
/// unchanged; the callers in this file pass them scaled by sizeof(T).
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_load<AccessType, 8, cache_op>
{
    /// Builds a loader without touching memory; call load() afterwards.
    ROCFFT_DEVICE
    buffer_load() {}

    /// Performs the load immediately and assigns the result to D.
    ROCFFT_DEVICE
    buffer_load(
        AccessType& D, void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        D = load(base_ptr, voffset, soffset, pred_guard);
    }

    /// Loads one value and returns it.
    ///
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    AccessType load(void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the access is suppressed - confirm.
        const uint32_t lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        BufferResource rsc(base_ptr);
        float32x2_t    raw = llvm_amdgcn_raw_buffer_load_f32x2(
            rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
        return *reinterpret_cast<AccessType*>(&raw);
    }
};

/// Guarded 16-byte buffer load.
///
/// Issues llvm_amdgcn_raw_buffer_load_f32x4 and reinterprets the returned
/// float32x4_t as AccessType. The offsets are forwarded to the intrinsic
/// unchanged; the callers in this file pass them scaled by sizeof(T).
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_load<AccessType, 16, cache_op>
{
    /// Builds a loader without touching memory; call load() afterwards.
    ROCFFT_DEVICE
    buffer_load() {}

    /// Performs the load immediately and assigns the result to D.
    ROCFFT_DEVICE
    buffer_load(
        AccessType& D, void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        D = load(base_ptr, voffset, soffset, pred_guard);
    }

    /// Loads one value and returns it.
    ///
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    AccessType load(void const* base_ptr, uint32_t voffset, uint32_t soffset, bool pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the access is suppressed - confirm.
        const uint32_t lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        BufferResource rsc(base_ptr);
        float32x4_t    raw = llvm_amdgcn_raw_buffer_load_f32x4(
            rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
        return *reinterpret_cast<AccessType*>(&raw);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Guarded buffer store. Only declared here; the specializations that follow
/// provide the implementation for each supported access width.
template <
    /// Type of the value being stored
    typename AccessType,
    /// Width of the store. Despite the name, the specializations in this file
    /// (1, 2, 4, 8, 16) match the byte size of the type they store through
    /// (i8, f16, f32, f32x2, f32x4), and callers pass sizeof(T) here.
    int NumElements,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always>
struct buffer_store;

/// Guarded 1-byte buffer store.
///
/// Reinterprets D as a char and hands it to llvm_amdgcn_raw_buffer_store_i8.
/// The store is performed by the constructor; the object carries no state.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_store<AccessType, 1, cache_op>
{
    /// D          - value to store
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    buffer_store(const AccessType& D,
                 void const*       base_ptr,
                 uint32_t          voffset,
                 uint32_t          soffset,
                 bool              pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the write is dropped - confirm.
        const uint32_t lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        const char     payload     = *reinterpret_cast<char const*>(&D);
        BufferResource rsc(base_ptr);
        llvm_amdgcn_raw_buffer_store_i8(
            payload, rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
    }
};

/// Guarded 2-byte buffer store.
///
/// Reinterprets D as a float16_t and hands it to
/// llvm_amdgcn_raw_buffer_store_f16. The store is performed by the
/// constructor; the object carries no state.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_store<AccessType, 2, cache_op>
{
    /// D          - value to store
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    buffer_store(const AccessType& D,
                 void const*       base_ptr,
                 uint32_t          voffset,
                 uint32_t          soffset,
                 bool              pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the write is dropped - confirm.
        const uint32_t  lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        const float16_t payload     = *reinterpret_cast<float16_t const*>(&D);
        BufferResource  rsc(base_ptr);
        llvm_amdgcn_raw_buffer_store_f16(
            payload, rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
    }
};

/// Guarded 4-byte buffer store.
///
/// Reinterprets D as a float32_t and hands it to
/// llvm_amdgcn_raw_buffer_store_f32. The store is performed by the
/// constructor; the object carries no state.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_store<AccessType, 4, cache_op>
{
    /// D          - value to store
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    buffer_store(const AccessType& D,
                 void const*       base_ptr,
                 uint32_t          voffset,
                 uint32_t          soffset,
                 bool              pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the write is dropped - confirm.
        const uint32_t  lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        const float32_t payload     = *reinterpret_cast<float32_t const*>(&D);
        BufferResource  rsc(base_ptr);
        llvm_amdgcn_raw_buffer_store_f32(
            payload, rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
    }
};

/// Guarded 8-byte buffer store.
///
/// Reinterprets D as a float32x2_t and hands it to
/// llvm_amdgcn_raw_buffer_store_f32x2. The store is performed by the
/// constructor; the object carries no state.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_store<AccessType, 8, cache_op>
{
    /// D          - value to store
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    buffer_store(const AccessType& D,
                 void const*       base_ptr,
                 uint32_t          voffset,
                 uint32_t          soffset,
                 bool              pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the write is dropped - confirm.
        const uint32_t    lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        const float32x2_t payload     = *reinterpret_cast<float32x2_t const*>(&D);
        BufferResource    rsc(base_ptr);
        llvm_amdgcn_raw_buffer_store_f32x2(
            payload, rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
    }
};

/// Guarded 16-byte buffer store.
///
/// Reinterprets D as a float32x4_t and hands it to
/// llvm_amdgcn_raw_buffer_store_f32x4. The store is performed by the
/// constructor; the object carries no state.
template <typename AccessType, CacheOperation::Kind cache_op>
struct buffer_store<AccessType, 16, cache_op>
{
    /// D          - value to store
    /// base_ptr   - address handed to BufferResource
    /// voffset    - per-lane offset
    /// soffset    - offset passed through __builtin_amdgcn_readfirstlane
    /// pred_guard - when false, the per-lane offset is replaced by 0xFFFFFFFF
    ROCFFT_DEVICE
    buffer_store(const AccessType& D,
                 void const*       base_ptr,
                 uint32_t          voffset,
                 uint32_t          soffset,
                 bool              pred_guard)
    {
        // NOTE(review): 0xFFFFFFFF is presumably outside the range described by
        // BufferResource, so the write is dropped - confirm.
        const uint32_t    lane_offset = pred_guard ? voffset : static_cast<uint32_t>(-1);
        const float32x4_t payload     = *reinterpret_cast<float32x4_t const*>(&D);
        BufferResource    rsc(base_ptr);
        llvm_amdgcn_raw_buffer_store_f32x4(
            payload, rsc, lane_offset, __builtin_amdgcn_readfirstlane(soffset), cache_op);
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

#endif // defined(__clang__) && defined(__HIP__)

#endif // INTRINSIC_MEM_ACCESS_H
)_PY_EMBED_"};
const char* callback_h {
R"_PY_EMBED_(
// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_DEVICE_CALLBACK_H
#define ROCFFT_DEVICE_CALLBACK_H



// user-provided data saying what callbacks to run
// Describes the load and store callbacks requested by the user. A
// default-constructed object has every pointer null and every size zero.
struct UserCallbacks
{
    // Load side: callback function, opaque data pointer, and a byte count.
    // NOTE(review): the byte count is presumably the shared-memory (LDS)
    // size the callback needs - confirm against the host-side user.
    void*  load_cb_fn{nullptr};
    void*  load_cb_data{nullptr};
    size_t load_cb_lds_bytes{0};

    // Store side: same three fields as for the load callback.
    void*  store_cb_fn{nullptr};
    void*  store_cb_data{nullptr};
    size_t store_cb_lds_bytes{0};
};

// default callback implementations that just do simple load/store
// Default load callback: returns the element located `offset` elements past
// `data`. The cbdata and sharedMem arguments are accepted only to match the
// callback signature and are not used.
template <typename T>
__device__ T load_cb_default(T* data, size_t offset, void* cbdata, void* sharedMem)
{
    return *(data + offset);
}

// Default store callback: writes `element` to the slot located `offset`
// elements past `data`. The cbdata and sharedMem arguments are accepted only
// to match the callback signature and are not used.
template <typename T>
__device__ void store_cb_default(T* data, size_t offset, T element, void* cbdata, void* sharedMem)
{
    *(data + offset) = element;
}

// callback function types
// Maps an element type T to the function-pointer types of its load and store
// callbacks (member types `load` and `store`). Only declared here; each
// supported element type gets an explicit specialization.
template <typename T>
struct callback_type;

// Callback signatures for half-precision complex elements.
template <>
struct callback_type<rocfft_complex<rocfft_fp16>>
{
    // load callback: returns one element given (data, offset, cbdata, sharedMem)
    using load = rocfft_complex<rocfft_fp16> (*)(rocfft_complex<rocfft_fp16>* data,
                                                 size_t                       offset,
                                                 void*                        cbdata,
                                                 void*                        sharedMem);
    // store callback: receives one element plus (data, offset, cbdata, sharedMem)
    using store = void (*)(rocfft_complex<rocfft_fp16>* data,
                           size_t                       offset,
                           rocfft_complex<rocfft_fp16>  element,
                           void*                        cbdata,
                           void*                        sharedMem);
};

// Device variables holding the addresses of the default callbacks
// instantiated for half-precision complex elements.
static __device__ callback_type<rocfft_complex<rocfft_fp16>>::load load_cb_default_complex_half
    = load_cb_default<rocfft_complex<rocfft_fp16>>;
static __device__ callback_type<rocfft_complex<rocfft_fp16>>::store store_cb_default_complex_half
    = store_cb_default<rocfft_complex<rocfft_fp16>>;

template <>
struct callback_type<rocfft_complex<float>>
{
    typedef rocfft_complex<float> (*load)(rocfft_complex<float>* data,
                                          size_t                 offset,
                                          void*                  cbdata,
                                          void*                  sharedMem);
    typedef void (*store)(rocfft_complex<float>* data,
                          size_t                 offset,
                          rocfft_complex<float>  element,
                          void*                  cbdata,
                          void*                  sharedMem);
};

static __device__ auto load_cb_default_complex_float  = load_cb_default<rocfft_complex<float>>;
static __device__ auto store_cb_default_complex_float = store_cb_default<rocfft_complex<float>>;

template <>
struct callback_type<rocfft_complex<double>>
{
    typedef rocfft_complex<double> (*load)(rocfft_complex<double>* data,
                                           size_t                  offset,
                                           void*                   cbdata,
                                           void*                   sharedMem);
    typedef void (*store)(rocfft_complex<double>* data,
                          size_t                  offset,
                          rocfft_complex<double>  element,
                          void*                   cbdata,
                          void*                   sharedMem);
};

static __device__ auto load_cb_default_complex_double  = load_cb_default<rocfft_complex<double>>;
static __device__ auto store_cb_default_complex_double = store_cb_default<rocfft_complex<double>>;

template <>
struct callback_type<rocfft_fp16>
{
    typedef rocfft_fp16 (*load)(rocfft_fp16* data, size_t offset, void* cbdata, void* sharedMem);
    typedef void (*store)(
        rocfft_fp16* data, size_t offset, rocfft_fp16 element, void* cbdata, void* sharedMem);
};

static __device__ auto load_cb_default_half  = load_cb_default<rocfft_fp16>;
static __device__ auto store_cb_default_half = store_cb_default<rocfft_fp16>;

// Callback signatures for single-precision real elements.
template <>
struct callback_type<float>
{
    // load callback: returns one element given (data, offset, cbdata, sharedMem)
    using load = float (*)(float* data, size_t offset, void* cbdata, void* sharedMem);
    // store callback: receives one element plus (data, offset, cbdata, sharedMem)
    using store
        = void (*)(float* data, size_t offset, float element, void* cbdata, void* sharedMem);
};

// Device variables holding the addresses of the default callbacks
// instantiated for single-precision real elements.
static __device__ callback_type<float>::load  load_cb_default_float  = load_cb_default<float>;
static __device__ callback_type<float>::store store_cb_default_float = store_cb_default<float>;

// Callback signatures for double-precision real elements.
template <>
struct callback_type<double>
{
    // load callback: returns one element given (data, offset, cbdata, sharedMem)
    using load = double (*)(double* data, size_t offset, void* cbdata, void* sharedMem);
    // store callback: receives one element plus (data, offset, cbdata, sharedMem)
    using store
        = void (*)(double* data, size_t offset, double element, void* cbdata, void* sharedMem);
};

// Device variables holding the addresses of the default callbacks
// instantiated for double-precision real elements.
static __device__ callback_type<double>::load  load_cb_default_double  = load_cb_default<double>;
static __device__ callback_type<double>::store store_cb_default_double = store_cb_default<double>;

// planar helpers
// Builds a complex value from two separate arrays: the real part is read
// from dataRe[offset] and the imaginary part from dataIm[offset].
template <typename Tfloat>
__device__ rocfft_complex<Tfloat>
           load_planar(const Tfloat* dataRe, const Tfloat* dataIm, size_t offset)
{
    const Tfloat* const reSrc = dataRe + offset;
    const Tfloat* const imSrc = dataIm + offset;
    return rocfft_complex<Tfloat>{*reSrc, *imSrc};
}

// Splits a complex value across two separate arrays: element.x goes to
// dataRe[offset] and element.y goes to dataIm[offset], in that order.
template <typename Tfloat>
__device__ void
    store_planar(Tfloat* dataRe, Tfloat* dataIm, size_t offset, rocfft_complex<Tfloat> element)
{
    Tfloat* const reDst = dataRe + offset;
    Tfloat* const imDst = dataIm + offset;
    *reDst              = element.x;
    *imDst              = element.y;
}

// intrinsic
// Loads data[soffset + voffset] into target when rw is true.
//
// With USE_GFX_BUFFER_INTRINSIC the access goes through buffer_load, with
// both offsets converted from elements to bytes; otherwise a plain indexed
// read is used.
//
// NOTE(review): on the intrinsic path target is always assigned from the
// buffer_load result, even when rw is false, whereas the fallback leaves
// target untouched in that case - confirm callers do not depend on either.
template <typename T>
__device__ void intrinsic_load_to_dest(
    T& target, const T* data, unsigned int voffset, unsigned int soffset, bool rw)
{
#ifdef USE_GFX_BUFFER_INTRINSIC
    void* const    base      = reinterpret_cast<void*>(const_cast<T*>(data));
    const uint32_t voffBytes = static_cast<uint32_t>(voffset * sizeof(T));
    const uint32_t soffBytes = static_cast<uint32_t>(soffset * sizeof(T));
    buffer_load<T, sizeof(T)>(target, base, voffBytes, soffBytes, rw);
#else
    const unsigned int idx = soffset + voffset;
    if(rw)
        target = data[idx];
#endif
}

// Returns data[soffset + voffset] when rw is true.
//
// With USE_GFX_BUFFER_INTRINSIC the access goes through buffer_load::load,
// with both offsets converted from elements to bytes, and rw is forwarded as
// the load guard. Without it, a false rw yields a value-initialized T.
template <typename T>
__device__ T intrinsic_load(const T* data, unsigned int voffset, unsigned int soffset, bool rw)
{
#ifdef USE_GFX_BUFFER_INTRINSIC
    void* const    base      = reinterpret_cast<void*>(const_cast<T*>(data));
    const uint32_t voffBytes = static_cast<uint32_t>(voffset * sizeof(T));
    const uint32_t soffBytes = static_cast<uint32_t>(soffset * sizeof(T));
    return buffer_load<T, sizeof(T)>().load(base, voffBytes, soffBytes, rw);
#else
    if(rw)
        return data[soffset + voffset];
    return T();
#endif
}

// Planar variant of intrinsic_load: reads the real part from dataRe and the
// imaginary part from dataIm at element index soffset + voffset and returns
// them as one complex value.
//
// With USE_GFX_BUFFER_INTRINSIC each part is fetched through
// buffer_load::load (real part first), with the offsets converted to bytes
// and rw forwarded as the guard. Without it, a false rw yields a
// value-initialized complex.
template <typename Tfloat>
__device__ rocfft_complex<Tfloat> intrinsic_load_planar(
    const Tfloat* dataRe, const Tfloat* dataIm, unsigned int voffset, unsigned int soffset, bool rw)
{
#ifdef USE_GFX_BUFFER_INTRINSIC
    const uint32_t voffBytes = static_cast<uint32_t>(voffset * sizeof(Tfloat));
    const uint32_t soffBytes = static_cast<uint32_t>(soffset * sizeof(Tfloat));

    const Tfloat re = buffer_load<Tfloat, sizeof(Tfloat)>().load(
        reinterpret_cast<void*>(const_cast<Tfloat*>(dataRe)), voffBytes, soffBytes, rw);
    const Tfloat im = buffer_load<Tfloat, sizeof(Tfloat)>().load(
        reinterpret_cast<void*>(const_cast<Tfloat*>(dataIm)), voffBytes, soffBytes, rw);
    return rocfft_complex<Tfloat>{re, im};
#else
    if(rw)
    {
        const unsigned int idx = soffset + voffset;
        return rocfft_complex<Tfloat>{dataRe[idx], dataIm[idx]};
    }
    return rocfft_complex<Tfloat>();
#endif
}

// Writes element to data[soffset + voffset] when rw is true.
//
// With USE_GFX_BUFFER_INTRINSIC the write goes through buffer_store, with
// both offsets converted from elements to bytes and rw forwarded as the
// guard; otherwise a plain indexed store is used.
template <typename T>
__device__ void
    store_intrinsic(T* data, unsigned int voffset, unsigned int soffset, T element, bool rw)
{
#ifdef USE_GFX_BUFFER_INTRINSIC
    void* const    base      = reinterpret_cast<void*>(const_cast<T*>(data));
    const uint32_t voffBytes = static_cast<uint32_t>(voffset * sizeof(T));
    const uint32_t soffBytes = static_cast<uint32_t>(soffset * sizeof(T));
    buffer_store<T, sizeof(T)>(element, base, voffBytes, soffBytes, rw);
#else
    if(!rw)
        return;
    data[soffset + voffset] = element;
#endif
}

// Planar variant of store_intrinsic: when rw is true, writes element.x to
// dataRe and element.y to dataIm at element index soffset + voffset (real
// part first).
//
// With USE_GFX_BUFFER_INTRINSIC each part is written through buffer_store,
// with the offsets converted to bytes and rw forwarded as the guard.
template <typename Tfloat>
__device__ void store_intrinsic_planar(Tfloat*                dataRe,
                                       Tfloat*                dataIm,
                                       unsigned int           voffset,
                                       unsigned int           soffset,
                                       rocfft_complex<Tfloat> element,
                                       bool                   rw)
{
#ifdef USE_GFX_BUFFER_INTRINSIC
    const uint32_t voffBytes = static_cast<uint32_t>(voffset * sizeof(Tfloat));
    const uint32_t soffBytes = static_cast<uint32_t>(soffset * sizeof(Tfloat));

    buffer_store<Tfloat, sizeof(Tfloat)>(element.x,
                                         reinterpret_cast<void*>(const_cast<Tfloat*>(dataRe)),
                                         voffBytes,
                                         soffBytes,
                                         rw);
    buffer_store<Tfloat, sizeof(Tfloat)>(element.y,
                                         reinterpret_cast<void*>(const_cast<Tfloat*>(dataIm)),
                                         voffBytes,
                                         soffBytes,
                                         rw);
#else
    if(!rw)
        return;
    const unsigned int idx = soffset + voffset;
    dataRe[idx]            = element.x;
    dataIm[idx]            = element.y;
#endif
}

// Selects whether, and in which flavour, user callbacks are run.
enum class CallbackType
{
    // user callbacks are not run
    NONE,
    // user load/store callbacks are run
    USER_LOAD_STORE,
    // user load/store callbacks are run; the user code loads reals
    // while the kernel wants complex
    USER_LOAD_STORE_R2C,
    // user load/store callbacks are run; the user code stores reals
    // while the kernel wants complex
    USER_LOAD_STORE_C2R,
};

// helpers to cast void* to the correct function pointer type
// Returns the load callback to use for element type T.
//
// When ROCFFT_CALLBACKS_ENABLED is defined and cbtype is anything other than
// CallbackType::NONE, ptr is cast to the load function-pointer type for T.
// In every other case ptr is ignored and load_cb_default<T> is returned.
template <typename T, CallbackType cbtype>
static __device__ typename callback_type<T>::load get_load_cb(void* ptr)
{
    using load_fn = typename callback_type<T>::load;

    load_fn chosen = load_cb_default<T>;
#ifdef ROCFFT_CALLBACKS_ENABLED
    if(cbtype != CallbackType::NONE)
        chosen = reinterpret_cast<load_fn>(ptr);
#endif
    return chosen;
}

// Returns the store callback to use for element type T.
//
// When ROCFFT_CALLBACKS_ENABLED is defined and cbtype is anything other than
// CallbackType::NONE, ptr is cast to the store function-pointer type for T.
// In every other case ptr is ignored and store_cb_default<T> is returned.
template <typename T, CallbackType cbtype>
static __device__ typename callback_type<T>::store get_store_cb(void* ptr)
{
    using store_fn = typename callback_type<T>::store;

    store_fn chosen = store_cb_default<T>;
#ifdef ROCFFT_CALLBACKS_ENABLED
    if(cbtype != CallbackType::NONE)
        chosen = reinterpret_cast<store_fn>(ptr);
#endif
    return chosen;
}

#endif
)_PY_EMBED_"};
const char* butterfly_constant_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

#ifndef BUTTERFLY_CONSTANT_H
#define BUTTERFLY_CONSTANT_H

// All constants below expand to a cast to real_type_t<T>, so they can only be
// used where a type named T is in scope at the point of expansion.

// butterfly radix-3 constants
//   C3QA = cos(pi/3) = 1/2, C3QB = sin(pi/3) = sqrt(3)/2
#define C3QA static_cast<real_type_t<T>>(0.50000000000000000000000000000000)
#define C3QB static_cast<real_type_t<T>>(0.86602540378443864676372317075294)

// butterfly radix-5 constants
//   C5QA = cos(2 pi/5), C5QB = sin(2 pi/5), C5QC = 1/2,
//   C5QD = sin(pi/5),   C5QE = cos(pi/5)
#define C5QA static_cast<real_type_t<T>>(0.30901699437494742410229341718282)
#define C5QB static_cast<real_type_t<T>>(0.95105651629515357211643933337938)
#define C5QC static_cast<real_type_t<T>>(0.50000000000000000000000000000000)
#define C5QD static_cast<real_type_t<T>>(0.58778525229247312916870595463907)
#define C5QE static_cast<real_type_t<T>>(0.80901699437494742410229341718282)

// butterfly radix-7 constants
//   NOTE(review): C7Q1 is numerically -7/6; the remaining values are not
//   plain roots of unity and presumably belong to a factorized radix-7
//   scheme - confirm against the radix-7 butterfly before editing.
#define C7Q1 static_cast<real_type_t<T>>(-1.16666666666666651863693004997913)
#define C7Q2 static_cast<real_type_t<T>>(0.79015646852540022404554065360571)
#define C7Q3 static_cast<real_type_t<T>>(0.05585426728964774240049351305970)
#define C7Q4 static_cast<real_type_t<T>>(0.73430220123575240531721419756650)
#define C7Q5 static_cast<real_type_t<T>>(0.44095855184409837868031445395900)
#define C7Q6 static_cast<real_type_t<T>>(0.34087293062393136944265847887436)
#define C7Q7 static_cast<real_type_t<T>>(-0.53396936033772524066165487965918)
#define C7Q8 static_cast<real_type_t<T>>(0.87484229096165666561546458979137)

// butterfly radix-8 constants
//   C8Q = 1/sqrt(2) = cos(pi/4) = sin(pi/4)
#define C8Q static_cast<real_type_t<T>>(0.70710678118654752440084436210485)

// butterfly radix-9 constants
//   C9QA = cos(2 pi/9), C9QB = sin(2 pi/9), C9QC = cos(4 pi/9), C9QD = sin(4 pi/9),
//   C9QE = cos(pi/3),   C9QF = sin(pi/3),   C9QG = cos(pi/9),   C9QH = sin(pi/9)
#define C9QA static_cast<real_type_t<T>>(0.766044443118978)
#define C9QB static_cast<real_type_t<T>>(0.6427876096865393)
#define C9QC static_cast<real_type_t<T>>(0.1736481776669304)
#define C9QD static_cast<real_type_t<T>>(0.984807753012208)
#define C9QE static_cast<real_type_t<T>>(0.5000000000000000)
#define C9QF static_cast<real_type_t<T>>(0.8660254037844387)
#define C9QG static_cast<real_type_t<T>>(0.9396926207859083)
#define C9QH static_cast<real_type_t<T>>(0.3420201433256689)

//
// For radix-11, radix-13 and radix-17 the butterfly constants correspond to
// the roots of unity for the radix, and are named according to:
//
//   "Q" + radix + "i" + i + "j" + j + "R"/"I"
//
// where i and j are the row/column indices of the DFT matrix A
// corresponding to the radix and R/I is the real/imaginary part.
// More specifically:
//
//  A[i,j] = exp(-2 pi I i j / radix)
//
// and hence, for example
//
//  Q11i2j5R = Re( exp(-2 pi I 2 * 5 / 11) )
//

// butterfly radix-11 constants
//
// Entries are tabulated for i, j = 1..5 only. Each one is the real (R) or
// imaginary (I) part of exp(-2 pi I i j / 11); the table is symmetric in
// i and j (e.g. Q11i1j2R == Q11i2j1R). Each macro expands to a cast to
// real_type_t<T>, so a type named T must be in scope where it is used.
#define Q11i1j1R static_cast<real_type_t<T>>((0.8412535328311811688618))
#define Q11i1j1I static_cast<real_type_t<T>>((-0.5406408174555975821076))
#define Q11i1j2R static_cast<real_type_t<T>>((0.4154150130018864255293))
#define Q11i1j2I static_cast<real_type_t<T>>((-0.9096319953545183714117))
#define Q11i1j3R static_cast<real_type_t<T>>((-0.1423148382732851404438))
#define Q11i1j3I static_cast<real_type_t<T>>((-0.9898214418809327323761))
#define Q11i1j4R static_cast<real_type_t<T>>((-0.6548607339452850640569))
#define Q11i1j4I static_cast<real_type_t<T>>((-0.7557495743542582837740))
#define Q11i1j5R static_cast<real_type_t<T>>((-0.9594929736144973898904))
#define Q11i1j5I static_cast<real_type_t<T>>((-0.2817325568414296977114))
#define Q11i2j1R static_cast<real_type_t<T>>((0.4154150130018864255293))
#define Q11i2j1I static_cast<real_type_t<T>>((-0.9096319953545183714117))
#define Q11i2j2R static_cast<real_type_t<T>>((-0.6548607339452850640569))
#define Q11i2j2I static_cast<real_type_t<T>>((-0.7557495743542582837740))
#define Q11i2j3R static_cast<real_type_t<T>>((-0.9594929736144973898904))
#define Q11i2j3I static_cast<real_type_t<T>>((0.2817325568414296977114))
#define Q11i2j4R static_cast<real_type_t<T>>((-0.1423148382732851404438))
#define Q11i2j4I static_cast<real_type_t<T>>((0.9898214418809327323761))
#define Q11i2j5R static_cast<real_type_t<T>>((0.8412535328311811688618))
#define Q11i2j5I static_cast<real_type_t<T>>((0.5406408174555975821076))
#define Q11i3j1R static_cast<real_type_t<T>>((-0.1423148382732851404438))
#define Q11i3j1I static_cast<real_type_t<T>>((-0.9898214418809327323761))
#define Q11i3j2R static_cast<real_type_t<T>>((-0.9594929736144973898904))
#define Q11i3j2I static_cast<real_type_t<T>>((0.2817325568414296977114))
#define Q11i3j3R static_cast<real_type_t<T>>((0.4154150130018864255293))
#define Q11i3j3I static_cast<real_type_t<T>>((0.9096319953545183714117))
#define Q11i3j4R static_cast<real_type_t<T>>((0.8412535328311811688618))
#define Q11i3j4I static_cast<real_type_t<T>>((-0.5406408174555975821076))
#define Q11i3j5R static_cast<real_type_t<T>>((-0.6548607339452850640569))
#define Q11i3j5I static_cast<real_type_t<T>>((-0.7557495743542582837740))
#define Q11i4j1R static_cast<real_type_t<T>>((-0.6548607339452850640569))
#define Q11i4j1I static_cast<real_type_t<T>>((-0.7557495743542582837740))
#define Q11i4j2R static_cast<real_type_t<T>>((-0.1423148382732851404438))
#define Q11i4j2I static_cast<real_type_t<T>>((0.9898214418809327323761))
#define Q11i4j3R static_cast<real_type_t<T>>((0.8412535328311811688618))
#define Q11i4j3I static_cast<real_type_t<T>>((-0.5406408174555975821076))
#define Q11i4j4R static_cast<real_type_t<T>>((-0.9594929736144973898904))
#define Q11i4j4I static_cast<real_type_t<T>>((-0.2817325568414296977114))
#define Q11i4j5R static_cast<real_type_t<T>>((0.4154150130018864255293))
#define Q11i4j5I static_cast<real_type_t<T>>((0.9096319953545183714117))
#define Q11i5j1R static_cast<real_type_t<T>>((-0.9594929736144973898904))
#define Q11i5j1I static_cast<real_type_t<T>>((-0.2817325568414296977114))
#define Q11i5j2R static_cast<real_type_t<T>>((0.8412535328311811688618))
#define Q11i5j2I static_cast<real_type_t<T>>((0.5406408174555975821076))
#define Q11i5j3R static_cast<real_type_t<T>>((-0.6548607339452850640569))
#define Q11i5j3I static_cast<real_type_t<T>>((-0.7557495743542582837740))
#define Q11i5j4R static_cast<real_type_t<T>>((0.4154150130018864255293))
#define Q11i5j4I static_cast<real_type_t<T>>((0.9096319953545183714117))
#define Q11i5j5R static_cast<real_type_t<T>>((-0.1423148382732851404438))
#define Q11i5j5I static_cast<real_type_t<T>>((-0.9898214418809327323761))

// butterfly radix-13 constants
//
// Entries are tabulated for i, j = 1..6 only. Each one is the real (R) or
// imaginary (I) part of exp(-2 pi I i j / 13); the table is symmetric in
// i and j (e.g. Q13i1j2R == Q13i2j1R). Each macro expands to a cast to
// real_type_t<T>, so a type named T must be in scope where it is used.
#define Q13i1j1R static_cast<real_type_t<T>>((0.8854560256532098959004))
#define Q13i1j1I static_cast<real_type_t<T>>((-0.4647231720437685456560))
#define Q13i1j2R static_cast<real_type_t<T>>((0.5680647467311558025118))
#define Q13i1j2I static_cast<real_type_t<T>>((-0.8229838658936563945796))
#define Q13i1j3R static_cast<real_type_t<T>>((0.1205366802553230533491))
#define Q13i1j3I static_cast<real_type_t<T>>((-0.9927088740980539928007))
#define Q13i1j4R static_cast<real_type_t<T>>((-0.3546048870425356259696))
#define Q13i1j4I static_cast<real_type_t<T>>((-0.9350162426854148234398))
#define Q13i1j5R static_cast<real_type_t<T>>((-0.7485107481711010986346))
#define Q13i1j5I static_cast<real_type_t<T>>((-0.6631226582407952023768))
#define Q13i1j6R static_cast<real_type_t<T>>((-0.9709418174260520271570))
#define Q13i1j6I static_cast<real_type_t<T>>((-0.2393156642875577671488))
#define Q13i2j1R static_cast<real_type_t<T>>((0.5680647467311558025118))
#define Q13i2j1I static_cast<real_type_t<T>>((-0.8229838658936563945796))
#define Q13i2j2R static_cast<real_type_t<T>>((-0.3546048870425356259696))
#define Q13i2j2I static_cast<real_type_t<T>>((-0.9350162426854148234398))
#define Q13i2j3R static_cast<real_type_t<T>>((-0.9709418174260520271570))
#define Q13i2j3I static_cast<real_type_t<T>>((-0.2393156642875577671488))
#define Q13i2j4R static_cast<real_type_t<T>>((-0.7485107481711010986346))
#define Q13i2j4I static_cast<real_type_t<T>>((0.6631226582407952023768))
#define Q13i2j5R static_cast<real_type_t<T>>((0.1205366802553230533491))
#define Q13i2j5I static_cast<real_type_t<T>>((0.9927088740980539928007))
#define Q13i2j6R static_cast<real_type_t<T>>((0.8854560256532098959004))
#define Q13i2j6I static_cast<real_type_t<T>>((0.4647231720437685456560))
#define Q13i3j1R static_cast<real_type_t<T>>((0.1205366802553230533491))
#define Q13i3j1I static_cast<real_type_t<T>>((-0.9927088740980539928007))
#define Q13i3j2R static_cast<real_type_t<T>>((-0.9709418174260520271570))
#define Q13i3j2I static_cast<real_type_t<T>>((-0.2393156642875577671488))
#define Q13i3j3R static_cast<real_type_t<T>>((-0.3546048870425356259696))
#define Q13i3j3I static_cast<real_type_t<T>>((0.9350162426854148234398))
#define Q13i3j4R static_cast<real_type_t<T>>((0.8854560256532098959004))
#define Q13i3j4I static_cast<real_type_t<T>>((0.4647231720437685456560))
#define Q13i3j5R static_cast<real_type_t<T>>((0.5680647467311558025118))
#define Q13i3j5I static_cast<real_type_t<T>>((-0.8229838658936563945796))
#define Q13i3j6R static_cast<real_type_t<T>>((-0.7485107481711010986346))
#define Q13i3j6I static_cast<real_type_t<T>>((-0.6631226582407952023768))
#define Q13i4j1R static_cast<real_type_t<T>>((-0.3546048870425356259696))
#define Q13i4j1I static_cast<real_type_t<T>>((-0.9350162426854148234398))
#define Q13i4j2R static_cast<real_type_t<T>>((-0.7485107481711010986346))
#define Q13i4j2I static_cast<real_type_t<T>>((0.6631226582407952023768))
#define Q13i4j3R static_cast<real_type_t<T>>((0.8854560256532098959004))
#define Q13i4j3I static_cast<real_type_t<T>>((0.4647231720437685456560))
#define Q13i4j4R static_cast<real_type_t<T>>((0.1205366802553230533491))
#define Q13i4j4I static_cast<real_type_t<T>>((-0.9927088740980539928007))
#define Q13i4j5R static_cast<real_type_t<T>>((-0.9709418174260520271570))
#define Q13i4j5I static_cast<real_type_t<T>>((0.2393156642875577671488))
#define Q13i4j6R static_cast<real_type_t<T>>((0.5680647467311558025118))
#define Q13i4j6I static_cast<real_type_t<T>>((0.8229838658936563945796))
#define Q13i5j1R static_cast<real_type_t<T>>((-0.7485107481711010986346))
#define Q13i5j1I static_cast<real_type_t<T>>((-0.6631226582407952023768))
#define Q13i5j2R static_cast<real_type_t<T>>((0.1205366802553230533491))
#define Q13i5j2I static_cast<real_type_t<T>>((0.9927088740980539928007))
#define Q13i5j3R static_cast<real_type_t<T>>((0.5680647467311558025118))
#define Q13i5j3I static_cast<real_type_t<T>>((-0.8229838658936563945796))
#define Q13i5j4R static_cast<real_type_t<T>>((-0.9709418174260520271570))
#define Q13i5j4I static_cast<real_type_t<T>>((0.2393156642875577671488))
#define Q13i5j5R static_cast<real_type_t<T>>((0.8854560256532098959004))
#define Q13i5j5I static_cast<real_type_t<T>>((0.4647231720437685456560))
#define Q13i5j6R static_cast<real_type_t<T>>((-0.3546048870425356259696))
#define Q13i5j6I static_cast<real_type_t<T>>((-0.9350162426854148234398))
#define Q13i6j1R static_cast<real_type_t<T>>((-0.9709418174260520271570))
#define Q13i6j1I static_cast<real_type_t<T>>((-0.2393156642875577671488))
#define Q13i6j2R static_cast<real_type_t<T>>((0.8854560256532098959004))
#define Q13i6j2I static_cast<real_type_t<T>>((0.4647231720437685456560))
#define Q13i6j3R static_cast<real_type_t<T>>((-0.7485107481711010986346))
#define Q13i6j3I static_cast<real_type_t<T>>((-0.6631226582407952023768))
#define Q13i6j4R static_cast<real_type_t<T>>((0.5680647467311558025118))
#define Q13i6j4I static_cast<real_type_t<T>>((0.8229838658936563945796))
#define Q13i6j5R static_cast<real_type_t<T>>((-0.3546048870425356259696))
#define Q13i6j5I static_cast<real_type_t<T>>((-0.9350162426854148234398))
#define Q13i6j6R static_cast<real_type_t<T>>((0.1205366802553230533491))
#define Q13i6j6I static_cast<real_type_t<T>>((0.9927088740980539928007))

#define Q17i1j1R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i1j1I static_cast<real_type_t<T>>((-0.3612416661871529487447))
#define Q17i1j2R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i1j2I static_cast<real_type_t<T>>((-0.6736956436465572117127))
#define Q17i1j3R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i1j3I static_cast<real_type_t<T>>((-0.8951632913550623220670))
#define Q17i1j4R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i1j4I static_cast<real_type_t<T>>((-0.9957341762950345218712))
#define Q17i1j5R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i1j5I static_cast<real_type_t<T>>((-0.9618256431728190704088))
#define Q17i1j6R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i1j6I static_cast<real_type_t<T>>((-0.7980172272802395033328))
#define Q17i1j7R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i1j7I static_cast<real_type_t<T>>((-0.5264321628773558002446))
#define Q17i1j8R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i1j8I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i2j1R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i2j1I static_cast<real_type_t<T>>((-0.6736956436465572117127))
#define Q17i2j2R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i2j2I static_cast<real_type_t<T>>((-0.9957341762950345218712))
#define Q17i2j3R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i2j3I static_cast<real_type_t<T>>((-0.7980172272802395033328))
#define Q17i2j4R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i2j4I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i2j5R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i2j5I static_cast<real_type_t<T>>((0.5264321628773558002446))
#define Q17i2j6R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i2j6I static_cast<real_type_t<T>>((0.9618256431728190704088))
#define Q17i2j7R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i2j7I static_cast<real_type_t<T>>((0.8951632913550623220670))
#define Q17i2j8R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i2j8I static_cast<real_type_t<T>>((0.3612416661871529487447))
#define Q17i3j1R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i3j1I static_cast<real_type_t<T>>((-0.8951632913550623220670))
#define Q17i3j2R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i3j2I static_cast<real_type_t<T>>((-0.7980172272802395033328))
#define Q17i3j3R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i3j3I static_cast<real_type_t<T>>((0.1837495178165703315744))
#define Q17i3j4R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i3j4I static_cast<real_type_t<T>>((0.9618256431728190704088))
#define Q17i3j5R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i3j5I static_cast<real_type_t<T>>((0.6736956436465572117127))
#define Q17i3j6R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i3j6I static_cast<real_type_t<T>>((-0.3612416661871529487447))
#define Q17i3j7R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i3j7I static_cast<real_type_t<T>>((-0.9957341762950345218712))
#define Q17i3j8R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i3j8I static_cast<real_type_t<T>>((-0.5264321628773558002446))
#define Q17i4j1R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i4j1I static_cast<real_type_t<T>>((-0.9957341762950345218712))
#define Q17i4j2R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i4j2I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i4j3R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i4j3I static_cast<real_type_t<T>>((0.9618256431728190704088))
#define Q17i4j4R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i4j4I static_cast<real_type_t<T>>((0.3612416661871529487447))
#define Q17i4j5R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i4j5I static_cast<real_type_t<T>>((-0.8951632913550623220670))
#define Q17i4j6R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i4j6I static_cast<real_type_t<T>>((-0.5264321628773558002446))
#define Q17i4j7R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i4j7I static_cast<real_type_t<T>>((0.7980172272802395033328))
#define Q17i4j8R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i4j8I static_cast<real_type_t<T>>((0.6736956436465572117127))
#define Q17i5j1R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i5j1I static_cast<real_type_t<T>>((-0.9618256431728190704088))
#define Q17i5j2R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i5j2I static_cast<real_type_t<T>>((0.5264321628773558002446))
#define Q17i5j3R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i5j3I static_cast<real_type_t<T>>((0.6736956436465572117127))
#define Q17i5j4R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i5j4I static_cast<real_type_t<T>>((-0.8951632913550623220670))
#define Q17i5j5R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i5j5I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i5j6R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i5j6I static_cast<real_type_t<T>>((0.9957341762950345218712))
#define Q17i5j7R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i5j7I static_cast<real_type_t<T>>((-0.3612416661871529487447))
#define Q17i5j8R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i5j8I static_cast<real_type_t<T>>((-0.7980172272802395033328))
#define Q17i6j1R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i6j1I static_cast<real_type_t<T>>((-0.7980172272802395033328))
#define Q17i6j2R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i6j2I static_cast<real_type_t<T>>((0.9618256431728190704088))
#define Q17i6j3R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i6j3I static_cast<real_type_t<T>>((-0.3612416661871529487447))
#define Q17i6j4R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i6j4I static_cast<real_type_t<T>>((-0.5264321628773558002446))
#define Q17i6j5R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i6j5I static_cast<real_type_t<T>>((0.9957341762950345218712))
#define Q17i6j6R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i6j6I static_cast<real_type_t<T>>((-0.6736956436465572117127))
#define Q17i6j7R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i6j7I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i6j8R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i6j8I static_cast<real_type_t<T>>((0.8951632913550623220670))
#define Q17i7j1R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i7j1I static_cast<real_type_t<T>>((-0.5264321628773558002446))
#define Q17i7j2R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i7j2I static_cast<real_type_t<T>>((0.8951632913550623220670))
#define Q17i7j3R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i7j3I static_cast<real_type_t<T>>((-0.9957341762950345218712))
#define Q17i7j4R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i7j4I static_cast<real_type_t<T>>((0.7980172272802395033328))
#define Q17i7j5R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i7j5I static_cast<real_type_t<T>>((-0.3612416661871529487447))
#define Q17i7j6R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i7j6I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i7j7R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i7j7I static_cast<real_type_t<T>>((0.6736956436465572117127))
#define Q17i7j8R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i7j8I static_cast<real_type_t<T>>((-0.9618256431728190704088))
#define Q17i8j1R static_cast<real_type_t<T>>((-0.9829730996839017782819))
#define Q17i8j1I static_cast<real_type_t<T>>((-0.1837495178165703315744))
#define Q17i8j2R static_cast<real_type_t<T>>((0.9324722294043558045731))
#define Q17i8j2I static_cast<real_type_t<T>>((0.3612416661871529487447))
#define Q17i8j3R static_cast<real_type_t<T>>((-0.8502171357296141521341))
#define Q17i8j3I static_cast<real_type_t<T>>((-0.5264321628773558002446))
#define Q17i8j4R static_cast<real_type_t<T>>((0.7390089172206591159245))
#define Q17i8j4I static_cast<real_type_t<T>>((0.6736956436465572117127))
#define Q17i8j5R static_cast<real_type_t<T>>((-0.6026346363792563891786))
#define Q17i8j5I static_cast<real_type_t<T>>((-0.7980172272802395033328))
#define Q17i8j6R static_cast<real_type_t<T>>((0.4457383557765382673965))
#define Q17i8j6I static_cast<real_type_t<T>>((0.8951632913550623220670))
#define Q17i8j7R static_cast<real_type_t<T>>((-0.2736629900720828635391))
#define Q17i8j7I static_cast<real_type_t<T>>((-0.9618256431728190704088))
#define Q17i8j8R static_cast<real_type_t<T>>((0.09226835946330199523965))
#define Q17i8j8I static_cast<real_type_t<T>>((0.9957341762950345218712))

// butterfly radix-11 constants
// Each macro expands to a floating-point literal cast to real_type_t<T>, so it
// can only be used where a type named T is in scope at the point of expansion.
// NOTE(review): the values decrease from b11_0 through b11_7, but b11_8 is
// smaller than b11_9 -- presumably intentional; confirm against the radix-11
// butterfly that consumes these constants.
#define b11_0 static_cast<real_type_t<T>>(0.9898214418809327)
#define b11_1 static_cast<real_type_t<T>>(0.9594929736144973)
#define b11_2 static_cast<real_type_t<T>>(0.9189859472289947)
#define b11_3 static_cast<real_type_t<T>>(0.8767688310025893)
#define b11_4 static_cast<real_type_t<T>>(0.8308300260037728)
#define b11_5 static_cast<real_type_t<T>>(0.7784344533346518)
#define b11_6 static_cast<real_type_t<T>>(0.7153703234534297)
#define b11_7 static_cast<real_type_t<T>>(0.6343562706824244)
#define b11_8 static_cast<real_type_t<T>>(0.3425847256816375)
#define b11_9 static_cast<real_type_t<T>>(0.5211085581132027)

// butterfly radix-13 constants
// Each macro expands to a floating-point literal cast to real_type_t<T>, so it
// can only be used where a type named T is in scope at the point of expansion.
// The values are listed in strictly decreasing order from b13_0 to b13_24.
// Numerically, b13_3 matches sqrt(3)/2, b13_17 is exactly 1/2 and b13_23
// matches 1/12 to the printed precision.
#define b13_0 static_cast<real_type_t<T>>(0.9682872443619840)
#define b13_1 static_cast<real_type_t<T>>(0.9578059925946651)
#define b13_2 static_cast<real_type_t<T>>(0.8755023024091479)
#define b13_3 static_cast<real_type_t<T>>(0.8660254037844386)
#define b13_4 static_cast<real_type_t<T>>(0.8595425350987748)
#define b13_5 static_cast<real_type_t<T>>(0.8534800018598239)
#define b13_6 static_cast<real_type_t<T>>(0.7693388175729806)
#define b13_7 static_cast<real_type_t<T>>(0.6865583707817543)
#define b13_8 static_cast<real_type_t<T>>(0.6122646503767565)
#define b13_9 static_cast<real_type_t<T>>(0.6004772719326652)
#define b13_10 static_cast<real_type_t<T>>(0.5817047785105157)
#define b13_11 static_cast<real_type_t<T>>(0.5751407294740031)
#define b13_12 static_cast<real_type_t<T>>(0.5220263851612750)
#define b13_13 static_cast<real_type_t<T>>(0.5200285718888646)
#define b13_14 static_cast<real_type_t<T>>(0.5165207806234897)
#define b13_15 static_cast<real_type_t<T>>(0.5149187780863157)
#define b13_16 static_cast<real_type_t<T>>(0.5035370328637666)
#define b13_17 static_cast<real_type_t<T>>(0.5000000000000000)
#define b13_18 static_cast<real_type_t<T>>(0.3027756377319946)
#define b13_19 static_cast<real_type_t<T>>(0.3014792600477098)
#define b13_20 static_cast<real_type_t<T>>(0.3004626062886657)
#define b13_21 static_cast<real_type_t<T>>(0.2517685164318833)
#define b13_22 static_cast<real_type_t<T>>(0.2261094450357824)
#define b13_23 static_cast<real_type_t<T>>(0.0833333333333333)
#define b13_24 static_cast<real_type_t<T>>(0.0386329546443481)

// butterfly radix-16 constants
// C16A and C16B numerically match cos(pi/8) and sin(pi/8) respectively.
// Both are cast to real_type_t<T>, so a type named T must be in scope where
// the macros are expanded.
#define C16A static_cast<real_type_t<T>>(0.923879532511286738)
#define C16B static_cast<real_type_t<T>>(0.382683432365089837)

#endif //  BUTTERFLY_CONSTANT_H
)_PY_EMBED_"};
const char* real2complex_device_h {
R"_PY_EMBED_(
// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef REAL_TO_COMPLEX_DEVICE_H
#define REAL_TO_COMPLEX_DEVICE_H

// Post-processing step of the even-length real-to-complex transform, reading
// from `input` and writing to `output` (offset by output_base).
//
// A call with idx_p == 0 produces the elements at offsets 0 and half_N (and
// at quarter_N when Ndiv4 is set).  Any other call combines input[idx_p] and
// input[idx_q] with twiddles[idx_p] and produces the elements at idx_p and
// idx_q.  All results go through the store callback chosen by cbtype and are
// multiplied by scale_factor when SCALE is true.  The load-callback
// arguments are part of the signature but are not used by this function.
template <typename Tcomplex, bool Ndiv4, CallbackType cbtype, bool SCALE = false>
__device__ inline void post_process_interleaved(const size_t    idx_p,
                                                const size_t    idx_q,
                                                const size_t    half_N,
                                                const size_t    quarter_N,
                                                const Tcomplex* input,
                                                Tcomplex*       output,
                                                size_t          output_base,
                                                const Tcomplex* twiddles,
                                                void* __restrict__ load_cb_fn,
                                                void* __restrict__ load_cb_data,
                                                uint32_t load_cb_lds_bytes,
                                                void* __restrict__ store_cb_fn,
                                                void* __restrict__ store_cb_data,
                                                const real_type_t<Tcomplex> scale_factor = 0.0)
{
    // post process is never the first kernel, so the input is read
    // directly instead of going through the load callback
    auto store_cb = get_store_cb<Tcomplex, cbtype>(store_cb_fn);

    // Hands one value to the store callback at output_base + pos, applying
    // the optional scaling first.
    auto emit = [&](const size_t pos, Tcomplex val) {
        store_cb(output,
                 output_base + pos,
                 SCALE ? (val * scale_factor) : val,
                 store_cb_data,
                 nullptr);
    };

    Tcomplex res;

    if(idx_p != 0)
    {
        // general case: combine the pair (idx_p, idx_q)
        const Tcomplex lo        = input[idx_p];
        const Tcomplex hi        = input[idx_q];
        const Tcomplex half_sum  = 0.5 * (lo + hi);
        const Tcomplex half_diff = 0.5 * (lo - hi);

        // only one twiddle is loaded: the one for idx_q is
        // -conj(tw) = (-tw.x, tw.y)
        const Tcomplex tw = twiddles[idx_p];

        res.x = half_sum.x + half_diff.x * tw.y + half_sum.y * tw.x;
        res.y = half_diff.y + half_sum.y * tw.y - half_diff.x * tw.x;
        emit(idx_p, res);

        res.x = half_sum.x - half_diff.x * tw.y - half_sum.y * tw.x;
        res.y = -half_diff.y + half_sum.y * tw.y - half_diff.x * tw.x;
        emit(idx_q, res);
        return;
    }

    // idx_p == 0: element half_N is (re - im, 0) of input[0] ...
    res.x = input[0].x - input[0].y;
    res.y = 0;
    emit(half_N, res);

    // ... and element 0 is (re + im, 0) of input[0]
    res.x = input[0].x + input[0].y;
    res.y = 0;
    emit(0, res);

    if(Ndiv4)
    {
        // the quarter_N element is the conjugate of its input
        res.x = input[quarter_N].x;
        res.y = -input[quarter_N].y;
        emit(quarter_N, res);
    }
}

// TODO: rework pre/post processing
//
// In-place variant of the even-length real-to-complex post-processing.
// Operates on inout at offset_base + idx_p and offset_base + idx_q.  Both
// operands are read before a block-wide barrier and every store happens
// after it; results are written through the store callback chosen by
// cbtype.  Threads with idx_p >= quarter_N only take part in the barrier.
// half_N and the load-callback arguments are not used by this function.
template <typename T, bool Ndiv4, CallbackType cbtype>
__device__ inline void post_process_interleaved_inplace(const size_t idx_p,
                                                        const size_t idx_q,
                                                        const size_t half_N,
                                                        const size_t quarter_N,
                                                        T*           inout,
                                                        size_t       offset_base,
                                                        const T*     twiddles,
                                                        void* __restrict__ load_cb_fn,
                                                        void* __restrict__ load_cb_data,
                                                        uint32_t load_cb_lds_bytes,
                                                        void* __restrict__ store_cb_fn,
                                                        void* __restrict__ store_cb_data)
{
    // post process is never the first kernel, so global memory is read
    // directly rather than through the load callback
    auto store_cb = get_store_cb<T, cbtype>(store_cb_fn);

    const size_t pos_p  = offset_base + idx_p;
    const size_t pos_q  = offset_base + idx_q;
    const bool   active = idx_p < quarter_N;

    T lo, hi, res;
    if(active)
    {
        lo = inout[pos_p];
        hi = inout[pos_q];
    }

    // executed unconditionally so that every thread of the block reaches it
    __syncthreads();

    if(idx_p == 0)
    {
        res.x = lo.x + lo.y;
        res.y = 0;
        store_cb(inout, pos_p, res, store_cb_data, nullptr);

        res.x = lo.x - lo.y;
        res.y = 0;
        store_cb(inout, pos_q, res, store_cb_data, nullptr);

        if(Ndiv4)
        {
            // conjugate the quarter_N element in place
            res   = inout[offset_base + quarter_N];
            res.y = -res.y;
            store_cb(inout, offset_base + quarter_N, res, store_cb_data, nullptr);
        }
    }
    else if(active)
    {
        const T half_sum  = 0.5 * (lo + hi);
        const T half_diff = 0.5 * (lo - hi);

        // the twiddle for idx_q is -conj(tw) = (-tw.x, tw.y), so a single
        // load covers both outputs
        const T tw = twiddles[idx_p];

        res.x = half_sum.x + half_diff.x * tw.y + half_sum.y * tw.x;
        res.y = half_diff.y + half_sum.y * tw.y - half_diff.x * tw.x;
        store_cb(inout, pos_p, res, store_cb_data, nullptr);

        res.x = half_sum.x - half_diff.x * tw.y - half_sum.y * tw.x;
        res.y = -half_diff.y + half_sum.y * tw.y - half_diff.x * tw.x;
        store_cb(inout, pos_q, res, store_cb_data, nullptr);
    }
}

// This function and real_pre_process_kernel_inplace are only for in-place
// use in LDS, so neither of them involves callbacks.
//
// Post-processes the element pair (idx_p, idx_q) of inout, both offset by
// offset_base.  Calls with idx_p >= quarter_N do nothing.  For idx_p == 0
// the two elements become (re + im, 0) and (re - im, 0) of the idx_p input,
// and with Ndiv4 the quarter_N element is conjugated.
template <typename Tcomplex, bool Ndiv4>
__device__ inline void real_post_process_kernel_inplace(const size_t    idx_p,
                                                        const size_t    idx_q,
                                                        const size_t    quarter_N,
                                                        Tcomplex*       inout,
                                                        size_t          offset_base,
                                                        const Tcomplex* twiddles)
{
    if(idx_p >= quarter_N)
        return;

    Tcomplex& slot_p = inout[offset_base + idx_p];
    Tcomplex& slot_q = inout[offset_base + idx_q];

    // snapshot both inputs before either slot is overwritten
    Tcomplex lo = slot_p;
    Tcomplex hi = slot_q;

    if(idx_p != 0)
    {
        const Tcomplex half_sum  = 0.5 * (lo + hi);
        const Tcomplex half_diff = 0.5 * (lo - hi);

        // the twiddle for idx_q is -conj(tw) = (-tw.x, tw.y)
        const Tcomplex tw = twiddles[idx_p];

        slot_p.x = half_sum.x + half_diff.x * tw.y + half_sum.y * tw.x;
        slot_p.y = half_diff.y + half_sum.y * tw.y - half_diff.x * tw.x;

        slot_q.x = half_sum.x - half_diff.x * tw.y - half_sum.y * tw.x;
        slot_q.y = -half_diff.y + half_sum.y * tw.y - half_diff.x * tw.x;
        return;
    }

    slot_p.x = lo.x + lo.y;
    slot_p.y = 0;

    slot_q.x = lo.x - lo.y;
    slot_q.y = 0;

    if(Ndiv4)
    {
        Tcomplex& mid = inout[offset_base + quarter_N];
        mid.y         = -mid.y;
    }
}

// In-place pre-processing for the even-length complex-to-real transform.
//
// Handles the element pair (idx_p, idx_q) of inout, both offset by
// offset_base.  Calls with idx_p >= quarter_N do nothing.  twiddles is
// indexed by idx_p and is not read when idx_p == 0.
//
// Template parameters:
//   Tcomplex - complex element type exposing .x (real) and .y (imaginary)
//   Ndiv4    - when true, the idx_p == 0 call also replaces the element at
//              quarter_N by twice its conjugate
template <typename Tcomplex, bool Ndiv4>
__device__ inline void real_pre_process_kernel_inplace(const size_t    idx_p,
                                                       const size_t    idx_q,
                                                       const size_t    quarter_N,
                                                       Tcomplex*       inout,
                                                       size_t          offset_base,
                                                       const Tcomplex* twiddles)
{
    if(idx_p < quarter_N)
    {
        // read both inputs up front; the slots are overwritten below
        Tcomplex p = inout[offset_base + idx_p];
        Tcomplex q = inout[offset_base + idx_q];

        if(idx_p == 0)
        {
            // NB: multi-dimensional transforms may have non-zero
            // imaginary part at index 0 or at the Nyquist frequency,
            // so p and q are combined as full complex values:
            //   result = (p + q) + i * (p - q)
            // When both imaginary parts are zero this reduces to
            // (p.x + q.x, p.x - q.x).
            inout[offset_base + idx_p].x = p.x - p.y + q.x + q.y;
            inout[offset_base + idx_p].y = p.x + p.y - q.x + q.y;

            if(Ndiv4)
            {
                auto quarter_elem                = inout[offset_base + quarter_N];
                inout[offset_base + quarter_N].x = 2.0 * quarter_elem.x;
                inout[offset_base + quarter_N].y = -2.0 * quarter_elem.y;
            }
        }
        else
        {
            const Tcomplex u = p + q;
            const Tcomplex v = p - q;

            const Tcomplex twd_p = twiddles[idx_p];
            // NB: twd_q = -conj(twd_p);

            inout[offset_base + idx_p].x = u.x + v.x * twd_p.y - u.y * twd_p.x;
            inout[offset_base + idx_p].y = v.y + u.y * twd_p.y + v.x * twd_p.x;

            inout[offset_base + idx_q].x = u.x - v.x * twd_p.y + u.y * twd_p.x;
            inout[offset_base + idx_q].y = -v.y + u.y * twd_p.y + v.x * twd_p.x;
        }
    }
}

#endif
)_PY_EMBED_"};
const char* large_twiddles_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// Builds one twiddle factor from up to three table lookups.
//
// The index u is consumed Base bits at a time, lowest bits first.  The k-th
// group of bits (k = 0 .. Steps-1) selects an entry from the k-th sub-table,
// where each sub-table holds (1 << Base) entries and the sub-tables are laid
// out back to back in `twiddles`.  The selected entries are multiplied
// together as complex numbers (x = real part, y = imaginary part).
template <typename T, size_t Base, size_t Steps>
__device__ T TW_NSteps(const T* const twiddles, size_t u)
{
    // only 1, 2 or 3 lookups are implemented
    static_assert(Steps < 4, "4-steps is not support");

    const size_t table_len = (1 << Base); // entries per sub-table
    const size_t low_mask  = (1 << Base) - 1; // selects the lowest Base bits

    T acc = twiddles[u & low_mask];

    // trip count is a compile-time constant (at most two iterations)
    for(size_t step = 1; step < Steps; ++step)
    {
        u >>= Base; // drop the bits that were just consumed

        const T& factor = twiddles[table_len * step + (u & low_mask)];

        // complex product acc * factor
        acc = T((acc.x * factor.x - acc.y * factor.y), (acc.y * factor.x + acc.x * factor.y));
    }

    return acc;
}
)_PY_EMBED_"};
const char* radix_2_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// Radix-2 butterfly, in place: afterwards *R0 holds the sum and *R1 the
// difference (*R0 - *R1) of the two inputs.
template <typename T>
__device__ void FwdRad2B1(T* R0, T* R1)
{
    T& top    = *R0;
    T& bottom = *R1;

    // difference first; the sum is then recovered as 2 * top - difference
    bottom = top - bottom;
    top    = 2.0 * top - bottom;
}

// Radix-2 butterfly for the inverse direction, in place.  The arithmetic is
// the same as in FwdRad2B1: *R0 becomes the sum and *R1 the difference
// (*R0 - *R1) of the two inputs.
template <typename T>
__device__ void InvRad2B1(T* R0, T* R1)
{
    T& top    = *R0;
    T& bottom = *R1;

    // difference first; the sum is then recovered as 2 * top - difference
    bottom = top - bottom;
    top    = 2.0 * top - bottom;
}
)_PY_EMBED_"};
const char* radix_3_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// Radix-3 butterfly (forward direction), in place over *R0, *R1 and *R2.
// All inputs are copied before any output is written.  The inverse variant
// InvRad3B1 differs only in the sign of the C3QB terms.
template <typename T>
__device__ void FwdRad3B1(T* R0, T* R1, T* R2)
{
    using real_t = real_type_t<T>;

    const T a = *R0;
    const T b = *R1;
    const T c = *R2;

    // sums and differences of the two non-leading inputs
    const real_t sum_re  = b.x + c.x;
    const real_t sum_im  = b.y + c.y;
    const real_t diff_re = b.x - c.x;
    const real_t diff_im = b.y - c.y;

    // part shared by outputs 1 and 2
    const real_t base_re = a.x - C3QA * sum_re;
    const real_t base_im = a.y - C3QA * sum_im;

    R0->x = a.x + b.x + c.x;
    R0->y = a.y + b.y + c.y;
    R1->x = base_re + C3QB * diff_im;
    R1->y = base_im - C3QB * diff_re;
    R2->x = base_re - C3QB * diff_im;
    R2->y = base_im + C3QB * diff_re;
}

// Radix-3 butterfly (inverse direction), in place over *R0, *R1 and *R2.
// All inputs are copied before any output is written.  The forward variant
// FwdRad3B1 differs only in the sign of the C3QB terms.
template <typename T>
__device__ void InvRad3B1(T* R0, T* R1, T* R2)
{
    using real_t = real_type_t<T>;

    const T a = *R0;
    const T b = *R1;
    const T c = *R2;

    // sums and differences of the two non-leading inputs
    const real_t sum_re  = b.x + c.x;
    const real_t sum_im  = b.y + c.y;
    const real_t diff_re = b.x - c.x;
    const real_t diff_im = b.y - c.y;

    // part shared by outputs 1 and 2
    const real_t base_re = a.x - C3QA * sum_re;
    const real_t base_im = a.y - C3QA * sum_im;

    R0->x = a.x + b.x + c.x;
    R0->y = a.y + b.y + c.y;
    R1->x = base_re - C3QB * diff_im;
    R1->y = base_im + C3QB * diff_re;
    R2->x = base_re + C3QB * diff_im;
    R2->y = base_im - C3QB * diff_re;
}
)_PY_EMBED_"};
const char* radix_4_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// Radix-4 butterfly (forward direction), in place.  Note the parameter
// order: the second argument is named R2 and the third R1.  The butterfly is
// built from radix-2 stages; the odd difference is rotated by
// T(-y, x), i.e. multiplied by +i, before the final stage.
template <typename T>
__device__ void FwdRad4B1(T* R0, T* R2, T* R1, T* R3)
{
    T& r0 = *R0;
    T& r1 = *R1;
    T& r2 = *R2;
    T& r3 = *R3;

    // first stage: (r0, r1) and (r2, r3) become sum / difference pairs
    r1 = r0 - r1;
    r0 = 2.0 * r0 - r1;
    r3 = r2 - r3;
    r2 = 2.0 * r2 - r3;

    // second stage on the two sums
    r2 = r0 - r2;
    r0 = 2.0 * r0 - r2;

    // second stage on the two differences, with r3 rotated
    r3 = r1 + T(-r3.y, r3.x);
    r1 = 2.0 * r1 - r3;

    // exchange r1 and r2
    const T held = r1;
    r1           = r2;
    r2           = held;
}

// Radix-4 butterfly (inverse direction), in place.  Note the parameter
// order: the second argument is named R2 and the third R1.  Identical to
// FwdRad4B1 except that the odd difference is rotated by T(y, -x), i.e.
// multiplied by -i.
template <typename T>
__device__ void InvRad4B1(T* R0, T* R2, T* R1, T* R3)
{
    T& r0 = *R0;
    T& r1 = *R1;
    T& r2 = *R2;
    T& r3 = *R3;

    // first stage: (r0, r1) and (r2, r3) become sum / difference pairs
    r1 = r0 - r1;
    r0 = 2.0 * r0 - r1;
    r3 = r2 - r3;
    r2 = 2.0 * r2 - r3;

    // second stage on the two sums
    r2 = r0 - r2;
    r0 = 2.0 * r0 - r2;

    // second stage on the two differences, with r3 rotated
    r3 = r1 + T(r3.y, -r3.x);
    r1 = 2.0 * r1 - r3;

    // exchange r1 and r2
    const T held = r1;
    r1           = r2;
    r2           = held;
}
)_PY_EMBED_"};
const char* radix_5_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// Radix-5 butterfly (forward direction), in place over *R0 .. *R4.
// All ten output components are first computed into the TR*/TI* temporaries
// from the unmodified inputs and only then written back, so no input is
// overwritten before it has been fully consumed.
// Outputs 1 and 4 (and likewise 2 and 3) share every term and differ only in
// the sign of the C5QB and C5QD contributions.  InvRad5B1 is the same
// computation with the signs of all C5QB and C5QD terms flipped.
// C5QA..C5QD are macros defined outside this function.
template <typename T>
__device__ void FwdRad5B1(T* R0, T* R1, T* R2, T* R3, T* R4)
{

    real_type_t<T> TR0, TI0, TR1, TI1, TR2, TI2, TR3, TI3, TR4, TI4;

    // real parts of the five outputs
    TR0 = (*R0).x + (*R1).x + (*R2).x + (*R3).x + (*R4).x;
    TR1 = ((*R0).x - C5QC * ((*R2).x + (*R3).x)) + C5QB * ((*R1).y - (*R4).y)
          + C5QD * ((*R2).y - (*R3).y) + C5QA * (((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));
    TR4 = ((*R0).x - C5QC * ((*R2).x + (*R3).x)) - C5QB * ((*R1).y - (*R4).y)
          - C5QD * ((*R2).y - (*R3).y) + C5QA * (((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));
    TR2 = ((*R0).x - C5QC * ((*R1).x + (*R4).x)) - C5QB * ((*R2).y - (*R3).y)
          + C5QD * ((*R1).y - (*R4).y) + C5QA * (((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));
    TR3 = ((*R0).x - C5QC * ((*R1).x + (*R4).x)) + C5QB * ((*R2).y - (*R3).y)
          - C5QD * ((*R1).y - (*R4).y) + C5QA * (((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));

    // imaginary parts of the five outputs
    TI0 = (*R0).y + (*R1).y + (*R2).y + (*R3).y + (*R4).y;
    TI1 = ((*R0).y - C5QC * ((*R2).y + (*R3).y)) - C5QB * ((*R1).x - (*R4).x)
          - C5QD * ((*R2).x - (*R3).x) + C5QA * (((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));
    TI4 = ((*R0).y - C5QC * ((*R2).y + (*R3).y)) + C5QB * ((*R1).x - (*R4).x)
          + C5QD * ((*R2).x - (*R3).x) + C5QA * (((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));
    TI2 = ((*R0).y - C5QC * ((*R1).y + (*R4).y)) + C5QB * ((*R2).x - (*R3).x)
          - C5QD * ((*R1).x - (*R4).x) + C5QA * (((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));
    TI3 = ((*R0).y - C5QC * ((*R1).y + (*R4).y)) - C5QB * ((*R2).x - (*R3).x)
          + C5QD * ((*R1).x - (*R4).x) + C5QA * (((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));

    // write the results back over the inputs
    ((*R0).x) = TR0;
    ((*R0).y) = TI0;
    ((*R1).x) = TR1;
    ((*R1).y) = TI1;
    ((*R2).x) = TR2;
    ((*R2).y) = TI2;
    ((*R3).x) = TR3;
    ((*R3).y) = TI3;
    ((*R4).x) = TR4;
    ((*R4).y) = TI4;
}

// Radix-5 butterfly (inverse direction), in place over *R0 .. *R4.
// All ten output components are first computed into the TR*/TI* temporaries
// from the unmodified inputs and only then written back, so no input is
// overwritten before it has been fully consumed.
// Outputs 1 and 4 (and likewise 2 and 3) share every term and differ only in
// the sign of the C5QB and C5QD contributions.  FwdRad5B1 is the same
// computation with the signs of all C5QB and C5QD terms flipped.
// C5QA..C5QD are macros defined outside this function.
template <typename T>
__device__ void InvRad5B1(T* R0, T* R1, T* R2, T* R3, T* R4)
{

    real_type_t<T> TR0, TI0, TR1, TI1, TR2, TI2, TR3, TI3, TR4, TI4;

    // real parts of the five outputs
    TR0 = (*R0).x + (*R1).x + (*R2).x + (*R3).x + (*R4).x;
    TR1 = ((*R0).x - C5QC * ((*R2).x + (*R3).x)) - C5QB * ((*R1).y - (*R4).y)
          - C5QD * ((*R2).y - (*R3).y) + C5QA * (((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));
    TR4 = ((*R0).x - C5QC * ((*R2).x + (*R3).x)) + C5QB * ((*R1).y - (*R4).y)
          + C5QD * ((*R2).y - (*R3).y) + C5QA * (((*R1).x - (*R2).x) + ((*R4).x - (*R3).x));
    TR2 = ((*R0).x - C5QC * ((*R1).x + (*R4).x)) + C5QB * ((*R2).y - (*R3).y)
          - C5QD * ((*R1).y - (*R4).y) + C5QA * (((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));
    TR3 = ((*R0).x - C5QC * ((*R1).x + (*R4).x)) - C5QB * ((*R2).y - (*R3).y)
          + C5QD * ((*R1).y - (*R4).y) + C5QA * (((*R2).x - (*R1).x) + ((*R3).x - (*R4).x));

    // imaginary parts of the five outputs
    TI0 = (*R0).y + (*R1).y + (*R2).y + (*R3).y + (*R4).y;
    TI1 = ((*R0).y - C5QC * ((*R2).y + (*R3).y)) + C5QB * ((*R1).x - (*R4).x)
          + C5QD * ((*R2).x - (*R3).x) + C5QA * (((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));
    TI4 = ((*R0).y - C5QC * ((*R2).y + (*R3).y)) - C5QB * ((*R1).x - (*R4).x)
          - C5QD * ((*R2).x - (*R3).x) + C5QA * (((*R1).y - (*R2).y) + ((*R4).y - (*R3).y));
    TI2 = ((*R0).y - C5QC * ((*R1).y + (*R4).y)) - C5QB * ((*R2).x - (*R3).x)
          + C5QD * ((*R1).x - (*R4).x) + C5QA * (((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));
    TI3 = ((*R0).y - C5QC * ((*R1).y + (*R4).y)) + C5QB * ((*R2).x - (*R3).x)
          - C5QD * ((*R1).x - (*R4).x) + C5QA * (((*R2).y - (*R1).y) + ((*R3).y - (*R4).y));

    // write the results back over the inputs
    ((*R0).x) = TR0;
    ((*R0).y) = TI0;
    ((*R1).x) = TR1;
    ((*R1).y) = TI1;
    ((*R2).x) = TR2;
    ((*R2).y) = TI2;
    ((*R3).x) = TR3;
    ((*R3).y) = TI3;
    ((*R4).x) = TR4;
    ((*R4).y) = TI4;
}
)_PY_EMBED_"};
const char* radix_6_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// Radix-6 butterfly (forward direction), in place over *R0 .. *R5.
// Structure visible in the code:
//   1. a 3-point butterfly over the even inputs (R0, R2, R4) -> TR/TI 0, 2, 4
//   2. a 3-point butterfly over the odd inputs  (R1, R3, R5) -> TR/TI 1, 3, 5
//   3. the odd results 3 and 5 are rotated using C3QA / C3QB and then added
//      to (outputs R0..R2) or subtracted from (outputs R3..R5) the even
//      results.
// All temporaries are computed before any input is overwritten.
// C3QA and C3QB are macros defined outside this function.
template <typename T>
__device__ void FwdRad6B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5)
{

    real_type_t<T> TR0, TI0, TR1, TI1, TR2, TI2, TR3, TI3, TR4, TI4, TR5, TI5;

    // 3-point butterfly over the even-indexed inputs
    TR0 = (*R0).x + (*R2).x + (*R4).x;
    TR2 = ((*R0).x - C3QA * ((*R2).x + (*R4).x)) + C3QB * ((*R2).y - (*R4).y);
    TR4 = ((*R0).x - C3QA * ((*R2).x + (*R4).x)) - C3QB * ((*R2).y - (*R4).y);

    TI0 = (*R0).y + (*R2).y + (*R4).y;
    TI2 = ((*R0).y - C3QA * ((*R2).y + (*R4).y)) - C3QB * ((*R2).x - (*R4).x);
    TI4 = ((*R0).y - C3QA * ((*R2).y + (*R4).y)) + C3QB * ((*R2).x - (*R4).x);

    // 3-point butterfly over the odd-indexed inputs
    TR1 = (*R1).x + (*R3).x + (*R5).x;
    TR3 = ((*R1).x - C3QA * ((*R3).x + (*R5).x)) + C3QB * ((*R3).y - (*R5).y);
    TR5 = ((*R1).x - C3QA * ((*R3).x + (*R5).x)) - C3QB * ((*R3).y - (*R5).y);

    TI1 = (*R1).y + (*R3).y + (*R5).y;
    TI3 = ((*R1).y - C3QA * ((*R3).y + (*R5).y)) - C3QB * ((*R3).x - (*R5).x);
    TI5 = ((*R1).y - C3QA * ((*R3).y + (*R5).y)) + C3QB * ((*R3).x - (*R5).x);

    // outputs 0..2: even result plus rotated odd result
    (*R0).x = TR0 + TR1;
    (*R1).x = TR2 + (C3QA * TR3 + C3QB * TI3);
    (*R2).x = TR4 + (-C3QA * TR5 + C3QB * TI5);

    (*R0).y = TI0 + TI1;
    (*R1).y = TI2 + (-C3QB * TR3 + C3QA * TI3);
    (*R2).y = TI4 + (-C3QB * TR5 - C3QA * TI5);

    // outputs 3..5: even result minus the same rotated odd result
    (*R3).x = TR0 - TR1;
    (*R4).x = TR2 - (C3QA * TR3 + C3QB * TI3);
    (*R5).x = TR4 - (-C3QA * TR5 + C3QB * TI5);

    (*R3).y = TI0 - TI1;
    (*R4).y = TI2 - (-C3QB * TR3 + C3QA * TI3);
    (*R5).y = TI4 - (-C3QB * TR5 - C3QA * TI5);
}

// Radix-6 butterfly (inverse direction), in place over *R0 .. *R5.
// Structure visible in the code:
//   1. a 3-point butterfly over the even inputs (R0, R2, R4) -> TR/TI 0, 2, 4
//   2. a 3-point butterfly over the odd inputs  (R1, R3, R5) -> TR/TI 1, 3, 5
//   3. the odd results 3 and 5 are rotated using C3QA / C3QB and then added
//      to (outputs R0..R2) or subtracted from (outputs R3..R5) the even
//      results.
// Compared with FwdRad6B1, every C3QB term has the opposite sign.
// All temporaries are computed before any input is overwritten.
// C3QA and C3QB are macros defined outside this function.
template <typename T>
__device__ void InvRad6B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5)
{

    real_type_t<T> TR0, TI0, TR1, TI1, TR2, TI2, TR3, TI3, TR4, TI4, TR5, TI5;

    // 3-point butterfly over the even-indexed inputs
    TR0 = (*R0).x + (*R2).x + (*R4).x;
    TR2 = ((*R0).x - C3QA * ((*R2).x + (*R4).x)) - C3QB * ((*R2).y - (*R4).y);
    TR4 = ((*R0).x - C3QA * ((*R2).x + (*R4).x)) + C3QB * ((*R2).y - (*R4).y);

    TI0 = (*R0).y + (*R2).y + (*R4).y;
    TI2 = ((*R0).y - C3QA * ((*R2).y + (*R4).y)) + C3QB * ((*R2).x - (*R4).x);
    TI4 = ((*R0).y - C3QA * ((*R2).y + (*R4).y)) - C3QB * ((*R2).x - (*R4).x);

    // 3-point butterfly over the odd-indexed inputs
    TR1 = (*R1).x + (*R3).x + (*R5).x;
    TR3 = ((*R1).x - C3QA * ((*R3).x + (*R5).x)) - C3QB * ((*R3).y - (*R5).y);
    TR5 = ((*R1).x - C3QA * ((*R3).x + (*R5).x)) + C3QB * ((*R3).y - (*R5).y);

    TI1 = (*R1).y + (*R3).y + (*R5).y;
    TI3 = ((*R1).y - C3QA * ((*R3).y + (*R5).y)) + C3QB * ((*R3).x - (*R5).x);
    TI5 = ((*R1).y - C3QA * ((*R3).y + (*R5).y)) - C3QB * ((*R3).x - (*R5).x);

    // outputs 0..2: even result plus rotated odd result
    (*R0).x = TR0 + TR1;
    (*R1).x = TR2 + (C3QA * TR3 - C3QB * TI3);
    (*R2).x = TR4 + (-C3QA * TR5 - C3QB * TI5);

    (*R0).y = TI0 + TI1;
    (*R1).y = TI2 + (C3QB * TR3 + C3QA * TI3);
    (*R2).y = TI4 + (C3QB * TR5 - C3QA * TI5);

    // outputs 3..5: even result minus the same rotated odd result
    (*R3).x = TR0 - TR1;
    (*R4).x = TR2 - (C3QA * TR3 - C3QB * TI3);
    (*R5).x = TR4 - (-C3QA * TR5 - C3QB * TI5);

    (*R3).y = TI0 - TI1;
    (*R4).y = TI2 - (C3QB * TR3 + C3QA * TI3);
    (*R5).y = TI4 - (C3QB * TR5 - C3QA * TI5);
}
)_PY_EMBED_"};
const char* radix_7_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// 7-point forward butterfly (complex), computed in place on R0..R6.
//
// All inputs are read up front; the intermediate terms m1..m8 are scaled by
// the C7Q1..C7Q8 constants and recombined, and the seven outputs are stored
// only at the very end.
template <typename T>
__device__ void FwdRad7B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6)
{
    // Sums and differences of the input pairs (R1,R6), (R2,R5), (R4,R3).
    T sum16 = *R1 + *R6;
    T dif16 = *R1 - *R6;
    T sum25 = *R2 + *R5;
    T dif25 = *R2 - *R5;
    T sum43 = *R4 + *R3;
    T dif43 = *R4 - *R3;

    // Second-level combinations of the pair sums and pair differences.
    T sumAcc = sum25 + sum16;
    T m4     = sum25 - sum16;
    T m2     = sum16 - sum43;
    T m3     = sum43 - sum25;
    T difAcc = dif43 + dif25;
    T m7     = dif43 - dif25;
    T m6     = dif16 - dif43;
    T m8     = dif25 - dif16;
    T m1     = sumAcc + sum43;
    T m5     = difAcc + dif16;

    // Sum of all seven inputs; must be formed before m1 is scaled.
    T total = *R0 + m1;

    // Scale the sum-derived terms.
    m1 *= C7Q1;
    m2 *= C7Q2;
    m3 *= C7Q3;
    m4 *= C7Q4;

    // Scale the difference-derived terms.
    m5 *= (C7Q5);
    m6 *= (C7Q6);
    m7 *= (C7Q7);
    m8 *= (C7Q8);

    // Recombine the scaled terms.
    T base = total + m1;
    T s1   = m2 + m3;
    T s2   = m4 - m3;
    T s3   = -m2 - m4;
    T d1   = m6 + m7;
    T d2   = m8 - m7;
    T d3   = -m8 - m6;
    T a1   = base + s1;
    T a2   = base + s2;
    T a3   = base + s3;
    T b1   = d1 + m5;
    T b2   = d2 + m5;
    T b3   = d3 + m5;

    // Store the outputs; the b-terms enter with x and y swapped.
    *R0   = total;
    R1->x = a1.x + b1.y;
    R1->y = a1.y - b1.x;
    R2->x = a3.x + b3.y;
    R2->y = a3.y - b3.x;
    R3->x = a2.x - b2.y;
    R3->y = a2.y + b2.x;
    R4->x = a2.x + b2.y;
    R4->y = a2.y - b2.x;
    R5->x = a3.x - b3.y;
    R5->y = a3.y + b3.x;
    R6->x = a1.x - b1.y;
    R6->y = a1.y + b1.x;
}

// 7-point backward (inverse) butterfly (complex), computed in place on R0..R6.
//
// Same dataflow as the forward 7-point butterfly in this header, except that
// the difference-derived terms m5..m8 are scaled by the negated
// C7Q5..C7Q8 constants.
template <typename T>
__device__ void InvRad7B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6)
{
    // Sums and differences of the input pairs (R1,R6), (R2,R5), (R4,R3).
    T sum16 = *R1 + *R6;
    T dif16 = *R1 - *R6;
    T sum25 = *R2 + *R5;
    T dif25 = *R2 - *R5;
    T sum43 = *R4 + *R3;
    T dif43 = *R4 - *R3;

    // Second-level combinations of the pair sums and pair differences.
    T sumAcc = sum25 + sum16;
    T m4     = sum25 - sum16;
    T m2     = sum16 - sum43;
    T m3     = sum43 - sum25;
    T difAcc = dif43 + dif25;
    T m7     = dif43 - dif25;
    T m6     = dif16 - dif43;
    T m8     = dif25 - dif16;
    T m1     = sumAcc + sum43;
    T m5     = difAcc + dif16;

    // Sum of all seven inputs; must be formed before m1 is scaled.
    T total = *R0 + m1;

    // Scale the sum-derived terms.
    m1 *= C7Q1;
    m2 *= C7Q2;
    m3 *= C7Q3;
    m4 *= C7Q4;

    // Scale the difference-derived terms with negated constants.
    m5 *= -(C7Q5);
    m6 *= -(C7Q6);
    m7 *= -(C7Q7);
    m8 *= -(C7Q8);

    // Recombine the scaled terms.
    T base = total + m1;
    T s1   = m2 + m3;
    T s2   = m4 - m3;
    T s3   = -m2 - m4;
    T d1   = m6 + m7;
    T d2   = m8 - m7;
    T d3   = -m8 - m6;
    T a1   = base + s1;
    T a2   = base + s2;
    T a3   = base + s3;
    T b1   = d1 + m5;
    T b2   = d2 + m5;
    T b3   = d3 + m5;

    // Store the outputs; the b-terms enter with x and y swapped.
    *R0   = total;
    R1->x = a1.x + b1.y;
    R1->y = a1.y - b1.x;
    R2->x = a3.x + b3.y;
    R2->y = a3.y - b3.x;
    R3->x = a2.x - b2.y;
    R3->y = a2.y + b2.x;
    R4->x = a2.x + b2.y;
    R4->y = a2.y - b2.x;
    R5->x = a3.x - b3.y;
    R5->y = a3.y + b3.x;
    R6->x = a1.x - b1.y;
    R6->y = a1.y + b1.x;
}
)_PY_EMBED_"};
const char* radix_8_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// 8-point forward butterfly, computed entirely in place.
//
// Parameter order: the names are declared as (R0, R4, R2, R6, R1, R5, R3, R7),
// so the k-th argument is NOT named Rk.
// NOTE(review): this looks like a bit-reversed argument order -- confirm
// against the call sites.
//
// Each "a = b - a; b = 2.0 * b - a;" pair leaves (old b - old a) in a and
// (old b + old a) in b without a temporary, so the statement order below
// must not be changed.
template <typename T>
__device__ void FwdRad8B1(T* R0, T* R4, T* R2, T* R6, T* R1, T* R5, T* R3, T* R7)
{

    // Scratch value used only for the two exchanges at the end.
    T res;

    // Stage 1: sum/difference of the pairs (R0,R1), (R2,R3), (R4,R5), (R6,R7).
    (*R1) = (*R0) - (*R1);
    (*R0) = 2.0 * (*R0) - (*R1);
    (*R3) = (*R2) - (*R3);
    (*R2) = 2.0 * (*R2) - (*R3);
    (*R5) = (*R4) - (*R5);
    (*R4) = 2.0 * (*R4) - (*R5);
    (*R7) = (*R6) - (*R7);
    (*R6) = 2.0 * (*R6) - (*R7);

    // Stage 2: combine (R0,R2), (R1,R3), (R4,R6), (R5,R7). For the R3 and R7
    // operands the value T(-y, x) is added instead of subtracting the value
    // itself (a multiplication by i if T's constructor takes
    // (real, imaginary) -- TODO confirm).
    (*R2) = (*R0) - (*R2);
    (*R0) = 2.0 * (*R0) - (*R2);
    (*R3) = (*R1) + T(-(*R3).y, (*R3).x);
    (*R1) = 2.0 * (*R1) - (*R3);
    (*R6) = (*R4) - (*R6);
    (*R4) = 2.0 * (*R4) - (*R6);
    (*R7) = (*R5) + T(-(*R7).y, (*R7).x);

    (*R5) = 2.0 * (*R5) - (*R7);

    // Stage 3: combine (R0,R4), (R1,R5), (R2,R6), (R3,R7). The R5 and R7
    // operands are scaled by C8Q and mixed with their (y, -x) counterpart.
    (*R4) = (*R0) - (*R4);
    (*R0) = 2.0 * (*R0) - (*R4);
    (*R5) = ((*R1) - C8Q * (*R5)) - C8Q * T((*R5).y, -(*R5).x);
    (*R1) = 2.0 * (*R1) - (*R5);
    (*R6) = (*R2) + T(-(*R6).y, (*R6).x);
    (*R2) = 2.0 * (*R2) - (*R6);
    (*R7) = ((*R3) + C8Q * (*R7)) - C8Q * T((*R7).y, -(*R7).x);
    (*R3) = 2.0 * (*R3) - (*R7);

    // Exchange the values held in R1/R4 and in R3/R6.
    res   = (*R1);
    (*R1) = (*R4);
    (*R4) = res;
    res   = (*R3);
    (*R3) = (*R6);
    (*R6) = res;
}

// 8-point inverse butterfly, computed entirely in place.
//
// Parameter order: the names are declared as (R0, R4, R2, R6, R1, R5, R3, R7),
// so the k-th argument is NOT named Rk.
// NOTE(review): this looks like a bit-reversed argument order -- confirm
// against the call sites.
//
// Each "a = b - a; b = 2.0 * b - a;" pair leaves (old b - old a) in a and
// (old b + old a) in b without a temporary, so the statement order below
// must not be changed. Compared with FwdRad8B1, the T(...) terms use
// (y, -x) where the forward version uses (-y, x), and the C8Q cross terms
// are added instead of subtracted.
template <typename T>
__device__ void InvRad8B1(T* R0, T* R4, T* R2, T* R6, T* R1, T* R5, T* R3, T* R7)
{

    // Scratch value used only for the two exchanges at the end.
    T res;

    // Stage 1: sum/difference of the pairs (R0,R1), (R2,R3), (R4,R5), (R6,R7).
    (*R1) = (*R0) - (*R1);
    (*R0) = 2.0 * (*R0) - (*R1);
    (*R3) = (*R2) - (*R3);
    (*R2) = 2.0 * (*R2) - (*R3);
    (*R5) = (*R4) - (*R5);
    (*R4) = 2.0 * (*R4) - (*R5);
    (*R7) = (*R6) - (*R7);
    (*R6) = 2.0 * (*R6) - (*R7);

    // Stage 2: combine (R0,R2), (R1,R3), (R4,R6), (R5,R7). For the R3 and R7
    // operands the value T(y, -x) is added instead of subtracting the value
    // itself (a multiplication by -i if T's constructor takes
    // (real, imaginary) -- TODO confirm).
    (*R2) = (*R0) - (*R2);
    (*R0) = 2.0 * (*R0) - (*R2);
    (*R3) = (*R1) + T((*R3).y, -(*R3).x);
    (*R1) = 2.0 * (*R1) - (*R3);
    (*R6) = (*R4) - (*R6);
    (*R4) = 2.0 * (*R4) - (*R6);
    (*R7) = (*R5) + T((*R7).y, -(*R7).x);
    (*R5) = 2.0 * (*R5) - (*R7);

    // Stage 3: combine (R0,R4), (R1,R5), (R2,R6), (R3,R7). The R5 and R7
    // operands are scaled by C8Q and mixed with their (y, -x) counterpart.
    (*R4) = (*R0) - (*R4);
    (*R0) = 2.0 * (*R0) - (*R4);
    (*R5) = ((*R1) - C8Q * (*R5)) + C8Q * T((*R5).y, -(*R5).x);
    (*R1) = 2.0 * (*R1) - (*R5);
    (*R6) = (*R2) + T((*R6).y, -(*R6).x);
    (*R2) = 2.0 * (*R2) - (*R6);
    (*R7) = ((*R3) + C8Q * (*R7)) + C8Q * T((*R7).y, -(*R7).x);
    (*R3) = 2.0 * (*R3) - (*R7);

    // Exchange the values held in R1/R4 and in R3/R6.
    res   = (*R1);
    (*R1) = (*R4);
    (*R4) = res;
    res   = (*R3);
    (*R3) = (*R6);
    (*R6) = res;
}
)_PY_EMBED_"};
const char* radix_9_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// 9-point forward butterfly, computed in place on R0..R8.
//
// The pair sums (v0..v2) and pair differences (p0..p2) of (R1,R8), (R2,R7)
// and (R3,R6) are captured first. After that, output registers are reused as
// temporaries, so the statement order below must not be changed.
template <typename T>
__device__ void FwdRad9B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8)
{
    // p2 is always multiplied by C9QF, so do it once in p2
    // update R0 at the end since the original R0 is used by others
    // we can also use v3 = R4 + R5 and p3 = R4 - R5
    // but it's ok to do without them and save regs
    T v0 = (*R1) + (*R8);
    T v1 = (*R2) + (*R7);
    T v2 = (*R3) + (*R6);

    T p0 = (*R1) - (*R8);
    T p1 = (*R2) - (*R7);
    T p2 = ((*R3) - (*R6)) * C9QF;

    // borrow R8 as temp (its original value is already captured in v0/p0)
    (*R8) = (C9QB * p0) + (C9QD * p1) + (p2) + (C9QH * ((*R4) - (*R5)));
    (*R1) = ((*R0) + (C9QA * v0) + (C9QC * v1) - (C9QE * v2) - (C9QG * ((*R4) + (*R5))))
            + T((*R8).y, -(*R8).x);
    // R1 already holds "+ T(y, -x)"; adding twice the opposite term gives R8
    // the same base value with "- T(y, -x)".
    (*R8) = (*R1) + 2.0 * T(-(*R8).y, (*R8).x);
    // borrow R7 as temp (its original value is already captured in v1/p1)
    (*R7) = -(C9QB * ((*R4) - (*R5))) + (C9QD * p0) - (p2) + (C9QH * p1);
    (*R2) = ((*R0) + (C9QA * ((*R4) + (*R5))) + (C9QC * v0) - (C9QE * v2) - (C9QG * v1))
            + T((*R7).y, -(*R7).x);
    (*R7) = (*R2) + 2.0 * T(-(*R7).y, (*R7).x);
    // borrow R6 as temp (its original value is already captured in v2/p2)
    (*R6) = C9QF * (p0 + ((*R4) - (*R5)) - p1);
    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5)))) + T((*R6).y, -(*R6).x);
    (*R6) = (*R3) + 2.0 * T(-(*R6).y, (*R6).x);
    // borrow p0 as temp
    p0 = -(C9QB * p1) - (C9QD * ((*R4) - (*R5))) + (p2) + (C9QH * p0);
    // keep the original R0 in p1 before R0 is overwritten with the total sum
    p1 = (*R0);
    (*R0) += (v0 + v1 + v2 + (*R4) + (*R5));
    // R4 and R5 are the last inputs still live, so they are written last
    (*R4) = (p1 + (C9QA * v1) + (C9QC * ((*R4) + (*R5))) - (C9QE * v2) - (C9QG * v0))
            + T(p0.y, -p0.x);
    (*R5) = (*R4) + 2.0 * T(-p0.y, p0.x);
}

// 9-point inverse butterfly, computed in place on R0..R8.
//
// Same dataflow as FwdRad9B1, with the sign of every T(y, -x) / T(-y, x)
// term flipped. Output registers are reused as temporaries, so the
// statement order below must not be changed.
template <typename T>
__device__ void InvRad9B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8)
{
    // p2 is always multiplied by C9QF, so do it once in p2
    // update R0 at the end since the original R0 is used by others
    T v0 = (*R1) + (*R8);
    T v1 = (*R2) + (*R7);
    T v2 = (*R3) + (*R6);

    T p0 = (*R1) - (*R8);
    T p1 = (*R2) - (*R7);
    T p2 = ((*R3) - (*R6)) * C9QF;

    // borrow R8 as temp (its original value is already captured in v0/p0)
    (*R8) = (C9QB * p0) + (C9QD * p1) + (p2) + (C9QH * ((*R4) - (*R5)));
    (*R1) = ((*R0) + (C9QA * v0) + (C9QC * v1) - (C9QE * v2) - (C9QG * ((*R4) + (*R5))))
            + T(-(*R8).y, (*R8).x);
    // R1 already holds "+ T(-y, x)"; adding twice the opposite term gives R8
    // the same base value with "- T(-y, x)".
    (*R8) = (*R1) + 2.0 * T((*R8).y, -(*R8).x);
    // borrow R7 as temp (its original value is already captured in v1/p1)
    (*R7) = -(C9QB * ((*R4) - (*R5))) + (C9QD * p0) - (p2) + (C9QH * p1);
    (*R2) = ((*R0) + (C9QA * ((*R4) + (*R5))) + (C9QC * v0) - (C9QE * v2) - (C9QG * v1))
            + T(-(*R7).y, (*R7).x);
    (*R7) = (*R2) + 2.0 * T((*R7).y, -(*R7).x);
    // borrow R6 as temp (its original value is already captured in v2/p2)
    (*R6) = C9QF * (p0 + ((*R4) - (*R5)) - p1);
    (*R3) = ((*R0) + v2 - C9QE * (v0 + v1 + ((*R4) + (*R5)))) + T(-(*R6).y, (*R6).x);
    (*R6) = (*R3) + 2.0 * T((*R6).y, -(*R6).x);
    // borrow p0 as temp
    p0 = -(C9QB * p1) - (C9QD * ((*R4) - (*R5))) + (p2) + (C9QH * p0);
    // keep the original R0 in p1 before R0 is overwritten with the total sum
    p1 = (*R0);
    (*R0) += (v0 + v1 + v2 + (*R4) + (*R5));
    // R4 and R5 are the last inputs still live, so they are written last
    (*R4) = (p1 + (C9QA * v1) + (C9QC * ((*R4) + (*R5))) - (C9QE * v2) - (C9QG * v0))
            + T(-p0.y, p0.x);
    (*R5) = (*R4) + 2.0 * T(p0.y, -p0.x);
}
)_PY_EMBED_"};
const char* radix_10_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// 10-point forward butterfly, computed in place on R0..R9.
//
// The even-indexed inputs (R0, R2, R4, R6, R8) and the odd-indexed inputs
// (R1, R3, R5, R7, R9) are each reduced to five intermediate values. The two
// groups are then combined with a sum (written to R0..R4) and a difference
// (written to R5..R9). Every input is read before any output is stored.
template <typename T>
__device__ void FwdRad10B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8, T* R9)
{
    using real_t = real_type_t<T>;

    // Even-indexed inputs, real parts.
    const real_t re0 = R0->x + R2->x + R4->x + R6->x + R8->x;
    const real_t re2 = (R0->x - C5QC * (R4->x + R6->x)) + C5QB * (R2->y - R8->y)
                       + C5QD * (R4->y - R6->y)
                       + C5QA * ((R2->x - R4->x) + (R8->x - R6->x));
    const real_t re8 = (R0->x - C5QC * (R4->x + R6->x)) - C5QB * (R2->y - R8->y)
                       - C5QD * (R4->y - R6->y)
                       + C5QA * ((R2->x - R4->x) + (R8->x - R6->x));
    const real_t re4 = (R0->x - C5QC * (R2->x + R8->x)) - C5QB * (R4->y - R6->y)
                       + C5QD * (R2->y - R8->y)
                       + C5QA * ((R4->x - R2->x) + (R6->x - R8->x));
    const real_t re6 = (R0->x - C5QC * (R2->x + R8->x)) + C5QB * (R4->y - R6->y)
                       - C5QD * (R2->y - R8->y)
                       + C5QA * ((R4->x - R2->x) + (R6->x - R8->x));

    // Even-indexed inputs, imaginary parts.
    const real_t im0 = R0->y + R2->y + R4->y + R6->y + R8->y;
    const real_t im2 = (R0->y - C5QC * (R4->y + R6->y)) - C5QB * (R2->x - R8->x)
                       - C5QD * (R4->x - R6->x)
                       + C5QA * ((R2->y - R4->y) + (R8->y - R6->y));
    const real_t im8 = (R0->y - C5QC * (R4->y + R6->y)) + C5QB * (R2->x - R8->x)
                       + C5QD * (R4->x - R6->x)
                       + C5QA * ((R2->y - R4->y) + (R8->y - R6->y));
    const real_t im4 = (R0->y - C5QC * (R2->y + R8->y)) + C5QB * (R4->x - R6->x)
                       - C5QD * (R2->x - R8->x)
                       + C5QA * ((R4->y - R2->y) + (R6->y - R8->y));
    const real_t im6 = (R0->y - C5QC * (R2->y + R8->y)) - C5QB * (R4->x - R6->x)
                       + C5QD * (R2->x - R8->x)
                       + C5QA * ((R4->y - R2->y) + (R6->y - R8->y));

    // Odd-indexed inputs, real parts.
    const real_t re1 = R1->x + R3->x + R5->x + R7->x + R9->x;
    const real_t re3 = (R1->x - C5QC * (R5->x + R7->x)) + C5QB * (R3->y - R9->y)
                       + C5QD * (R5->y - R7->y)
                       + C5QA * ((R3->x - R5->x) + (R9->x - R7->x));
    const real_t re9 = (R1->x - C5QC * (R5->x + R7->x)) - C5QB * (R3->y - R9->y)
                       - C5QD * (R5->y - R7->y)
                       + C5QA * ((R3->x - R5->x) + (R9->x - R7->x));
    const real_t re5 = (R1->x - C5QC * (R3->x + R9->x)) - C5QB * (R5->y - R7->y)
                       + C5QD * (R3->y - R9->y)
                       + C5QA * ((R5->x - R3->x) + (R7->x - R9->x));
    const real_t re7 = (R1->x - C5QC * (R3->x + R9->x)) + C5QB * (R5->y - R7->y)
                       - C5QD * (R3->y - R9->y)
                       + C5QA * ((R5->x - R3->x) + (R7->x - R9->x));

    // Odd-indexed inputs, imaginary parts.
    const real_t im1 = R1->y + R3->y + R5->y + R7->y + R9->y;
    const real_t im3 = (R1->y - C5QC * (R5->y + R7->y)) - C5QB * (R3->x - R9->x)
                       - C5QD * (R5->x - R7->x)
                       + C5QA * ((R3->y - R5->y) + (R9->y - R7->y));
    const real_t im9 = (R1->y - C5QC * (R5->y + R7->y)) + C5QB * (R3->x - R9->x)
                       + C5QD * (R5->x - R7->x)
                       + C5QA * ((R3->y - R5->y) + (R9->y - R7->y));
    const real_t im5 = (R1->y - C5QC * (R3->y + R9->y)) + C5QB * (R5->x - R7->x)
                       - C5QD * (R3->x - R9->x)
                       + C5QA * ((R5->y - R3->y) + (R7->y - R9->y));
    const real_t im7 = (R1->y - C5QC * (R3->y + R9->y)) - C5QB * (R5->x - R7->x)
                       + C5QD * (R3->x - R9->x)
                       + C5QA * ((R5->y - R3->y) + (R7->y - R9->y));

    // First half of the outputs: even group plus the weighted odd group.
    R0->x = re0 + re1;
    R1->x = re2 + (C5QE * re3 + C5QD * im3);
    R2->x = re4 + (C5QA * re5 + C5QB * im5);
    R3->x = re6 + (-C5QA * re7 + C5QB * im7);
    R4->x = re8 + (-C5QE * re9 + C5QD * im9);

    R0->y = im0 + im1;
    R1->y = im2 + (-C5QD * re3 + C5QE * im3);
    R2->y = im4 + (-C5QB * re5 + C5QA * im5);
    R3->y = im6 + (-C5QB * re7 - C5QA * im7);
    R4->y = im8 + (-C5QD * re9 - C5QE * im9);

    // Second half of the outputs: even group minus the weighted odd group.
    R5->x = re0 - re1;
    R6->x = re2 - (C5QE * re3 + C5QD * im3);
    R7->x = re4 - (C5QA * re5 + C5QB * im5);
    R8->x = re6 - (-C5QA * re7 + C5QB * im7);
    R9->x = re8 - (-C5QE * re9 + C5QD * im9);

    R5->y = im0 - im1;
    R6->y = im2 - (-C5QD * re3 + C5QE * im3);
    R7->y = im4 - (-C5QB * re5 + C5QA * im5);
    R8->y = im6 - (-C5QB * re7 - C5QA * im7);
    R9->y = im8 - (-C5QD * re9 - C5QE * im9);
}

// 10-point inverse butterfly, computed in place on R0..R9.
//
// The even-indexed inputs (R0, R2, R4, R6, R8) and the odd-indexed inputs
// (R1, R3, R5, R7, R9) are each reduced to five intermediate values. The two
// groups are then combined with a sum (written to R0..R4) and a difference
// (written to R5..R9). Every input is read before any output is stored.
template <typename T>
__device__ void InvRad10B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8, T* R9)
{
    using real_t = real_type_t<T>;

    // Even-indexed inputs, real parts.
    const real_t re0 = R0->x + R2->x + R4->x + R6->x + R8->x;
    const real_t re2 = (R0->x - C5QC * (R4->x + R6->x)) - C5QB * (R2->y - R8->y)
                       - C5QD * (R4->y - R6->y)
                       + C5QA * ((R2->x - R4->x) + (R8->x - R6->x));
    const real_t re8 = (R0->x - C5QC * (R4->x + R6->x)) + C5QB * (R2->y - R8->y)
                       + C5QD * (R4->y - R6->y)
                       + C5QA * ((R2->x - R4->x) + (R8->x - R6->x));
    const real_t re4 = (R0->x - C5QC * (R2->x + R8->x)) + C5QB * (R4->y - R6->y)
                       - C5QD * (R2->y - R8->y)
                       + C5QA * ((R4->x - R2->x) + (R6->x - R8->x));
    const real_t re6 = (R0->x - C5QC * (R2->x + R8->x)) - C5QB * (R4->y - R6->y)
                       + C5QD * (R2->y - R8->y)
                       + C5QA * ((R4->x - R2->x) + (R6->x - R8->x));

    // Even-indexed inputs, imaginary parts.
    const real_t im0 = R0->y + R2->y + R4->y + R6->y + R8->y;
    const real_t im2 = (R0->y - C5QC * (R4->y + R6->y)) + C5QB * (R2->x - R8->x)
                       + C5QD * (R4->x - R6->x)
                       + C5QA * ((R2->y - R4->y) + (R8->y - R6->y));
    const real_t im8 = (R0->y - C5QC * (R4->y + R6->y)) - C5QB * (R2->x - R8->x)
                       - C5QD * (R4->x - R6->x)
                       + C5QA * ((R2->y - R4->y) + (R8->y - R6->y));
    const real_t im4 = (R0->y - C5QC * (R2->y + R8->y)) - C5QB * (R4->x - R6->x)
                       + C5QD * (R2->x - R8->x)
                       + C5QA * ((R4->y - R2->y) + (R6->y - R8->y));
    const real_t im6 = (R0->y - C5QC * (R2->y + R8->y)) + C5QB * (R4->x - R6->x)
                       - C5QD * (R2->x - R8->x)
                       + C5QA * ((R4->y - R2->y) + (R6->y - R8->y));

    // Odd-indexed inputs, real parts.
    const real_t re1 = R1->x + R3->x + R5->x + R7->x + R9->x;
    const real_t re3 = (R1->x - C5QC * (R5->x + R7->x)) - C5QB * (R3->y - R9->y)
                       - C5QD * (R5->y - R7->y)
                       + C5QA * ((R3->x - R5->x) + (R9->x - R7->x));
    const real_t re9 = (R1->x - C5QC * (R5->x + R7->x)) + C5QB * (R3->y - R9->y)
                       + C5QD * (R5->y - R7->y)
                       + C5QA * ((R3->x - R5->x) + (R9->x - R7->x));
    const real_t re5 = (R1->x - C5QC * (R3->x + R9->x)) + C5QB * (R5->y - R7->y)
                       - C5QD * (R3->y - R9->y)
                       + C5QA * ((R5->x - R3->x) + (R7->x - R9->x));
    const real_t re7 = (R1->x - C5QC * (R3->x + R9->x)) - C5QB * (R5->y - R7->y)
                       + C5QD * (R3->y - R9->y)
                       + C5QA * ((R5->x - R3->x) + (R7->x - R9->x));

    // Odd-indexed inputs, imaginary parts.
    const real_t im1 = R1->y + R3->y + R5->y + R7->y + R9->y;
    const real_t im3 = (R1->y - C5QC * (R5->y + R7->y)) + C5QB * (R3->x - R9->x)
                       + C5QD * (R5->x - R7->x)
                       + C5QA * ((R3->y - R5->y) + (R9->y - R7->y));
    const real_t im9 = (R1->y - C5QC * (R5->y + R7->y)) - C5QB * (R3->x - R9->x)
                       - C5QD * (R5->x - R7->x)
                       + C5QA * ((R3->y - R5->y) + (R9->y - R7->y));
    const real_t im5 = (R1->y - C5QC * (R3->y + R9->y)) - C5QB * (R5->x - R7->x)
                       + C5QD * (R3->x - R9->x)
                       + C5QA * ((R5->y - R3->y) + (R7->y - R9->y));
    const real_t im7 = (R1->y - C5QC * (R3->y + R9->y)) + C5QB * (R5->x - R7->x)
                       - C5QD * (R3->x - R9->x)
                       + C5QA * ((R5->y - R3->y) + (R7->y - R9->y));

    // First half of the outputs: even group plus the weighted odd group.
    R0->x = re0 + re1;
    R1->x = re2 + (C5QE * re3 - C5QD * im3);
    R2->x = re4 + (C5QA * re5 - C5QB * im5);
    R3->x = re6 + (-C5QA * re7 - C5QB * im7);
    R4->x = re8 + (-C5QE * re9 - C5QD * im9);

    R0->y = im0 + im1;
    R1->y = im2 + (C5QD * re3 + C5QE * im3);
    R2->y = im4 + (C5QB * re5 + C5QA * im5);
    R3->y = im6 + (C5QB * re7 - C5QA * im7);
    R4->y = im8 + (C5QD * re9 - C5QE * im9);

    // Second half of the outputs: even group minus the weighted odd group.
    R5->x = re0 - re1;
    R6->x = re2 - (C5QE * re3 - C5QD * im3);
    R7->x = re4 - (C5QA * re5 - C5QB * im5);
    R8->x = re6 - (-C5QA * re7 - C5QB * im7);
    R9->x = re8 - (-C5QE * re9 - C5QD * im9);

    R5->y = im0 - im1;
    R6->y = im2 - (C5QD * re3 + C5QE * im3);
    R7->y = im4 - (C5QB * re5 + C5QA * im5);
    R8->y = im6 - (C5QB * re7 - C5QA * im7);
    R9->y = im8 - (C5QD * re9 - C5QE * im9);
}
)_PY_EMBED_"};
const char* radix_11_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// 11-point forward butterfly, computed in place on R0..R10.
//
// x0 is the plain sum of all eleven inputs. Outputs x1..x10 each start from
// R0 and then accumulate one contribution per input pair (Rj, R(11-j)),
// j = 1..5, using dp = Rj + R(11-j) and dm = Rj - R(11-j). For i = 1..5:
//   xi      gets  Q11i{i}j{j}R * dp  with cross terms (-I * dm.y, +I * dm.x)
//   x(11-i) gets  Q11i{i}j{j}R * dp  with cross terms (+I * dm.y, -I * dm.x)
// where I stands for Q11i{i}j{j}I. The Q11* constants are defined elsewhere
// (presumably the real and imaginary twiddle parts -- confirm against their
// definition). Results are written back only after all inputs were consumed.
template <typename T>
__device__ void
    FwdRad11B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8, T* R9, T* R10)
{
    T x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, dp, dm;

    // x0: sum of every input. x1..x10: seeded with R0.
    x0  = (*R0) + (*R1) + (*R2) + (*R3) + (*R4) + (*R5) + (*R6) + (*R7) + (*R8) + (*R9) + (*R10);
    x1  = (*R0);
    x2  = (*R0);
    x3  = (*R0);
    x4  = (*R0);
    x5  = (*R0);
    x6  = (*R0);
    x7  = (*R0);
    x8  = (*R0);
    x9  = (*R0);
    x10 = (*R0);
    // Input pair j = 1: (R1, R10).
    dp  = (*R1) + (*R10);
    dm  = (*R1) - (*R10);
    x1.x += Q11i1j1R * dp.x - Q11i1j1I * dm.y;
    x1.y += Q11i1j1R * dp.y + Q11i1j1I * dm.x;
    x10.x += Q11i1j1R * dp.x + Q11i1j1I * dm.y;
    x10.y += Q11i1j1R * dp.y - Q11i1j1I * dm.x;
    x2.x += Q11i2j1R * dp.x - Q11i2j1I * dm.y;
    x2.y += Q11i2j1R * dp.y + Q11i2j1I * dm.x;
    x9.x += Q11i2j1R * dp.x + Q11i2j1I * dm.y;
    x9.y += Q11i2j1R * dp.y - Q11i2j1I * dm.x;
    x3.x += Q11i3j1R * dp.x - Q11i3j1I * dm.y;
    x3.y += Q11i3j1R * dp.y + Q11i3j1I * dm.x;
    x8.x += Q11i3j1R * dp.x + Q11i3j1I * dm.y;
    x8.y += Q11i3j1R * dp.y - Q11i3j1I * dm.x;
    x4.x += Q11i4j1R * dp.x - Q11i4j1I * dm.y;
    x4.y += Q11i4j1R * dp.y + Q11i4j1I * dm.x;
    x7.x += Q11i4j1R * dp.x + Q11i4j1I * dm.y;
    x7.y += Q11i4j1R * dp.y - Q11i4j1I * dm.x;
    x5.x += Q11i5j1R * dp.x - Q11i5j1I * dm.y;
    x5.y += Q11i5j1R * dp.y + Q11i5j1I * dm.x;
    x6.x += Q11i5j1R * dp.x + Q11i5j1I * dm.y;
    x6.y += Q11i5j1R * dp.y - Q11i5j1I * dm.x;
    // Input pair j = 2: (R2, R9).
    dp = (*R2) + (*R9);
    dm = (*R2) - (*R9);
    x1.x += Q11i1j2R * dp.x - Q11i1j2I * dm.y;
    x1.y += Q11i1j2R * dp.y + Q11i1j2I * dm.x;
    x10.x += Q11i1j2R * dp.x + Q11i1j2I * dm.y;
    x10.y += Q11i1j2R * dp.y - Q11i1j2I * dm.x;
    x2.x += Q11i2j2R * dp.x - Q11i2j2I * dm.y;
    x2.y += Q11i2j2R * dp.y + Q11i2j2I * dm.x;
    x9.x += Q11i2j2R * dp.x + Q11i2j2I * dm.y;
    x9.y += Q11i2j2R * dp.y - Q11i2j2I * dm.x;
    x3.x += Q11i3j2R * dp.x - Q11i3j2I * dm.y;
    x3.y += Q11i3j2R * dp.y + Q11i3j2I * dm.x;
    x8.x += Q11i3j2R * dp.x + Q11i3j2I * dm.y;
    x8.y += Q11i3j2R * dp.y - Q11i3j2I * dm.x;
    x4.x += Q11i4j2R * dp.x - Q11i4j2I * dm.y;
    x4.y += Q11i4j2R * dp.y + Q11i4j2I * dm.x;
    x7.x += Q11i4j2R * dp.x + Q11i4j2I * dm.y;
    x7.y += Q11i4j2R * dp.y - Q11i4j2I * dm.x;
    x5.x += Q11i5j2R * dp.x - Q11i5j2I * dm.y;
    x5.y += Q11i5j2R * dp.y + Q11i5j2I * dm.x;
    x6.x += Q11i5j2R * dp.x + Q11i5j2I * dm.y;
    x6.y += Q11i5j2R * dp.y - Q11i5j2I * dm.x;
    // Input pair j = 3: (R3, R8).
    dp = (*R3) + (*R8);
    dm = (*R3) - (*R8);
    x1.x += Q11i1j3R * dp.x - Q11i1j3I * dm.y;
    x1.y += Q11i1j3R * dp.y + Q11i1j3I * dm.x;
    x10.x += Q11i1j3R * dp.x + Q11i1j3I * dm.y;
    x10.y += Q11i1j3R * dp.y - Q11i1j3I * dm.x;
    x2.x += Q11i2j3R * dp.x - Q11i2j3I * dm.y;
    x2.y += Q11i2j3R * dp.y + Q11i2j3I * dm.x;
    x9.x += Q11i2j3R * dp.x + Q11i2j3I * dm.y;
    x9.y += Q11i2j3R * dp.y - Q11i2j3I * dm.x;
    x3.x += Q11i3j3R * dp.x - Q11i3j3I * dm.y;
    x3.y += Q11i3j3R * dp.y + Q11i3j3I * dm.x;
    x8.x += Q11i3j3R * dp.x + Q11i3j3I * dm.y;
    x8.y += Q11i3j3R * dp.y - Q11i3j3I * dm.x;
    x4.x += Q11i4j3R * dp.x - Q11i4j3I * dm.y;
    x4.y += Q11i4j3R * dp.y + Q11i4j3I * dm.x;
    x7.x += Q11i4j3R * dp.x + Q11i4j3I * dm.y;
    x7.y += Q11i4j3R * dp.y - Q11i4j3I * dm.x;
    x5.x += Q11i5j3R * dp.x - Q11i5j3I * dm.y;
    x5.y += Q11i5j3R * dp.y + Q11i5j3I * dm.x;
    x6.x += Q11i5j3R * dp.x + Q11i5j3I * dm.y;
    x6.y += Q11i5j3R * dp.y - Q11i5j3I * dm.x;
    // Input pair j = 4: (R4, R7).
    dp = (*R4) + (*R7);
    dm = (*R4) - (*R7);
    x1.x += Q11i1j4R * dp.x - Q11i1j4I * dm.y;
    x1.y += Q11i1j4R * dp.y + Q11i1j4I * dm.x;
    x10.x += Q11i1j4R * dp.x + Q11i1j4I * dm.y;
    x10.y += Q11i1j4R * dp.y - Q11i1j4I * dm.x;
    x2.x += Q11i2j4R * dp.x - Q11i2j4I * dm.y;
    x2.y += Q11i2j4R * dp.y + Q11i2j4I * dm.x;
    x9.x += Q11i2j4R * dp.x + Q11i2j4I * dm.y;
    x9.y += Q11i2j4R * dp.y - Q11i2j4I * dm.x;
    x3.x += Q11i3j4R * dp.x - Q11i3j4I * dm.y;
    x3.y += Q11i3j4R * dp.y + Q11i3j4I * dm.x;
    x8.x += Q11i3j4R * dp.x + Q11i3j4I * dm.y;
    x8.y += Q11i3j4R * dp.y - Q11i3j4I * dm.x;
    x4.x += Q11i4j4R * dp.x - Q11i4j4I * dm.y;
    x4.y += Q11i4j4R * dp.y + Q11i4j4I * dm.x;
    x7.x += Q11i4j4R * dp.x + Q11i4j4I * dm.y;
    x7.y += Q11i4j4R * dp.y - Q11i4j4I * dm.x;
    x5.x += Q11i5j4R * dp.x - Q11i5j4I * dm.y;
    x5.y += Q11i5j4R * dp.y + Q11i5j4I * dm.x;
    x6.x += Q11i5j4R * dp.x + Q11i5j4I * dm.y;
    x6.y += Q11i5j4R * dp.y - Q11i5j4I * dm.x;
    // Input pair j = 5: (R5, R6).
    dp = (*R5) + (*R6);
    dm = (*R5) - (*R6);
    x1.x += Q11i1j5R * dp.x - Q11i1j5I * dm.y;
    x1.y += Q11i1j5R * dp.y + Q11i1j5I * dm.x;
    x10.x += Q11i1j5R * dp.x + Q11i1j5I * dm.y;
    x10.y += Q11i1j5R * dp.y - Q11i1j5I * dm.x;
    x2.x += Q11i2j5R * dp.x - Q11i2j5I * dm.y;
    x2.y += Q11i2j5R * dp.y + Q11i2j5I * dm.x;
    x9.x += Q11i2j5R * dp.x + Q11i2j5I * dm.y;
    x9.y += Q11i2j5R * dp.y - Q11i2j5I * dm.x;
    x3.x += Q11i3j5R * dp.x - Q11i3j5I * dm.y;
    x3.y += Q11i3j5R * dp.y + Q11i3j5I * dm.x;
    x8.x += Q11i3j5R * dp.x + Q11i3j5I * dm.y;
    x8.y += Q11i3j5R * dp.y - Q11i3j5I * dm.x;
    x4.x += Q11i4j5R * dp.x - Q11i4j5I * dm.y;
    x4.y += Q11i4j5R * dp.y + Q11i4j5I * dm.x;
    x7.x += Q11i4j5R * dp.x + Q11i4j5I * dm.y;
    x7.y += Q11i4j5R * dp.y - Q11i4j5I * dm.x;
    x5.x += Q11i5j5R * dp.x - Q11i5j5I * dm.y;
    x5.y += Q11i5j5R * dp.y + Q11i5j5I * dm.x;
    x6.x += Q11i5j5R * dp.x + Q11i5j5I * dm.y;
    x6.y += Q11i5j5R * dp.y - Q11i5j5I * dm.x;
    // Write the accumulated results back over the inputs.
    (*R0)  = x0;
    (*R1)  = x1;
    (*R2)  = x2;
    (*R3)  = x3;
    (*R4)  = x4;
    (*R5)  = x5;
    (*R6)  = x6;
    (*R7)  = x7;
    (*R8)  = x8;
    (*R9)  = x9;
    (*R10) = x10;
}

// 11-point inverse butterfly, computed in place on R0..R10.
//
// x0 is the plain sum of all eleven inputs. Outputs x1..x10 each start from
// R0 and then accumulate one contribution per input pair (Rj, R(11-j)),
// j = 1..5, using dp = Rj + R(11-j) and dm = Rj - R(11-j). For i = 1..5:
//   xi      gets  Q11i{i}j{j}R * dp  with cross terms (+I * dm.y, -I * dm.x)
//   x(11-i) gets  Q11i{i}j{j}R * dp  with cross terms (-I * dm.y, +I * dm.x)
// where I stands for Q11i{i}j{j}I; the cross-term signs are the opposite of
// those in FwdRad11B1. The Q11* constants are defined elsewhere (presumably
// the real and imaginary twiddle parts -- confirm against their definition).
// Results are written back only after all inputs were consumed.
template <typename T>
__device__ void
    InvRad11B1(T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8, T* R9, T* R10)
{
    T x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, dp, dm;

    // x0: sum of every input. x1..x10: seeded with R0.
    x0  = (*R0) + (*R1) + (*R2) + (*R3) + (*R4) + (*R5) + (*R6) + (*R7) + (*R8) + (*R9) + (*R10);
    x1  = (*R0);
    x2  = (*R0);
    x3  = (*R0);
    x4  = (*R0);
    x5  = (*R0);
    x6  = (*R0);
    x7  = (*R0);
    x8  = (*R0);
    x9  = (*R0);
    x10 = (*R0);
    // Input pair j = 1: (R1, R10).
    dp  = (*R1) + (*R10);
    dm  = (*R1) - (*R10);
    x1.x += Q11i1j1R * dp.x + Q11i1j1I * dm.y;
    x1.y += Q11i1j1R * dp.y - Q11i1j1I * dm.x;
    x10.x += Q11i1j1R * dp.x - Q11i1j1I * dm.y;
    x10.y += Q11i1j1R * dp.y + Q11i1j1I * dm.x;
    x2.x += Q11i2j1R * dp.x + Q11i2j1I * dm.y;
    x2.y += Q11i2j1R * dp.y - Q11i2j1I * dm.x;
    x9.x += Q11i2j1R * dp.x - Q11i2j1I * dm.y;
    x9.y += Q11i2j1R * dp.y + Q11i2j1I * dm.x;
    x3.x += Q11i3j1R * dp.x + Q11i3j1I * dm.y;
    x3.y += Q11i3j1R * dp.y - Q11i3j1I * dm.x;
    x8.x += Q11i3j1R * dp.x - Q11i3j1I * dm.y;
    x8.y += Q11i3j1R * dp.y + Q11i3j1I * dm.x;
    x4.x += Q11i4j1R * dp.x + Q11i4j1I * dm.y;
    x4.y += Q11i4j1R * dp.y - Q11i4j1I * dm.x;
    x7.x += Q11i4j1R * dp.x - Q11i4j1I * dm.y;
    x7.y += Q11i4j1R * dp.y + Q11i4j1I * dm.x;
    x5.x += Q11i5j1R * dp.x + Q11i5j1I * dm.y;
    x5.y += Q11i5j1R * dp.y - Q11i5j1I * dm.x;
    x6.x += Q11i5j1R * dp.x - Q11i5j1I * dm.y;
    x6.y += Q11i5j1R * dp.y + Q11i5j1I * dm.x;
    // Input pair j = 2: (R2, R9).
    dp = (*R2) + (*R9);
    dm = (*R2) - (*R9);
    x1.x += Q11i1j2R * dp.x + Q11i1j2I * dm.y;
    x1.y += Q11i1j2R * dp.y - Q11i1j2I * dm.x;
    x10.x += Q11i1j2R * dp.x - Q11i1j2I * dm.y;
    x10.y += Q11i1j2R * dp.y + Q11i1j2I * dm.x;
    x2.x += Q11i2j2R * dp.x + Q11i2j2I * dm.y;
    x2.y += Q11i2j2R * dp.y - Q11i2j2I * dm.x;
    x9.x += Q11i2j2R * dp.x - Q11i2j2I * dm.y;
    x9.y += Q11i2j2R * dp.y + Q11i2j2I * dm.x;
    x3.x += Q11i3j2R * dp.x + Q11i3j2I * dm.y;
    x3.y += Q11i3j2R * dp.y - Q11i3j2I * dm.x;
    x8.x += Q11i3j2R * dp.x - Q11i3j2I * dm.y;
    x8.y += Q11i3j2R * dp.y + Q11i3j2I * dm.x;
    x4.x += Q11i4j2R * dp.x + Q11i4j2I * dm.y;
    x4.y += Q11i4j2R * dp.y - Q11i4j2I * dm.x;
    x7.x += Q11i4j2R * dp.x - Q11i4j2I * dm.y;
    x7.y += Q11i4j2R * dp.y + Q11i4j2I * dm.x;
    x5.x += Q11i5j2R * dp.x + Q11i5j2I * dm.y;
    x5.y += Q11i5j2R * dp.y - Q11i5j2I * dm.x;
    x6.x += Q11i5j2R * dp.x - Q11i5j2I * dm.y;
    x6.y += Q11i5j2R * dp.y + Q11i5j2I * dm.x;
    // Input pair j = 3: (R3, R8).
    dp = (*R3) + (*R8);
    dm = (*R3) - (*R8);
    x1.x += Q11i1j3R * dp.x + Q11i1j3I * dm.y;
    x1.y += Q11i1j3R * dp.y - Q11i1j3I * dm.x;
    x10.x += Q11i1j3R * dp.x - Q11i1j3I * dm.y;
    x10.y += Q11i1j3R * dp.y + Q11i1j3I * dm.x;
    x2.x += Q11i2j3R * dp.x + Q11i2j3I * dm.y;
    x2.y += Q11i2j3R * dp.y - Q11i2j3I * dm.x;
    x9.x += Q11i2j3R * dp.x - Q11i2j3I * dm.y;
    x9.y += Q11i2j3R * dp.y + Q11i2j3I * dm.x;
    x3.x += Q11i3j3R * dp.x + Q11i3j3I * dm.y;
    x3.y += Q11i3j3R * dp.y - Q11i3j3I * dm.x;
    x8.x += Q11i3j3R * dp.x - Q11i3j3I * dm.y;
    x8.y += Q11i3j3R * dp.y + Q11i3j3I * dm.x;
    x4.x += Q11i4j3R * dp.x + Q11i4j3I * dm.y;
    x4.y += Q11i4j3R * dp.y - Q11i4j3I * dm.x;
    x7.x += Q11i4j3R * dp.x - Q11i4j3I * dm.y;
    x7.y += Q11i4j3R * dp.y + Q11i4j3I * dm.x;
    x5.x += Q11i5j3R * dp.x + Q11i5j3I * dm.y;
    x5.y += Q11i5j3R * dp.y - Q11i5j3I * dm.x;
    x6.x += Q11i5j3R * dp.x - Q11i5j3I * dm.y;
    x6.y += Q11i5j3R * dp.y + Q11i5j3I * dm.x;
    // Input pair j = 4: (R4, R7).
    dp = (*R4) + (*R7);
    dm = (*R4) - (*R7);
    x1.x += Q11i1j4R * dp.x + Q11i1j4I * dm.y;
    x1.y += Q11i1j4R * dp.y - Q11i1j4I * dm.x;
    x10.x += Q11i1j4R * dp.x - Q11i1j4I * dm.y;
    x10.y += Q11i1j4R * dp.y + Q11i1j4I * dm.x;
    x2.x += Q11i2j4R * dp.x + Q11i2j4I * dm.y;
    x2.y += Q11i2j4R * dp.y - Q11i2j4I * dm.x;
    x9.x += Q11i2j4R * dp.x - Q11i2j4I * dm.y;
    x9.y += Q11i2j4R * dp.y + Q11i2j4I * dm.x;
    x3.x += Q11i3j4R * dp.x + Q11i3j4I * dm.y;
    x3.y += Q11i3j4R * dp.y - Q11i3j4I * dm.x;
    x8.x += Q11i3j4R * dp.x - Q11i3j4I * dm.y;
    x8.y += Q11i3j4R * dp.y + Q11i3j4I * dm.x;
    x4.x += Q11i4j4R * dp.x + Q11i4j4I * dm.y;
    x4.y += Q11i4j4R * dp.y - Q11i4j4I * dm.x;
    x7.x += Q11i4j4R * dp.x - Q11i4j4I * dm.y;
    x7.y += Q11i4j4R * dp.y + Q11i4j4I * dm.x;
    x5.x += Q11i5j4R * dp.x + Q11i5j4I * dm.y;
    x5.y += Q11i5j4R * dp.y - Q11i5j4I * dm.x;
    x6.x += Q11i5j4R * dp.x - Q11i5j4I * dm.y;
    x6.y += Q11i5j4R * dp.y + Q11i5j4I * dm.x;
    // Input pair j = 5: (R5, R6).
    dp = (*R5) + (*R6);
    dm = (*R5) - (*R6);
    x1.x += Q11i1j5R * dp.x + Q11i1j5I * dm.y;
    x1.y += Q11i1j5R * dp.y - Q11i1j5I * dm.x;
    x10.x += Q11i1j5R * dp.x - Q11i1j5I * dm.y;
    x10.y += Q11i1j5R * dp.y + Q11i1j5I * dm.x;
    x2.x += Q11i2j5R * dp.x + Q11i2j5I * dm.y;
    x2.y += Q11i2j5R * dp.y - Q11i2j5I * dm.x;
    x9.x += Q11i2j5R * dp.x - Q11i2j5I * dm.y;
    x9.y += Q11i2j5R * dp.y + Q11i2j5I * dm.x;
    x3.x += Q11i3j5R * dp.x + Q11i3j5I * dm.y;
    x3.y += Q11i3j5R * dp.y - Q11i3j5I * dm.x;
    x8.x += Q11i3j5R * dp.x - Q11i3j5I * dm.y;
    x8.y += Q11i3j5R * dp.y + Q11i3j5I * dm.x;
    x4.x += Q11i4j5R * dp.x + Q11i4j5I * dm.y;
    x4.y += Q11i4j5R * dp.y - Q11i4j5I * dm.x;
    x7.x += Q11i4j5R * dp.x - Q11i4j5I * dm.y;
    x7.y += Q11i4j5R * dp.y + Q11i4j5I * dm.x;
    x5.x += Q11i5j5R * dp.x + Q11i5j5I * dm.y;
    x5.y += Q11i5j5R * dp.y - Q11i5j5I * dm.x;
    x6.x += Q11i5j5R * dp.x - Q11i5j5I * dm.y;
    x6.y += Q11i5j5R * dp.y + Q11i5j5I * dm.x;
    // Write the accumulated results back over the inputs.
    (*R0)  = x0;
    (*R1)  = x1;
    (*R2)  = x2;
    (*R3)  = x3;
    (*R4)  = x4;
    (*R5)  = x5;
    (*R6)  = x6;
    (*R7)  = x7;
    (*R8)  = x8;
    (*R9)  = x9;
    (*R10) = x10;
}
)_PY_EMBED_"};
const char* radix_13_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// In-place 13-point butterfly; the "Fwd" in the name suggests the forward
// transform direction (the inverse counterpart is InvRad13B1).
//
// R0..R12 each point to one value that is read and then overwritten:
//   - *R0 becomes the plain sum of all thirteen inputs.
//   - Every other output starts from the original *R0 and accumulates one
//     term per mirrored input pair (Rj, R(13-j)), j = 1..6.  A pair enters
//     through its sum `dp` and difference `dm`, weighted by the constants
//     Q13i<k>j<j>R and Q13i<k>j<j>I for k = 1..6.  Outputs k and 13-k share
//     the same constants and differ only in the sign of the "I" term.
//
// T must expose .x and .y members and support operator+ / operator-.
// The Q13i*j*R / Q13i*j*I constants are defined outside this function;
// presumably they are the real / imaginary twiddle parts -- confirm there.

// Adds the contribution of the current (dp, dm) pair, i.e. input column COL,
// to output ROW and to its mirror output MIRROR (= 13 - ROW).
#define FWD_RAD13_ROW(ROW, MIRROR, COL)                                       \
    x##ROW.x += Q13i##ROW##j##COL##R * dp.x - Q13i##ROW##j##COL##I * dm.y;    \
    x##ROW.y += Q13i##ROW##j##COL##R * dp.y + Q13i##ROW##j##COL##I * dm.x;    \
    x##MIRROR.x += Q13i##ROW##j##COL##R * dp.x + Q13i##ROW##j##COL##I * dm.y; \
    x##MIRROR.y += Q13i##ROW##j##COL##R * dp.y - Q13i##ROW##j##COL##I * dm.x;

// Loads the mirrored input pair (R<COL>, R<PARTNER>) into dp / dm and folds
// it into all twelve non-DC outputs, rows 1..6 in ascending order.
#define FWD_RAD13_COLUMN(COL, PARTNER) \
    dp = (*R##COL) + (*R##PARTNER);    \
    dm = (*R##COL) - (*R##PARTNER);    \
    FWD_RAD13_ROW(1, 12, COL)          \
    FWD_RAD13_ROW(2, 11, COL)          \
    FWD_RAD13_ROW(3, 10, COL)          \
    FWD_RAD13_ROW(4, 9, COL)           \
    FWD_RAD13_ROW(5, 8, COL)           \
    FWD_RAD13_ROW(6, 7, COL)

template <typename T>
__device__ void FwdRad13B1(
    T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8, T* R9, T* R10, T* R11, T* R12)
{
    // Output accumulators, kept separate from the inputs because every
    // output depends on every input.
    T x0, x1, x2, x3, x4, x5, x6;
    T x7, x8, x9, x10, x11, x12;
    // Sum and difference of the input pair currently being processed.
    T dp, dm;

    // Output 0: straight sum, evaluated left to right from R0 to R12.
    x0 = (*R0) + (*R1) + (*R2) + (*R3) + (*R4) + (*R5) + (*R6) + (*R7) + (*R8) + (*R9) + (*R10)
         + (*R11) + (*R12);

    // Outputs 1..12 are all seeded with the original first input.
    x1  = (*R0);
    x2  = (*R0);
    x3  = (*R0);
    x4  = (*R0);
    x5  = (*R0);
    x6  = (*R0);
    x7  = (*R0);
    x8  = (*R0);
    x9  = (*R0);
    x10 = (*R0);
    x11 = (*R0);
    x12 = (*R0);

    // One column per mirrored input pair.
    FWD_RAD13_COLUMN(1, 12)
    FWD_RAD13_COLUMN(2, 11)
    FWD_RAD13_COLUMN(3, 10)
    FWD_RAD13_COLUMN(4, 9)
    FWD_RAD13_COLUMN(5, 8)
    FWD_RAD13_COLUMN(6, 7)

    // Write the results back over the inputs.
    (*R0)  = x0;
    (*R1)  = x1;
    (*R2)  = x2;
    (*R3)  = x3;
    (*R4)  = x4;
    (*R5)  = x5;
    (*R6)  = x6;
    (*R7)  = x7;
    (*R8)  = x8;
    (*R9)  = x9;
    (*R10) = x10;
    (*R11) = x11;
    (*R12) = x12;
}

#undef FWD_RAD13_COLUMN
#undef FWD_RAD13_ROW

// In-place 13-point butterfly; the "Inv" in the name suggests the inverse
// transform direction (the forward counterpart is FwdRad13B1).
//
// R0..R12 each point to one value that is read and then overwritten:
//   - *R0 becomes the plain sum of all thirteen inputs.
//   - Every other output starts from the original *R0 and accumulates one
//     term per mirrored input pair (Rj, R(13-j)), j = 1..6.  A pair enters
//     through its sum `dp` and difference `dm`, weighted by the constants
//     Q13i<k>j<j>R and Q13i<k>j<j>I for k = 1..6.  Outputs k and 13-k share
//     the same constants and differ only in the sign of the "I" term; those
//     signs are the opposite of the ones used in FwdRad13B1.
//
// T must expose .x and .y members and support operator+ / operator-.
// The Q13i*j*R / Q13i*j*I constants are defined outside this function;
// presumably they are the real / imaginary twiddle parts -- confirm there.

// Adds the contribution of the current (dp, dm) pair, i.e. input column COL,
// to output ROW and to its mirror output MIRROR (= 13 - ROW).
#define INV_RAD13_ROW(ROW, MIRROR, COL)                                       \
    x##ROW.x += Q13i##ROW##j##COL##R * dp.x + Q13i##ROW##j##COL##I * dm.y;    \
    x##ROW.y += Q13i##ROW##j##COL##R * dp.y - Q13i##ROW##j##COL##I * dm.x;    \
    x##MIRROR.x += Q13i##ROW##j##COL##R * dp.x - Q13i##ROW##j##COL##I * dm.y; \
    x##MIRROR.y += Q13i##ROW##j##COL##R * dp.y + Q13i##ROW##j##COL##I * dm.x;

// Loads the mirrored input pair (R<COL>, R<PARTNER>) into dp / dm and folds
// it into all twelve non-DC outputs, rows 1..6 in ascending order.
#define INV_RAD13_COLUMN(COL, PARTNER) \
    dp = (*R##COL) + (*R##PARTNER);    \
    dm = (*R##COL) - (*R##PARTNER);    \
    INV_RAD13_ROW(1, 12, COL)          \
    INV_RAD13_ROW(2, 11, COL)          \
    INV_RAD13_ROW(3, 10, COL)          \
    INV_RAD13_ROW(4, 9, COL)           \
    INV_RAD13_ROW(5, 8, COL)           \
    INV_RAD13_ROW(6, 7, COL)

template <typename T>
__device__ void InvRad13B1(
    T* R0, T* R1, T* R2, T* R3, T* R4, T* R5, T* R6, T* R7, T* R8, T* R9, T* R10, T* R11, T* R12)
{
    // Output accumulators, kept separate from the inputs because every
    // output depends on every input.
    T x0, x1, x2, x3, x4, x5, x6;
    T x7, x8, x9, x10, x11, x12;
    // Sum and difference of the input pair currently being processed.
    T dp, dm;

    // Output 0: straight sum, evaluated left to right from R0 to R12.
    x0 = (*R0) + (*R1) + (*R2) + (*R3) + (*R4) + (*R5) + (*R6) + (*R7) + (*R8) + (*R9) + (*R10)
         + (*R11) + (*R12);

    // Outputs 1..12 are all seeded with the original first input.
    x1  = (*R0);
    x2  = (*R0);
    x3  = (*R0);
    x4  = (*R0);
    x5  = (*R0);
    x6  = (*R0);
    x7  = (*R0);
    x8  = (*R0);
    x9  = (*R0);
    x10 = (*R0);
    x11 = (*R0);
    x12 = (*R0);

    // One column per mirrored input pair.
    INV_RAD13_COLUMN(1, 12)
    INV_RAD13_COLUMN(2, 11)
    INV_RAD13_COLUMN(3, 10)
    INV_RAD13_COLUMN(4, 9)
    INV_RAD13_COLUMN(5, 8)
    INV_RAD13_COLUMN(6, 7)

    // Write the results back over the inputs.
    (*R0)  = x0;
    (*R1)  = x1;
    (*R2)  = x2;
    (*R3)  = x3;
    (*R4)  = x4;
    (*R5)  = x5;
    (*R6)  = x6;
    (*R7)  = x7;
    (*R8)  = x8;
    (*R9)  = x9;
    (*R10) = x10;
    (*R11) = x11;
    (*R12) = x12;
}

#undef INV_RAD13_COLUMN
#undef INV_RAD13_ROW
)_PY_EMBED_"};
const char* radix_16_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

// In-place 16-point butterfly; the "Fwd" in the name suggests the forward
// transform direction (the inverse counterpart is InvRad16B1).
//
// The sixteen pointers are declared in 4-bit bit-reversed order
// (R0, R8, R4, R12, ...), so the n-th argument is bound to the name
// R<bitrev4(n)>.  The body runs four stages over pairs whose names are
// 1, 2, 4 and 8 apart, then swaps the values of bit-reversed name pairs.
//
// Every pair (lo, hi) is updated with the same two-statement shape:
//     hi = lo +/- <weighted old hi>;
//     lo = 2.0 * lo - hi;          // i.e. lo -/+ <weighted old hi>
// so the complementary combination is recovered from the first result
// without a temporary.
//
// T(-v.y, v.x) is v turned by +90 degrees and T(v.y, -v.x) is v turned by
// -90 degrees, assuming T(a, b) builds a value with x = a and y = b and that
// x / y are the real / imaginary parts -- TODO confirm for the T in use.
//
// T needs .x / .y members, a two-argument constructor, operator+ / operator-,
// and scaling by 2.0 and by the constants C8Q, C16A and C16B, which are
// defined outside this function.
template <typename T>
__device__ void FwdRad16B1(T* R0,
                           T* R8,
                           T* R4,
                           T* R12,
                           T* R2,
                           T* R10,
                           T* R6,
                           T* R14,
                           T* R1,
                           T* R9,
                           T* R5,
                           T* R13,
                           T* R3,
                           T* R11,
                           T* R7,
                           T* R15)
{

    // Scratch value used only by the final swaps.
    T res;

    // Stage 1: neighbouring names (distance 1). hi = lo - hi, lo = lo + old hi.
    (*R1)  = (*R0) - (*R1);
    (*R0)  = 2.0 * (*R0) - (*R1);
    (*R3)  = (*R2) - (*R3);
    (*R2)  = 2.0 * (*R2) - (*R3);
    (*R5)  = (*R4) - (*R5);
    (*R4)  = 2.0 * (*R4) - (*R5);
    (*R7)  = (*R6) - (*R7);
    (*R6)  = 2.0 * (*R6) - (*R7);
    (*R9)  = (*R8) - (*R9);
    (*R8)  = 2.0 * (*R8) - (*R9);
    (*R11) = (*R10) - (*R11);
    (*R10) = 2.0 * (*R10) - (*R11);
    (*R13) = (*R12) - (*R13);
    (*R12) = 2.0 * (*R12) - (*R13);
    (*R15) = (*R14) - (*R15);
    (*R14) = 2.0 * (*R14) - (*R15);

    // Stage 2: names 2 apart. Even-named pairs use a plain difference,
    // odd-named pairs add the old hi value turned via T(-y, x).
    (*R2)  = (*R0) - (*R2);
    (*R0)  = 2.0 * (*R0) - (*R2);
    (*R3)  = (*R1) + T(-(*R3).y, (*R3).x);
    (*R1)  = 2.0 * (*R1) - (*R3);
    (*R6)  = (*R4) - (*R6);
    (*R4)  = 2.0 * (*R4) - (*R6);
    (*R7)  = (*R5) + T(-(*R7).y, (*R7).x);
    (*R5)  = 2.0 * (*R5) - (*R7);
    (*R10) = (*R8) - (*R10);
    (*R8)  = 2.0 * (*R8) - (*R10);
    (*R11) = (*R9) + T(-(*R11).y, (*R11).x);
    (*R9)  = 2.0 * (*R9) - (*R11);
    (*R14) = (*R12) - (*R14);
    (*R12) = 2.0 * (*R12) - (*R14);
    (*R15) = (*R13) + T(-(*R15).y, (*R15).x);
    (*R13) = 2.0 * (*R13) - (*R15);

    // Stage 3: names 4 apart. The old hi value is weighted by C8Q both
    // directly and through its T(y, -x) turn (except the plain / pure-turn
    // pairs).
    (*R4)  = (*R0) - (*R4);
    (*R0)  = 2.0 * (*R0) - (*R4);
    (*R5)  = ((*R1) - C8Q * (*R5)) - C8Q * T((*R5).y, -(*R5).x);
    (*R1)  = 2.0 * (*R1) - (*R5);
    (*R6)  = (*R2) + T(-(*R6).y, (*R6).x);
    (*R2)  = 2.0 * (*R2) - (*R6);
    (*R7)  = ((*R3) + C8Q * (*R7)) - C8Q * T((*R7).y, -(*R7).x);
    (*R3)  = 2.0 * (*R3) - (*R7);
    (*R12) = (*R8) - (*R12);
    (*R8)  = 2.0 * (*R8) - (*R12);
    (*R13) = ((*R9) - C8Q * (*R13)) - C8Q * T((*R13).y, -(*R13).x);
    (*R9)  = 2.0 * (*R9) - (*R13);
    (*R14) = (*R10) + T(-(*R14).y, (*R14).x);
    (*R10) = 2.0 * (*R10) - (*R14);
    (*R15) = ((*R11) + C8Q * (*R15)) - C8Q * T((*R15).y, -(*R15).x);
    (*R11) = 2.0 * (*R11) - (*R15);

    // Stage 4: names 8 apart, weighted by C16A / C16B / C8Q.
    (*R8) = (*R0) - (*R8);
    (*R0) = 2.0 * (*R0) - (*R8);
    (*R9) = ((*R1) - C16A * (*R9)) - C16B * T((*R9).y, -(*R9).x);
    (*R1) = 2.0 * (*R1) - (*R9);

    (*R10) = ((*R2) - C8Q * (*R10)) - C8Q * T((*R10).y, -(*R10).x);
    (*R2)  = 2.0 * (*R2) - (*R10);
    (*R11) = ((*R3) - C16B * (*R11)) - C16A * T((*R11).y, -(*R11).x);
    (*R3)  = 2.0 * (*R3) - (*R11);

    (*R12) = (*R4) + T(-(*R12).y, (*R12).x);
    (*R4)  = 2.0 * (*R4) - (*R12);
    (*R13) = ((*R5) + C16B * (*R13)) - C16A * T((*R13).y, -(*R13).x);
    (*R5)  = 2.0 * (*R5) - (*R13);

    (*R14) = ((*R6) + C8Q * (*R14)) - C8Q * T((*R14).y, -(*R14).x);
    (*R6)  = 2.0 * (*R6) - (*R14);
    (*R15) = ((*R7) + C16A * (*R15)) - C16B * T((*R15).y, -(*R15).x);
    (*R7)  = 2.0 * (*R7) - (*R15);

    // Swap every name with its 4-bit bit-reversed partner
    // (1<->8, 2<->4, 3<->12, 5<->10, 7<->14, 11<->13); names 0, 6, 9 and 15
    // are their own reversal and stay in place.
    res    = (*R1);
    (*R1)  = (*R8);
    (*R8)  = res;
    res    = (*R2);
    (*R2)  = (*R4);
    (*R4)  = res;
    res    = (*R3);
    (*R3)  = (*R12);
    (*R12) = res;
    res    = (*R5);
    (*R5)  = (*R10);
    (*R10) = res;
    res    = (*R7);
    (*R7)  = (*R14);
    (*R14) = res;
    res    = (*R11);
    (*R11) = (*R13);
    (*R13) = res;
}

// In-place 16-point butterfly; the "Inv" in the name suggests the inverse
// transform direction (the forward counterpart is FwdRad16B1).
//
// The sixteen pointers are declared in 4-bit bit-reversed order
// (R0, R8, R4, R12, ...), so the n-th argument is bound to the name
// R<bitrev4(n)>.  The body runs four stages over pairs whose names are
// 1, 2, 4 and 8 apart, then swaps the values of bit-reversed name pairs.
//
// Every pair (lo, hi) is updated with the same two-statement shape:
//     hi = lo +/- <weighted old hi>;
//     lo = 2.0 * lo - hi;          // i.e. lo -/+ <weighted old hi>
// so the complementary combination is recovered from the first result
// without a temporary.
//
// T(v.y, -v.x) is v turned by -90 degrees, assuming T(a, b) builds a value
// with x = a and y = b and that x / y are the real / imaginary parts --
// TODO confirm for the T in use.
//
// T needs .x / .y members, a two-argument constructor, operator+ / operator-,
// and scaling by 2.0 and by the constants C8Q, C16A and C16B, which are
// defined outside this function.
template <typename T>
__device__ void InvRad16B1(T* R0,
                           T* R8,
                           T* R4,
                           T* R12,
                           T* R2,
                           T* R10,
                           T* R6,
                           T* R14,
                           T* R1,
                           T* R9,
                           T* R5,
                           T* R13,
                           T* R3,
                           T* R11,
                           T* R7,
                           T* R15)
{

    // Scratch value used only by the final swaps.
    T res;

    // Stage 1: neighbouring names (distance 1). hi = lo - hi, lo = lo + old hi.
    (*R1)  = (*R0) - (*R1);
    (*R0)  = 2.0 * (*R0) - (*R1);
    (*R3)  = (*R2) - (*R3);
    (*R2)  = 2.0 * (*R2) - (*R3);
    (*R5)  = (*R4) - (*R5);
    (*R4)  = 2.0 * (*R4) - (*R5);
    (*R7)  = (*R6) - (*R7);
    (*R6)  = 2.0 * (*R6) - (*R7);
    (*R9)  = (*R8) - (*R9);
    (*R8)  = 2.0 * (*R8) - (*R9);
    (*R11) = (*R10) - (*R11);
    (*R10) = 2.0 * (*R10) - (*R11);
    (*R13) = (*R12) - (*R13);
    (*R12) = 2.0 * (*R12) - (*R13);
    (*R15) = (*R14) - (*R15);
    (*R14) = 2.0 * (*R14) - (*R15);

    // Stage 2: names 2 apart. Even-named pairs use a plain difference,
    // odd-named pairs add the old hi value turned via T(y, -x).
    (*R2)  = (*R0) - (*R2);
    (*R0)  = 2.0 * (*R0) - (*R2);
    (*R3)  = (*R1) + T((*R3).y, -(*R3).x);
    (*R1)  = 2.0 * (*R1) - (*R3);
    (*R6)  = (*R4) - (*R6);
    (*R4)  = 2.0 * (*R4) - (*R6);
    (*R7)  = (*R5) + T((*R7).y, -(*R7).x);
    (*R5)  = 2.0 * (*R5) - (*R7);
    (*R10) = (*R8) - (*R10);
    (*R8)  = 2.0 * (*R8) - (*R10);
    (*R11) = (*R9) + T((*R11).y, -(*R11).x);
    (*R9)  = 2.0 * (*R9) - (*R11);
    (*R14) = (*R12) - (*R14);
    (*R12) = 2.0 * (*R12) - (*R14);
    (*R15) = (*R13) + T((*R15).y, -(*R15).x);
    (*R13) = 2.0 * (*R13) - (*R15);

    // Stage 3: names 4 apart. The old hi value is weighted by C8Q both
    // directly and through its T(y, -x) turn (except the plain / pure-turn
    // pairs).
    (*R4)  = (*R0) - (*R4);
    (*R0)  = 2.0 * (*R0) - (*R4);
    (*R5)  = ((*R1) - C8Q * (*R5)) + C8Q * T((*R5).y, -(*R5).x);
    (*R1)  = 2.0 * (*R1) - (*R5);
    (*R6)  = (*R2) + T((*R6).y, -(*R6).x);
    (*R2)  = 2.0 * (*R2) - (*R6);
    (*R7)  = ((*R3) + C8Q * (*R7)) + C8Q * T((*R7).y, -(*R7).x);
    (*R3)  = 2.0 * (*R3) - (*R7);
    (*R12) = (*R8) - (*R12);
    (*R8)  = 2.0 * (*R8) - (*R12);
    (*R13) = ((*R9) - C8Q * (*R13)) + C8Q * T((*R13).y, -(*R13).x);
    (*R9)  = 2.0 * (*R9) - (*R13);
    (*R14) = (*R10) + T((*R14).y, -(*R14).x);
    (*R10) = 2.0 * (*R10) - (*R14);
    (*R15) = ((*R11) + C8Q * (*R15)) + C8Q * T((*R15).y, -(*R15).x);
    (*R11) = 2.0 * (*R11) - (*R15);

    // Stage 4: names 8 apart, weighted by C16A / C16B / C8Q.
    (*R8)  = (*R0) - (*R8);
    (*R0)  = 2.0 * (*R0) - (*R8);
    (*R9)  = ((*R1) - C16A * (*R9)) + C16B * T((*R9).y, -(*R9).x);
    (*R1)  = 2.0 * (*R1) - (*R9);
    (*R10) = ((*R2) - C8Q * (*R10)) + C8Q * T((*R10).y, -(*R10).x);
    (*R2)  = 2.0 * (*R2) - (*R10);
    (*R11) = ((*R3) - C16B * (*R11)) + C16A * T((*R11).y, -(*R11).x);
    (*R3)  = 2.0 * (*R3) - (*R11);
    (*R12) = (*R4) + T((*R12).y, -(*R12).x);
    (*R4)  = 2.0 * (*R4) - (*R12);
    (*R13) = ((*R5) + C16B * (*R13)) + C16A * T((*R13).y, -(*R13).x);
    (*R5)  = 2.0 * (*R5) - (*R13);
    (*R14) = ((*R6) + C8Q * (*R14)) + C8Q * T((*R14).y, -(*R14).x);
    (*R6)  = 2.0 * (*R6) - (*R14);
    (*R15) = ((*R7) + C16A * (*R15)) + C16B * T((*R15).y, -(*R15).x);
    (*R7)  = 2.0 * (*R7) - (*R15);

    // Swap every name with its 4-bit bit-reversed partner
    // (1<->8, 2<->4, 3<->12, 5<->10, 7<->14, 11<->13); names 0, 6, 9 and 15
    // are their own reversal and stay in place.
    res    = (*R1);
    (*R1)  = (*R8);
    (*R8)  = res;
    res    = (*R2);
    (*R2)  = (*R4);
    (*R4)  = res;
    res    = (*R3);
    (*R3)  = (*R12);
    (*R12) = res;
    res    = (*R5);
    (*R5)  = (*R10);
    (*R10) = res;
    res    = (*R7);
    (*R7)  = (*R14);
    (*R14) = res;
    res    = (*R11);
    (*R11) = (*R13);
    (*R13) = res;
}
)_PY_EMBED_"};
const char* radix_17_h {
R"_PY_EMBED_(
/*******************************************************************************
 * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
 ******************************************************************************/

template <typename T>
__device__ void FwdRad17B1(T* R0,
                           T* R1,
                           T* R2,
                           T* R3,
                           T* R4,
                           T* R5,
                           T* R6,
                           T* R7,
                           T* R8,
                           T* R9,
                           T* R10,
                           T* R11,
                           T* R12,
                           T* R13,
                           T* R14,
                           T* R15,
                           T* R16)
{
    T x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, dp, dm;

    x0 = (*R0) + (*R1) + (*R2) + (*R3) + (*R4) + (*R5) + (*R6) + (*R7) + (*R8) + (*R9) + (*R10)
         + (*R11) + (*R12) + (*R13) + (*R14) + (*R15) + (*R16);
    x1  = (*R0);
    x2  = (*R0);
    x3  = (*R0);
    x4  = (*R0);
    x5  = (*R0);
    x6  = (*R0);
    x7  = (*R0);
    x8  = (*R0);
    x9  = (*R0);
    x10 = (*R0);
    x11 = (*R0);
    x12 = (*R0);
    x13 = (*R0);
    x14 = (*R0);
    x15 = (*R0);
    x16 = (*R0);
    dp  = (*R1) + (*R16);
    dm  = (*R1) - (*R16);
    x1.x += Q17i1j1R * dp.x - Q17i1j1I * dm.y;
    x1.y += Q17i1j1R * dp.y + Q17i1j1I * dm.x;
    x16.x += Q17i1j1R * dp.x + Q17i1j1I * dm.y;
    x16.y += Q17i1j1R * dp.y - Q17i1j1I * dm.x;
    x2.x += Q17i2j1R * dp.x - Q17i2j1I * dm.y;
    x2.y += Q17i2j1R * dp.y + Q17i2j1I * dm.x;
    x15.x += Q17i2j1R * dp.x + Q17i2j1I * dm.y;
    x15.y += Q17i2j1R * dp.y - Q17i2j1I * dm.x;
    x3.x += Q17i3j1R * dp.x - Q17i3j1I * dm.y;
    x3.y += Q17i3j1R * dp.y + Q17i3j1I * dm.x;
    x14.x += Q17i3j1R * dp.x + Q17i3j1I * dm.y;
    x14.y += Q17i3j1R * dp.y - Q17i3j1I * dm.x;
    x4.x += Q17i4j1R * dp.x - Q17i4j1I * dm.y;
    x4.y += Q17i4j1R * dp.y + Q17i4j1I * dm.x;
    x13.x += Q17i4j1R * dp.x + Q17i4j1I * dm.y;
    x13.y += Q17i4j1R * dp.y - Q17i4j1I * dm.x;
    x5.x += Q17i5j1R * dp.x - Q17i5j1I * dm.y;
    x5.y += Q17i5j1R * dp.y + Q17i5j1I * dm.x;
    x12.x += Q17i5j1R * dp.x + Q17i5j1I * dm.y;
    x12.y += Q17i5j1R * dp.y - Q17i5j1I * dm.x;
    x6.x += Q17i6j1R * dp.x - Q17i6j1I * dm.y;
    x6.y += Q17i6j1R * dp.y + Q17i6j1I * dm.x;
    x11.x += Q17i6j1R * dp.x + Q17i6j1I * dm.y;
    x11.y += Q17i6j1R * dp.y - Q17i6j1I * dm.x;
    x7.x += Q17i7j1R * dp.x - Q17i7j1I * dm.y;
    x7.y += Q17i7j1R * dp.y + Q17i7j1I * dm.x;
    x10.x += Q17i7j1R * dp.x + Q17i7j1I * dm.y;
    x10.y += Q17i7j1R * dp.y - Q17i7j1I * dm.x;
    x8.x += Q17i8j1R * dp.x - Q17i8j1I * dm.y;
    x8.y += Q17i8j1R * dp.y + Q17i8j1I * dm.x;
    x9.x += Q17i8j1R * dp.x + Q17i8j1I * dm.y;
    x9.y += Q17i8j1R * dp.y - Q17i8j1I * dm.x;
    dp = (*R2) + (*R15);
    dm = (*R2) - (*R15);
    x1.x += Q17i1j2R * dp.x - Q17i1j2I * dm.y;
    x1.y += Q17i1j2R * dp.y + Q17i1j2I * dm.x;
    x16.x += Q17i1j2R * dp.x + Q17i1j2I * dm.y;
    x16.y += Q17i1j2R * dp.y - Q17i1j2I * dm.x;
    x2.x += Q17i2j2R * dp.x - Q17i2j2I * dm.y;
    x2.y += Q17i2j2R * dp.y + Q17i2j2I * dm.x;
    x15.x += Q17i2j2R * dp.x + Q17i2j2I * dm.y;
    x15.y += Q17i2j2R * dp.y - Q17i2j2I * dm.x;
    x3.x += Q17i3j2R * dp.x - Q17i3j2I * dm.y;
    x3.y += Q17i3j2R * dp.y + Q17i3j2I * dm.x;
    x14.x += Q17i3j2R * dp.x + Q17i3j2I * dm.y;
    x14.y += Q17i3j2R * dp.y - Q17i3j2I * dm.x;
    x4.x += Q17i4j2R * dp.x - Q17i4j2I * dm.y;
    x4.y += Q17i4j2R * dp.y + Q17i4j2I * dm.x;
    x13.x += Q17i4j2R * dp.x + Q17i4j2I * dm.y;
    x13.y += Q17i4j2R * dp.y - Q17i4j2I * dm.x;
    x5.x += Q17i5j2R * dp.x - Q17i5j2I * dm.y;
    x5.y += Q17i5j2R * dp.y + Q17i5j2I * dm.x;
    x12.x += Q17i5j2R * dp.x + Q17i5j2I * dm.y;
    x12.y += Q17i5j2R * dp.y - Q17i5j2I * dm.x;
    x6.x += Q17i6j2R * dp.x - Q17i6j2I * dm.y;
    x6.y += Q17i6j2R * dp.y + Q17i6j2I * dm.x;
    x11.x += Q17i6j2R * dp.x + Q17i6j2I * dm.y;
    x11.y += Q17i6j2R * dp.y - Q17i6j2I * dm.x;
    x7.x += Q17i7j2R * dp.x - Q17i7j2I * dm.y;
    x7.y += Q17i7j2R * dp.y + Q17i7j2I * dm.x;
    x10.x += Q17i7j2R * dp.x + Q17i7j2I * dm.y;
    x10.y += Q17i7j2R * dp.y - Q17i7j2I * dm.x;
    x8.x += Q17i8j2R * dp.x - Q17i8j2I * dm.y;
    x8.y += Q17i8j2R * dp.y + Q17i8j2I * dm.x;
    x9.x += Q17i8j2R * dp.x + Q17i8j2I * dm.y;
    x9.y += Q17i8j2R * dp.y - Q17i8j2I * dm.x;
    dp = (*R3) + (*R14);
    dm = (*R3) - (*R14);
    x1.x += Q17i1j3R * dp.x - Q17i1j3I * dm.y;
    x1.y += Q17i1j3R * dp.y + Q17i1j3I * dm.x;
    x16.x += Q17i1j3R * dp.x + Q17i1j3I * dm.y;
    x16.y += Q17i1j3R * dp.y - Q17i1j3I * dm.x;
    x2.x += Q17i2j3R * dp.x - Q17i2j3I * dm.y;
    x2.y += Q17i2j3R * dp.y + Q17i2j3I * dm.x;
    x15.x += Q17i2j3R * dp.x + Q17i2j3I * dm.y;
    x15.y += Q17i2j3R * dp.y - Q17i2j3I * dm.x;
    x3.x += Q17i3j3R * dp.x - Q17i3j3I * dm.y;
    x3.y += Q17i3j3R * dp.y + Q17i3j3I * dm.x;
    x14.x += Q17i3j3R * dp.x + Q17i3j3I * dm.y;
    x14.y += Q17i3j3R * dp.y - Q17i3j3I * dm.x;
    x4.x += Q17i4j3R * dp.x - Q17i4j3I * dm.y;
    x4.y += Q17i4j3R * dp.y + Q17i4j3I * dm.x;
    x13.x += Q17i4j3R * dp.x + Q17i4j3I * dm.y;
    x13.y += Q17i4j3R * dp.y - Q17i4j3I * dm.x;
    x5.x += Q17i5j3R * dp.x - Q17i5j3I * dm.y;
    x5.y += Q17i5j3R * dp.y + Q17i5j3I * dm.x;
    x12.x += Q17i5j3R * dp.x + Q17i5j3I * dm.y;
    x12.y += Q17i5j3R * dp.y - Q17i5j3I * dm.x;
    x6.x += Q17i6j3R * dp.x - Q17i6j3I * dm.y;
    x6.y += Q17i6j3R * dp.y + Q17i6j3I * dm.x;
    x11.x += Q17i6j3R * dp.x + Q17i6j3I * dm.y;
    x11.y += Q17i6j3R * dp.y - Q17i6j3I * dm.x;
    x7.x += Q17i7j3R * dp.x - Q17i7j3I * dm.y;
    x7.y += Q17i7j3R * dp.y + Q17i7j3I * dm.x;
    x10.x += Q17i7j3R * dp.x + Q17i7j3I * dm.y;
    x10.y += Q17i7j3R * dp.y - Q17i7j3I * dm.x;
    x8.x += Q17i8j3R * dp.x - Q17i8j3I * dm.y;
    x8.y += Q17i8j3R * dp.y + Q17i8j3I * dm.x;
    x9.x += Q17i8j3R * dp.x + Q17i8j3I * dm.y;
    x9.y += Q17i8j3R * dp.y - Q17i8j3I * dm.x;
    dp = (*R4) + (*R13);
    dm = (*R4) - (*R13);
    x1.x += Q17i1j4R * dp.x - Q17i1j4I * dm.y;
    x1.y += Q17i1j4R * dp.y + Q17i1j4I * dm.x;
    x16.x += Q17i1j4R * dp.x + Q17i1j4I * dm.y;
    x16.y += Q17i1j4R * dp.y - Q17i1j4I * dm.x;
    x2.x += Q17i2j4R * dp.x - Q17i2j4I * dm.y;
    x2.y += Q17i2j4R * dp.y + Q17i2j4I * dm.x;
    x15.x += Q17i2j4R * dp.x + Q17i2j4I * dm.y;
    x15.y += Q17i2j4R * dp.y - Q17i2j4I * dm.x;
    x3.x += Q17i3j4R * dp.x - Q17i3j4I * dm.y;
    x3.y += Q17i3j4R * dp.y + Q17i3j4I * dm.x;
    x14.x += Q17i3j4R * dp.x + Q17i3j4I * dm.y;
    x14.y += Q17i3j4R * dp.y - Q17i3j4I * dm.x;
    x4.x += Q17i4j4R * dp.x - Q17i4j4I * dm.y;
    x4.y += Q17i4j4R * dp.y + Q17i4j4I * dm.x;
    x13.x += Q17i4j4R * dp.x + Q17i4j4I * dm.y;
    x13.y += Q17i4j4R * dp.y - Q17i4j4I * dm.x;
    x5.x += Q17i5j4R * dp.x - Q17i5j4I * dm.y;
    x5.y += Q17i5j4R * dp.y + Q17i5j4I * dm.x;
    x12.x += Q17i5j4R * dp.x + Q17i5j4I * dm.y;
    x12.y += Q17i5j4R * dp.y - Q17i5j4I * dm.x;
    x6.x += Q17i6j4R * dp.x - Q17i6j4I * dm.y;
    x6.y += Q17i6j4R * dp.y + Q17i6j4I * dm.x;
    x11.x += Q17i6j4R * dp.x + Q17i6j4I * dm.y;
    x11.y += Q17i6j4R * dp.y - Q17i6j4I * dm.x;
    x7.x += Q17i7j4R * dp.x - Q17i7j4I * dm.y;
    x7.y += Q17i7j4R * dp.y + Q17i7j4I * dm.x;
    x10.x += Q17i7j4R * dp.x + Q17i7j4I * dm.y;
    x10.y += Q17i7j4R * dp.y - Q17i7j4I * dm.x;
    x8.x += Q17i8j4R * dp.x - Q17i8j4I * dm.y;
    x8.y += Q17i8j4R * dp.y + Q17i8j4I * dm.x;
    x9.x += Q17i8j4R * dp.x + Q17i8j4I * dm.y;
    x9.y += Q17i8j4R * dp.y - Q17i8j4I * dm.x;
    dp = (*R5) + (*R12);
    dm = (*R5) - (*R12);
    x1.x += Q17i1j5R * dp.x - Q17i1j5I * dm.y;
    x1.y += Q17i1j5R * dp.y + Q17i1j5I * dm.x;
    x16.x += Q17i1j5R * dp.x + Q17i1j5I * dm.y;
    x16.y += Q17i1j5R * dp.y - Q17i1j5I * dm.x;
    x2.x += Q17i2j5R * dp.x - Q17i2j5I * dm.y;
    x2.y += Q17i2j5R * dp.y + Q17i2j5I * dm.x;
    x15.x += Q17i2j5R * dp.x + Q17i2j5I * dm.y;
    x15.y += Q17i2j5R * dp.y - Q17i2j5I * dm.x;
    x3.x += Q17i3j5R * dp.x - Q17i3j5I * dm.y;
    x3.y += Q17i3j5R * dp.y + Q17i3j5I * dm.x;
    x14.x += Q17i3j5R * dp.x + Q17i3j5I * dm.y;
    x14.y += Q17i3j5R * dp.y - Q17i3j5I * dm.x;
    x4.x += Q17i4j5R * dp.x - Q17i4j5I * dm.y;
    x4.y += Q17i4j5R * dp.y + Q17i4j5I * dm.x;
    x13.x += Q17i4j5R * dp.x + Q17i4j5I * dm.y;
    x13.y += Q17i4j5R * dp.y - Q17i4j5I * dm.x;
    x5.x += Q17i5j5R * dp.x - Q17i5j5I * dm.y;
    x5.y += Q17i5j5R * dp.y + Q17i5j5I * dm.x;
    x12.x += Q17i5j5R * dp.x + Q17i5j5I * dm.y;
    x12.y += Q17i5j5R * dp.y - Q17i5j5I * dm.x;
    x6.x += Q17i6j5R * dp.x - Q17i6j5I * dm.y;
    x6.y += Q17i6j5R * dp.y + Q17i6j5I * dm.x;
    x11.x += Q17i6j5R * dp.x + Q17i6j5I * dm.y;
    x11.y += Q17i6j5R * dp.y - Q17i6j5I * dm.x;
    x7.x += Q17i7j5R * dp.x - Q17i7j5I * dm.y;
    x7.y += Q17i7j5R * dp.y + Q17i7j5I * dm.x;
    x10.x += Q17i7j5R * dp.x + Q17i7j5I * dm.y;
    x10.y += Q17i7j5R * dp.y - Q17i7j5I * dm.x;
    x8.x += Q17i8j5R * dp.x - Q17i8j5I * dm.y;
    x8.y += Q17i8j5R * dp.y + Q17i8j5I * dm.x;
    x9.x += Q17i8j5R * dp.x + Q17i8j5I * dm.y;
    x9.y += Q17i8j5R * dp.y - Q17i8j5I * dm.x;
    dp = (*R6) + (*R11);
    dm = (*R6) - (*R11);
    x1.x += Q17i1j6R * dp.x - Q17i1j6I * dm.y;
    x1.y += Q17i1j6R * dp.y + Q17i1j6I * dm.x;
    x16.x += Q17i1j6R * dp.x + Q17i1j6I * dm.y;
    x16.y += Q17i1j6R * dp.y - Q17i1j6I * dm.x;
    x2.x += Q17i2j6R * dp.x - Q17i2j6I * dm.y;
    x2.y += Q17i2j6R * dp.y + Q17i2j6I * dm.x;
    x15.x += Q17i2j6R * dp.x + Q17i2j6I * dm.y;
    x15.y += Q17i2j6R * dp.y - Q17i2j6I * dm.x;
    x3.x += Q17i3j6R * dp.x - Q17i3j6I * dm.y;
    x3.y += Q17i3j6R * dp.y + Q17i3j6I * dm.x;
    x14.x += Q17i3j6R * dp.x + Q17i3j6I * dm.y;
    x14.y += Q17i3j6R * dp.y - Q17i3j6I * dm.x;
    x4.x += Q17i4j6R * dp.x - Q17i4j6I * dm.y;
    x4.y += Q17i4j6R * dp.y + Q17i4j6I * dm.x;
    x13.x += Q17i4j6R * dp.x + Q17i4j6I * dm.y;
    x13.y += Q17i4j6R * dp.y - Q17i4j6I * dm.x;
    x5.x += Q17i5j6R * dp.x - Q17i5j6I * dm.y;
    x5.y += Q17i5j6R * dp.y + Q17i5j6I * dm.x;
    x12.x += Q17i5j6R * dp.x + Q17i5j6I * dm.y;
    x12.y += Q17i5j6R * dp.y - Q17i5j6I * dm.x;
    x6.x += Q17i6j6R * dp.x - Q17i6j6I * dm.y;
    x6.y += Q17i6j6R * dp.y + Q17i6j6I * dm.x;
    x11.x += Q17i6j6R * dp.x + Q17i6j6I * dm.y;
    x11.y += Q17i6j6R * dp.y - Q17i6j6I * dm.x;
    x7.x += Q17i7j6R * dp.x - Q17i7j6I * dm.y;
    x7.y += Q17i7j6R * dp.y + Q17i7j6I * dm.x;
    x10.x += Q17i7j6R * dp.x + Q17i7j6I * dm.y;
    x10.y += Q17i7j6R * dp.y - Q17i7j6I * dm.x;
    x8.x += Q17i8j6R * dp.x - Q17i8j6I * dm.y;
    x8.y += Q17i8j6R * dp.y + Q17i8j6I * dm.x;
    x9.x += Q17i8j6R * dp.x + Q17i8j6I * dm.y;
    x9.y += Q17i8j6R * dp.y - Q17i8j6I * dm.x;
    dp = (*R7) + (*R10);
    dm = (*R7) - (*R10);
    x1.x += Q17i1j7R * dp.x - Q17i1j7I * dm.y;
    x1.y += Q17i1j7R * dp.y + Q17i1j7I * dm.x;
    x16.x += Q17i1j7R * dp.x + Q17i1j7I * dm.y;
    x16.y += Q17i1j7R * dp.y - Q17i1j7I * dm.x;
    x2.x += Q17i2j7R * dp.x - Q17i2j7I * dm.y;
    x2.y += Q17i2j7R * dp.y + Q17i2j7I * dm.x;
    x15.x += Q17i2j7R * dp.x + Q17i2j7I * dm.y;
    x15.y += Q17i2j7R * dp.y - Q17i2j7I * dm.x;
    x3.x += Q17i3j7R * dp.x - Q17i3j7I * dm.y;
    x3.y += Q17i3j7R * dp.y + Q17i3j7I * dm.x;
    x14.x += Q17i3j7R * dp.x + Q17i3j7I * dm.y;
    x14.y += Q17i3j7R * dp.y - Q17i3j7I * dm.x;
    x4.x += Q17i4j7R * dp.x - Q17i4j7I * dm.y;
    x4.y += Q17i4j7R * dp.y + Q17i4j7I * dm.x;
    x13.x += Q17i4j7R * dp.x + Q17i4j7I * dm.y;
    x13.y += Q17i4j7R * dp.y - Q17i4j7I * dm.x;
    x5.x += Q17i5j7R * dp.x - Q17i5j7I * dm.y;
    x5.y += Q17i5j7R * dp.y + Q17i5j7I * dm.x;
    x12.x += Q17i5j7R * dp.x + Q17i5j7I * dm.y;
    x12.y += Q17i5j7R * dp.y - Q17i5j7I * dm.x;
    x6.x += Q17i6j7R * dp.x - Q17i6j7I * dm.y;
    x6.y += Q17i6j7R * dp.y + Q17i6j7I * dm.x;
    x11.x += Q17i6j7R * dp.x + Q17i6j7I * dm.y;
    x11.y += Q17i6j7R * dp.y - Q17i6j7I * dm.x;
    x7.x += Q17i7j7R * dp.x - Q17i7j7I * dm.y;
    x7.y += Q17i7j7R * dp.y + Q17i7j7I * dm.x;
    x10.x += Q17i7j7R * dp.x + Q17i7j7I * dm.y;
    x10.y += Q17i7j7R * dp.y - Q17i7j7I * dm.x;
    x8.x += Q17i8j7R * dp.x - Q17i8j7I * dm.y;
    x8.y += Q17i8j7R * dp.y + Q17i8j7I * dm.x;
    x9.x += Q17i8j7R * dp.x + Q17i8j7I * dm.y;
    x9.y += Q17i8j7R * dp.y - Q17i8j7I * dm.x;
    dp = (*R8) + (*R9);
    dm = (*R8) - (*R9);
    x1.x += Q17i1j8R * dp.x - Q17i1j8I * dm.y;
    x1.y += Q17i1j8R * dp.y + Q17i1j8I * dm.x;
    x16.x += Q17i1j8R * dp.x + Q17i1j8I * dm.y;
    x16.y += Q17i1j8R * dp.y - Q17i1j8I * dm.x;
    x2.x += Q17i2j8R * dp.x - Q17i2j8I * dm.y;
    x2.y += Q17i2j8R * dp.y + Q17i2j8I * dm.x;
    x15.x += Q17i2j8R * dp.x + Q17i2j8I * dm.y;
    x15.y += Q17i2j8R * dp.y - Q17i2j8I * dm.x;
    x3.x += Q17i3j8R * dp.x - Q17i3j8I * dm.y;
    x3.y += Q17i3j8R * dp.y + Q17i3j8I * dm.x;
    x14.x += Q17i3j8R * dp.x + Q17i3j8I * dm.y;
    x14.y += Q17i3j8R * dp.y - Q17i3j8I * dm.x;
    x4.x += Q17i4j8R * dp.x - Q17i4j8I * dm.y;
    x4.y += Q17i4j8R * dp.y + Q17i4j8I * dm.x;
    x13.x += Q17i4j8R * dp.x + Q17i4j8I * dm.y;
    x13.y += Q17i4j8R * dp.y - Q17i4j8I * dm.x;
    x5.x += Q17i5j8R * dp.x - Q17i5j8I * dm.y;
    x5.y += Q17i5j8R * dp.y + Q17i5j8I * dm.x;
    x12.x += Q17i5j8R * dp.x + Q17i5j8I * dm.y;
    x12.y += Q17i5j8R * dp.y - Q17i5j8I * dm.x;
    x6.x += Q17i6j8R * dp.x - Q17i6j8I * dm.y;
    x6.y += Q17i6j8R * dp.y + Q17i6j8I * dm.x;
    x11.x += Q17i6j8R * dp.x + Q17i6j8I * dm.y;
    x11.y += Q17i6j8R * dp.y - Q17i6j8I * dm.x;
    x7.x += Q17i7j8R * dp.x - Q17i7j8I * dm.y;
    x7.y += Q17i7j8R * dp.y + Q17i7j8I * dm.x;
    x10.x += Q17i7j8R * dp.x + Q17i7j8I * dm.y;
    x10.y += Q17i7j8R * dp.y - Q17i7j8I * dm.x;
    x8.x += Q17i8j8R * dp.x - Q17i8j8I * dm.y;
    x8.y += Q17i8j8R * dp.y + Q17i8j8I * dm.x;
    x9.x += Q17i8j8R * dp.x + Q17i8j8I * dm.y;
    x9.y += Q17i8j8R * dp.y - Q17i8j8I * dm.x;
    (*R0)  = x0;
    (*R1)  = x1;
    (*R2)  = x2;
    (*R3)  = x3;
    (*R4)  = x4;
    (*R5)  = x5;
    (*R6)  = x6;
    (*R7)  = x7;
    (*R8)  = x8;
    (*R9)  = x9;
    (*R10) = x10;
    (*R11) = x11;
    (*R12) = x12;
    (*R13) = x13;
    (*R14) = x14;
    (*R15) = x15;
    (*R16) = x16;
}

// Radix-17 butterfly, inverse direction (going by the name), computed in
// place on 17 values passed by pointer.
//
// T must provide .x and .y members and support operator+ / operator-; the
// arithmetic below treats .x as the real part and .y as the imaginary part
// (as in rocfft_complex).
//
// With dp_j = Rj + R(17-j) and dm_j = Rj - R(17-j) for j = 1..8, and 1i the
// imaginary unit, the values stored back are:
//   R0      <- R0 + R1 + ... + R16
//   Rk      <- R0 + sum_j ( Q17i<k>j<j>R * dp_j  -  1i * Q17i<k>j<j>I * dm_j )
//   R(17-k) <- R0 + sum_j ( Q17i<k>j<j>R * dp_j  +  1i * Q17i<k>j<j>I * dm_j )
// for k = 1..8.
//
// The Q17i*j*R / Q17i*j*I constants are defined outside this function;
// presumably they are the real / imaginary parts of the 17th roots of unity
// -- confirm against their definitions.
//
// All results are accumulated in local temporaries and only stored through
// the pointers at the very end, so every input is read before any output is
// overwritten.
// NOTE: the order of the += statements fixes the floating-point summation
// order; reordering them may change results in the last bits.
template <typename T>
__device__ void InvRad17B1(T* R0,
                           T* R1,
                           T* R2,
                           T* R3,
                           T* R4,
                           T* R5,
                           T* R6,
                           T* R7,
                           T* R8,
                           T* R9,
                           T* R10,
                           T* R11,
                           T* R12,
                           T* R13,
                           T* R14,
                           T* R15,
                           T* R16)
{
    T x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, dp, dm;

    // x0 is the plain sum of all 17 inputs
    x0 = (*R0) + (*R1) + (*R2) + (*R3) + (*R4) + (*R5) + (*R6) + (*R7) + (*R8) + (*R9) + (*R10)
         + (*R11) + (*R12) + (*R13) + (*R14) + (*R15) + (*R16);
    // every other output starts from the R0 input and accumulates the
    // contribution of each input pair below
    x1  = (*R0);
    x2  = (*R0);
    x3  = (*R0);
    x4  = (*R0);
    x5  = (*R0);
    x6  = (*R0);
    x7  = (*R0);
    x8  = (*R0);
    x9  = (*R0);
    x10 = (*R0);
    x11 = (*R0);
    x12 = (*R0);
    x13 = (*R0);
    x14 = (*R0);
    x15 = (*R0);
    x16 = (*R0);
    // j = 1: input pair (R1, R16)
    dp  = (*R1) + (*R16);
    dm  = (*R1) - (*R16);
    x1.x += Q17i1j1R * dp.x + Q17i1j1I * dm.y;
    x1.y += Q17i1j1R * dp.y - Q17i1j1I * dm.x;
    x16.x += Q17i1j1R * dp.x - Q17i1j1I * dm.y;
    x16.y += Q17i1j1R * dp.y + Q17i1j1I * dm.x;
    x2.x += Q17i2j1R * dp.x + Q17i2j1I * dm.y;
    x2.y += Q17i2j1R * dp.y - Q17i2j1I * dm.x;
    x15.x += Q17i2j1R * dp.x - Q17i2j1I * dm.y;
    x15.y += Q17i2j1R * dp.y + Q17i2j1I * dm.x;
    x3.x += Q17i3j1R * dp.x + Q17i3j1I * dm.y;
    x3.y += Q17i3j1R * dp.y - Q17i3j1I * dm.x;
    x14.x += Q17i3j1R * dp.x - Q17i3j1I * dm.y;
    x14.y += Q17i3j1R * dp.y + Q17i3j1I * dm.x;
    x4.x += Q17i4j1R * dp.x + Q17i4j1I * dm.y;
    x4.y += Q17i4j1R * dp.y - Q17i4j1I * dm.x;
    x13.x += Q17i4j1R * dp.x - Q17i4j1I * dm.y;
    x13.y += Q17i4j1R * dp.y + Q17i4j1I * dm.x;
    x5.x += Q17i5j1R * dp.x + Q17i5j1I * dm.y;
    x5.y += Q17i5j1R * dp.y - Q17i5j1I * dm.x;
    x12.x += Q17i5j1R * dp.x - Q17i5j1I * dm.y;
    x12.y += Q17i5j1R * dp.y + Q17i5j1I * dm.x;
    x6.x += Q17i6j1R * dp.x + Q17i6j1I * dm.y;
    x6.y += Q17i6j1R * dp.y - Q17i6j1I * dm.x;
    x11.x += Q17i6j1R * dp.x - Q17i6j1I * dm.y;
    x11.y += Q17i6j1R * dp.y + Q17i6j1I * dm.x;
    x7.x += Q17i7j1R * dp.x + Q17i7j1I * dm.y;
    x7.y += Q17i7j1R * dp.y - Q17i7j1I * dm.x;
    x10.x += Q17i7j1R * dp.x - Q17i7j1I * dm.y;
    x10.y += Q17i7j1R * dp.y + Q17i7j1I * dm.x;
    x8.x += Q17i8j1R * dp.x + Q17i8j1I * dm.y;
    x8.y += Q17i8j1R * dp.y - Q17i8j1I * dm.x;
    x9.x += Q17i8j1R * dp.x - Q17i8j1I * dm.y;
    x9.y += Q17i8j1R * dp.y + Q17i8j1I * dm.x;
    // j = 2: input pair (R2, R15)
    dp = (*R2) + (*R15);
    dm = (*R2) - (*R15);
    x1.x += Q17i1j2R * dp.x + Q17i1j2I * dm.y;
    x1.y += Q17i1j2R * dp.y - Q17i1j2I * dm.x;
    x16.x += Q17i1j2R * dp.x - Q17i1j2I * dm.y;
    x16.y += Q17i1j2R * dp.y + Q17i1j2I * dm.x;
    x2.x += Q17i2j2R * dp.x + Q17i2j2I * dm.y;
    x2.y += Q17i2j2R * dp.y - Q17i2j2I * dm.x;
    x15.x += Q17i2j2R * dp.x - Q17i2j2I * dm.y;
    x15.y += Q17i2j2R * dp.y + Q17i2j2I * dm.x;
    x3.x += Q17i3j2R * dp.x + Q17i3j2I * dm.y;
    x3.y += Q17i3j2R * dp.y - Q17i3j2I * dm.x;
    x14.x += Q17i3j2R * dp.x - Q17i3j2I * dm.y;
    x14.y += Q17i3j2R * dp.y + Q17i3j2I * dm.x;
    x4.x += Q17i4j2R * dp.x + Q17i4j2I * dm.y;
    x4.y += Q17i4j2R * dp.y - Q17i4j2I * dm.x;
    x13.x += Q17i4j2R * dp.x - Q17i4j2I * dm.y;
    x13.y += Q17i4j2R * dp.y + Q17i4j2I * dm.x;
    x5.x += Q17i5j2R * dp.x + Q17i5j2I * dm.y;
    x5.y += Q17i5j2R * dp.y - Q17i5j2I * dm.x;
    x12.x += Q17i5j2R * dp.x - Q17i5j2I * dm.y;
    x12.y += Q17i5j2R * dp.y + Q17i5j2I * dm.x;
    x6.x += Q17i6j2R * dp.x + Q17i6j2I * dm.y;
    x6.y += Q17i6j2R * dp.y - Q17i6j2I * dm.x;
    x11.x += Q17i6j2R * dp.x - Q17i6j2I * dm.y;
    x11.y += Q17i6j2R * dp.y + Q17i6j2I * dm.x;
    x7.x += Q17i7j2R * dp.x + Q17i7j2I * dm.y;
    x7.y += Q17i7j2R * dp.y - Q17i7j2I * dm.x;
    x10.x += Q17i7j2R * dp.x - Q17i7j2I * dm.y;
    x10.y += Q17i7j2R * dp.y + Q17i7j2I * dm.x;
    x8.x += Q17i8j2R * dp.x + Q17i8j2I * dm.y;
    x8.y += Q17i8j2R * dp.y - Q17i8j2I * dm.x;
    x9.x += Q17i8j2R * dp.x - Q17i8j2I * dm.y;
    x9.y += Q17i8j2R * dp.y + Q17i8j2I * dm.x;
    // j = 3: input pair (R3, R14)
    dp = (*R3) + (*R14);
    dm = (*R3) - (*R14);
    x1.x += Q17i1j3R * dp.x + Q17i1j3I * dm.y;
    x1.y += Q17i1j3R * dp.y - Q17i1j3I * dm.x;
    x16.x += Q17i1j3R * dp.x - Q17i1j3I * dm.y;
    x16.y += Q17i1j3R * dp.y + Q17i1j3I * dm.x;
    x2.x += Q17i2j3R * dp.x + Q17i2j3I * dm.y;
    x2.y += Q17i2j3R * dp.y - Q17i2j3I * dm.x;
    x15.x += Q17i2j3R * dp.x - Q17i2j3I * dm.y;
    x15.y += Q17i2j3R * dp.y + Q17i2j3I * dm.x;
    x3.x += Q17i3j3R * dp.x + Q17i3j3I * dm.y;
    x3.y += Q17i3j3R * dp.y - Q17i3j3I * dm.x;
    x14.x += Q17i3j3R * dp.x - Q17i3j3I * dm.y;
    x14.y += Q17i3j3R * dp.y + Q17i3j3I * dm.x;
    x4.x += Q17i4j3R * dp.x + Q17i4j3I * dm.y;
    x4.y += Q17i4j3R * dp.y - Q17i4j3I * dm.x;
    x13.x += Q17i4j3R * dp.x - Q17i4j3I * dm.y;
    x13.y += Q17i4j3R * dp.y + Q17i4j3I * dm.x;
    x5.x += Q17i5j3R * dp.x + Q17i5j3I * dm.y;
    x5.y += Q17i5j3R * dp.y - Q17i5j3I * dm.x;
    x12.x += Q17i5j3R * dp.x - Q17i5j3I * dm.y;
    x12.y += Q17i5j3R * dp.y + Q17i5j3I * dm.x;
    x6.x += Q17i6j3R * dp.x + Q17i6j3I * dm.y;
    x6.y += Q17i6j3R * dp.y - Q17i6j3I * dm.x;
    x11.x += Q17i6j3R * dp.x - Q17i6j3I * dm.y;
    x11.y += Q17i6j3R * dp.y + Q17i6j3I * dm.x;
    x7.x += Q17i7j3R * dp.x + Q17i7j3I * dm.y;
    x7.y += Q17i7j3R * dp.y - Q17i7j3I * dm.x;
    x10.x += Q17i7j3R * dp.x - Q17i7j3I * dm.y;
    x10.y += Q17i7j3R * dp.y + Q17i7j3I * dm.x;
    x8.x += Q17i8j3R * dp.x + Q17i8j3I * dm.y;
    x8.y += Q17i8j3R * dp.y - Q17i8j3I * dm.x;
    x9.x += Q17i8j3R * dp.x - Q17i8j3I * dm.y;
    x9.y += Q17i8j3R * dp.y + Q17i8j3I * dm.x;
    // j = 4: input pair (R4, R13)
    dp = (*R4) + (*R13);
    dm = (*R4) - (*R13);
    x1.x += Q17i1j4R * dp.x + Q17i1j4I * dm.y;
    x1.y += Q17i1j4R * dp.y - Q17i1j4I * dm.x;
    x16.x += Q17i1j4R * dp.x - Q17i1j4I * dm.y;
    x16.y += Q17i1j4R * dp.y + Q17i1j4I * dm.x;
    x2.x += Q17i2j4R * dp.x + Q17i2j4I * dm.y;
    x2.y += Q17i2j4R * dp.y - Q17i2j4I * dm.x;
    x15.x += Q17i2j4R * dp.x - Q17i2j4I * dm.y;
    x15.y += Q17i2j4R * dp.y + Q17i2j4I * dm.x;
    x3.x += Q17i3j4R * dp.x + Q17i3j4I * dm.y;
    x3.y += Q17i3j4R * dp.y - Q17i3j4I * dm.x;
    x14.x += Q17i3j4R * dp.x - Q17i3j4I * dm.y;
    x14.y += Q17i3j4R * dp.y + Q17i3j4I * dm.x;
    x4.x += Q17i4j4R * dp.x + Q17i4j4I * dm.y;
    x4.y += Q17i4j4R * dp.y - Q17i4j4I * dm.x;
    x13.x += Q17i4j4R * dp.x - Q17i4j4I * dm.y;
    x13.y += Q17i4j4R * dp.y + Q17i4j4I * dm.x;
    x5.x += Q17i5j4R * dp.x + Q17i5j4I * dm.y;
    x5.y += Q17i5j4R * dp.y - Q17i5j4I * dm.x;
    x12.x += Q17i5j4R * dp.x - Q17i5j4I * dm.y;
    x12.y += Q17i5j4R * dp.y + Q17i5j4I * dm.x;
    x6.x += Q17i6j4R * dp.x + Q17i6j4I * dm.y;
    x6.y += Q17i6j4R * dp.y - Q17i6j4I * dm.x;
    x11.x += Q17i6j4R * dp.x - Q17i6j4I * dm.y;
    x11.y += Q17i6j4R * dp.y + Q17i6j4I * dm.x;
    x7.x += Q17i7j4R * dp.x + Q17i7j4I * dm.y;
    x7.y += Q17i7j4R * dp.y - Q17i7j4I * dm.x;
    x10.x += Q17i7j4R * dp.x - Q17i7j4I * dm.y;
    x10.y += Q17i7j4R * dp.y + Q17i7j4I * dm.x;
    x8.x += Q17i8j4R * dp.x + Q17i8j4I * dm.y;
    x8.y += Q17i8j4R * dp.y - Q17i8j4I * dm.x;
    x9.x += Q17i8j4R * dp.x - Q17i8j4I * dm.y;
    x9.y += Q17i8j4R * dp.y + Q17i8j4I * dm.x;
    // j = 5: input pair (R5, R12)
    dp = (*R5) + (*R12);
    dm = (*R5) - (*R12);
    x1.x += Q17i1j5R * dp.x + Q17i1j5I * dm.y;
    x1.y += Q17i1j5R * dp.y - Q17i1j5I * dm.x;
    x16.x += Q17i1j5R * dp.x - Q17i1j5I * dm.y;
    x16.y += Q17i1j5R * dp.y + Q17i1j5I * dm.x;
    x2.x += Q17i2j5R * dp.x + Q17i2j5I * dm.y;
    x2.y += Q17i2j5R * dp.y - Q17i2j5I * dm.x;
    x15.x += Q17i2j5R * dp.x - Q17i2j5I * dm.y;
    x15.y += Q17i2j5R * dp.y + Q17i2j5I * dm.x;
    x3.x += Q17i3j5R * dp.x + Q17i3j5I * dm.y;
    x3.y += Q17i3j5R * dp.y - Q17i3j5I * dm.x;
    x14.x += Q17i3j5R * dp.x - Q17i3j5I * dm.y;
    x14.y += Q17i3j5R * dp.y + Q17i3j5I * dm.x;
    x4.x += Q17i4j5R * dp.x + Q17i4j5I * dm.y;
    x4.y += Q17i4j5R * dp.y - Q17i4j5I * dm.x;
    x13.x += Q17i4j5R * dp.x - Q17i4j5I * dm.y;
    x13.y += Q17i4j5R * dp.y + Q17i4j5I * dm.x;
    x5.x += Q17i5j5R * dp.x + Q17i5j5I * dm.y;
    x5.y += Q17i5j5R * dp.y - Q17i5j5I * dm.x;
    x12.x += Q17i5j5R * dp.x - Q17i5j5I * dm.y;
    x12.y += Q17i5j5R * dp.y + Q17i5j5I * dm.x;
    x6.x += Q17i6j5R * dp.x + Q17i6j5I * dm.y;
    x6.y += Q17i6j5R * dp.y - Q17i6j5I * dm.x;
    x11.x += Q17i6j5R * dp.x - Q17i6j5I * dm.y;
    x11.y += Q17i6j5R * dp.y + Q17i6j5I * dm.x;
    x7.x += Q17i7j5R * dp.x + Q17i7j5I * dm.y;
    x7.y += Q17i7j5R * dp.y - Q17i7j5I * dm.x;
    x10.x += Q17i7j5R * dp.x - Q17i7j5I * dm.y;
    x10.y += Q17i7j5R * dp.y + Q17i7j5I * dm.x;
    x8.x += Q17i8j5R * dp.x + Q17i8j5I * dm.y;
    x8.y += Q17i8j5R * dp.y - Q17i8j5I * dm.x;
    x9.x += Q17i8j5R * dp.x - Q17i8j5I * dm.y;
    x9.y += Q17i8j5R * dp.y + Q17i8j5I * dm.x;
    // j = 6: input pair (R6, R11)
    dp = (*R6) + (*R11);
    dm = (*R6) - (*R11);
    x1.x += Q17i1j6R * dp.x + Q17i1j6I * dm.y;
    x1.y += Q17i1j6R * dp.y - Q17i1j6I * dm.x;
    x16.x += Q17i1j6R * dp.x - Q17i1j6I * dm.y;
    x16.y += Q17i1j6R * dp.y + Q17i1j6I * dm.x;
    x2.x += Q17i2j6R * dp.x + Q17i2j6I * dm.y;
    x2.y += Q17i2j6R * dp.y - Q17i2j6I * dm.x;
    x15.x += Q17i2j6R * dp.x - Q17i2j6I * dm.y;
    x15.y += Q17i2j6R * dp.y + Q17i2j6I * dm.x;
    x3.x += Q17i3j6R * dp.x + Q17i3j6I * dm.y;
    x3.y += Q17i3j6R * dp.y - Q17i3j6I * dm.x;
    x14.x += Q17i3j6R * dp.x - Q17i3j6I * dm.y;
    x14.y += Q17i3j6R * dp.y + Q17i3j6I * dm.x;
    x4.x += Q17i4j6R * dp.x + Q17i4j6I * dm.y;
    x4.y += Q17i4j6R * dp.y - Q17i4j6I * dm.x;
    x13.x += Q17i4j6R * dp.x - Q17i4j6I * dm.y;
    x13.y += Q17i4j6R * dp.y + Q17i4j6I * dm.x;
    x5.x += Q17i5j6R * dp.x + Q17i5j6I * dm.y;
    x5.y += Q17i5j6R * dp.y - Q17i5j6I * dm.x;
    x12.x += Q17i5j6R * dp.x - Q17i5j6I * dm.y;
    x12.y += Q17i5j6R * dp.y + Q17i5j6I * dm.x;
    x6.x += Q17i6j6R * dp.x + Q17i6j6I * dm.y;
    x6.y += Q17i6j6R * dp.y - Q17i6j6I * dm.x;
    x11.x += Q17i6j6R * dp.x - Q17i6j6I * dm.y;
    x11.y += Q17i6j6R * dp.y + Q17i6j6I * dm.x;
    x7.x += Q17i7j6R * dp.x + Q17i7j6I * dm.y;
    x7.y += Q17i7j6R * dp.y - Q17i7j6I * dm.x;
    x10.x += Q17i7j6R * dp.x - Q17i7j6I * dm.y;
    x10.y += Q17i7j6R * dp.y + Q17i7j6I * dm.x;
    x8.x += Q17i8j6R * dp.x + Q17i8j6I * dm.y;
    x8.y += Q17i8j6R * dp.y - Q17i8j6I * dm.x;
    x9.x += Q17i8j6R * dp.x - Q17i8j6I * dm.y;
    x9.y += Q17i8j6R * dp.y + Q17i8j6I * dm.x;
    // j = 7: input pair (R7, R10)
    dp = (*R7) + (*R10);
    dm = (*R7) - (*R10);
    x1.x += Q17i1j7R * dp.x + Q17i1j7I * dm.y;
    x1.y += Q17i1j7R * dp.y - Q17i1j7I * dm.x;
    x16.x += Q17i1j7R * dp.x - Q17i1j7I * dm.y;
    x16.y += Q17i1j7R * dp.y + Q17i1j7I * dm.x;
    x2.x += Q17i2j7R * dp.x + Q17i2j7I * dm.y;
    x2.y += Q17i2j7R * dp.y - Q17i2j7I * dm.x;
    x15.x += Q17i2j7R * dp.x - Q17i2j7I * dm.y;
    x15.y += Q17i2j7R * dp.y + Q17i2j7I * dm.x;
    x3.x += Q17i3j7R * dp.x + Q17i3j7I * dm.y;
    x3.y += Q17i3j7R * dp.y - Q17i3j7I * dm.x;
    x14.x += Q17i3j7R * dp.x - Q17i3j7I * dm.y;
    x14.y += Q17i3j7R * dp.y + Q17i3j7I * dm.x;
    x4.x += Q17i4j7R * dp.x + Q17i4j7I * dm.y;
    x4.y += Q17i4j7R * dp.y - Q17i4j7I * dm.x;
    x13.x += Q17i4j7R * dp.x - Q17i4j7I * dm.y;
    x13.y += Q17i4j7R * dp.y + Q17i4j7I * dm.x;
    x5.x += Q17i5j7R * dp.x + Q17i5j7I * dm.y;
    x5.y += Q17i5j7R * dp.y - Q17i5j7I * dm.x;
    x12.x += Q17i5j7R * dp.x - Q17i5j7I * dm.y;
    x12.y += Q17i5j7R * dp.y + Q17i5j7I * dm.x;
    x6.x += Q17i6j7R * dp.x + Q17i6j7I * dm.y;
    x6.y += Q17i6j7R * dp.y - Q17i6j7I * dm.x;
    x11.x += Q17i6j7R * dp.x - Q17i6j7I * dm.y;
    x11.y += Q17i6j7R * dp.y + Q17i6j7I * dm.x;
    x7.x += Q17i7j7R * dp.x + Q17i7j7I * dm.y;
    x7.y += Q17i7j7R * dp.y - Q17i7j7I * dm.x;
    x10.x += Q17i7j7R * dp.x - Q17i7j7I * dm.y;
    x10.y += Q17i7j7R * dp.y + Q17i7j7I * dm.x;
    x8.x += Q17i8j7R * dp.x + Q17i8j7I * dm.y;
    x8.y += Q17i8j7R * dp.y - Q17i8j7I * dm.x;
    x9.x += Q17i8j7R * dp.x - Q17i8j7I * dm.y;
    x9.y += Q17i8j7R * dp.y + Q17i8j7I * dm.x;
    // j = 8: input pair (R8, R9)
    dp = (*R8) + (*R9);
    dm = (*R8) - (*R9);
    x1.x += Q17i1j8R * dp.x + Q17i1j8I * dm.y;
    x1.y += Q17i1j8R * dp.y - Q17i1j8I * dm.x;
    x16.x += Q17i1j8R * dp.x - Q17i1j8I * dm.y;
    x16.y += Q17i1j8R * dp.y + Q17i1j8I * dm.x;
    x2.x += Q17i2j8R * dp.x + Q17i2j8I * dm.y;
    x2.y += Q17i2j8R * dp.y - Q17i2j8I * dm.x;
    x15.x += Q17i2j8R * dp.x - Q17i2j8I * dm.y;
    x15.y += Q17i2j8R * dp.y + Q17i2j8I * dm.x;
    x3.x += Q17i3j8R * dp.x + Q17i3j8I * dm.y;
    x3.y += Q17i3j8R * dp.y - Q17i3j8I * dm.x;
    x14.x += Q17i3j8R * dp.x - Q17i3j8I * dm.y;
    x14.y += Q17i3j8R * dp.y + Q17i3j8I * dm.x;
    x4.x += Q17i4j8R * dp.x + Q17i4j8I * dm.y;
    x4.y += Q17i4j8R * dp.y - Q17i4j8I * dm.x;
    x13.x += Q17i4j8R * dp.x - Q17i4j8I * dm.y;
    x13.y += Q17i4j8R * dp.y + Q17i4j8I * dm.x;
    x5.x += Q17i5j8R * dp.x + Q17i5j8I * dm.y;
    x5.y += Q17i5j8R * dp.y - Q17i5j8I * dm.x;
    x12.x += Q17i5j8R * dp.x - Q17i5j8I * dm.y;
    x12.y += Q17i5j8R * dp.y + Q17i5j8I * dm.x;
    x6.x += Q17i6j8R * dp.x + Q17i6j8I * dm.y;
    x6.y += Q17i6j8R * dp.y - Q17i6j8I * dm.x;
    x11.x += Q17i6j8R * dp.x - Q17i6j8I * dm.y;
    x11.y += Q17i6j8R * dp.y + Q17i6j8I * dm.x;
    x7.x += Q17i7j8R * dp.x + Q17i7j8I * dm.y;
    x7.y += Q17i7j8R * dp.y - Q17i7j8I * dm.x;
    x10.x += Q17i7j8R * dp.x - Q17i7j8I * dm.y;
    x10.y += Q17i7j8R * dp.y + Q17i7j8I * dm.x;
    x8.x += Q17i8j8R * dp.x + Q17i8j8I * dm.y;
    x8.y += Q17i8j8R * dp.y - Q17i8j8I * dm.x;
    x9.x += Q17i8j8R * dp.x - Q17i8j8I * dm.y;
    x9.y += Q17i8j8R * dp.y + Q17i8j8I * dm.x;
    // store the results back over the inputs
    (*R0)  = x0;
    (*R1)  = x1;
    (*R2)  = x2;
    (*R3)  = x3;
    (*R4)  = x4;
    (*R5)  = x5;
    (*R6)  = x6;
    (*R7)  = x7;
    (*R8)  = x8;
    (*R9)  = x9;
    (*R10) = x10;
    (*R11) = x11;
    (*R12) = x12;
    (*R13) = x13;
    (*R14) = x14;
    (*R15) = x15;
    (*R16) = x16;
}
)_PY_EMBED_"};
const char* device_properties_h {
R"_PY_EMBED_(
// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_DEVICE_PROPS_H
#define ROCFFT_DEVICE_PROPS_H


// Look up the device id reported by hipGetDevice and return that
// device's hipDeviceProp_t.  Throws std::runtime_error if either HIP
// call returns something other than hipSuccess.
static hipDeviceProp_t get_curr_device_prop()
{
    int        current_device = 0;
    const auto query_status   = hipGetDevice(&current_device);
    if(query_status != hipSuccess)
    {
        throw std::runtime_error("hipGetDevice failed.");
    }

    hipDeviceProp_t device_props;
    const auto      props_status = hipGetDeviceProperties(&device_props, current_device);
    if(props_status != hipSuccess)
    {
        // include the device id so the failing device can be identified
        std::string message = "hipGetDeviceProperties failed for deviceId ";
        message += std::to_string(current_device);
        throw std::runtime_error(message);
    }

    return device_props;
}

// Validate a kernel launch configuration against the limits recorded
// in deviceProp.  Three checks are made, in this order: per-dimension
// block size, total threads per block, per-dimension grid size.  The
// first violated limit raises std::runtime_error with a message that
// names the limit and the kernel.
static void launch_limits_check(const std::string&     kernel_name,
                                const dim3             gridDim,
                                const dim3             blockDim,
                                const hipDeviceProp_t& deviceProp)
{
    // dim3 components are unsigned while the device limits are signed,
    // so the limit is converted to unsigned before comparing.
    const auto over_limit = [](uint32_t requested, int limit) {
        return requested > static_cast<uint32_t>(limit);
    };

    // Block limits along each dimension
    const bool block_dim_exceeded = over_limit(blockDim.x, deviceProp.maxThreadsDim[0])
                                    || over_limit(blockDim.y, deviceProp.maxThreadsDim[1])
                                    || over_limit(blockDim.z, deviceProp.maxThreadsDim[2]);
    if(block_dim_exceeded)
    {
        throw std::runtime_error("max threads per dim exceeded: " + kernel_name);
    }

    // Total threads for the whole block; the product is formed in
    // 64 bits so it cannot wrap a 32-bit value.
    const uint64_t threads_in_block
        = static_cast<uint64_t>(blockDim.x) * blockDim.y * blockDim.z;
    const uint64_t thread_limit = static_cast<uint64_t>(deviceProp.maxThreadsPerBlock);
    if(threads_in_block > thread_limit)
    {
        throw std::runtime_error("max threads per block exceeded: " + kernel_name);
    }

    // Grid dimension limits
    const bool grid_dim_exceeded = over_limit(gridDim.x, deviceProp.maxGridSize[0])
                                   || over_limit(gridDim.y, deviceProp.maxGridSize[1])
                                   || over_limit(gridDim.z, deviceProp.maxGridSize[2]);
    if(grid_dim_exceeded)
    {
        throw std::runtime_error("max grid size exceeded: " + kernel_name);
    }
}

#endif
)_PY_EMBED_"};
const char* rocfft_hip_h {
R"_PY_EMBED_(
// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef __ROCFFT_HIP_H__
#define __ROCFFT_HIP_H__


// Scope guard that switches the HIP device for the lifetime of the
// object: the constructor records the device reported by hipGetDevice
// and then calls hipSetDevice(device); the destructor calls
// hipSetDevice with the recorded device again.
class rocfft_scoped_device
{
    // device id reported by hipGetDevice at construction time
    int previous_device;

public:
    // Throws std::runtime_error if the current device cannot be
    // queried or the requested device cannot be selected.
    rocfft_scoped_device(int device)
    {
        const auto query_status = hipGetDevice(&previous_device);
        if(query_status != hipSuccess)
        {
            throw std::runtime_error("hipGetDevice failure");
        }

        const auto switch_status = hipSetDevice(device);
        if(switch_status != hipSuccess)
        {
            throw std::runtime_error("hipSetDevice failure");
        }
    }

    // not copyable or movable
    rocfft_scoped_device(const rocfft_scoped_device&) = delete;
    rocfft_scoped_device(rocfft_scoped_device&&)      = delete;
    rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete;

    // Best effort: the result of hipSetDevice is deliberately ignored,
    // since a destructor has no good way to report the failure.
    ~rocfft_scoped_device()
    {
        (void)hipSetDevice(previous_device);
    }
};

#endif // __ROCFFT_HIP_H__
)_PY_EMBED_"};
const char* gpubuf_h {
R"_PY_EMBED_(
// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_GPUBUF_H
#define ROCFFT_GPUBUF_H


// Simple RAII class for GPU buffers.  T is the type of pointer that
// data() returns.
//
// A gpubuf_t either owns its pointer (obtained through alloc(), and
// released with hipFree on the device that was current at allocation
// time) or merely wraps a pointer owned by someone else (see
// make_nonowned()), in which case it never frees it.
template <class T = void>
class gpubuf_t
{
public:
    gpubuf_t() {}
    // buffers are movable but not copyable
    //
    // Move construction swaps with the default-initialized members of
    // the new object, so 'other' is left empty.
    gpubuf_t(gpubuf_t&& other)
    {
        std::swap(buf, other.buf);
        std::swap(owned, other.owned);
        std::swap(bsize, other.bsize);
        std::swap(device, other.device);
        std::swap(is_managed_memory, other.is_managed_memory);
    }
    // Move assignment swaps contents: whatever this object held before
    // ends up in 'other' and is released when 'other' is destroyed.
    gpubuf_t& operator=(gpubuf_t&& other)
    {
        std::swap(buf, other.buf);
        std::swap(owned, other.owned);
        std::swap(bsize, other.bsize);
        std::swap(device, other.device);
        std::swap(is_managed_memory, other.is_managed_memory);
        return *this;
    }
    gpubuf_t(const gpubuf_t&) = delete;
    gpubuf_t& operator=(const gpubuf_t&) = delete;

    // Wrap a pointer that this object must not free.  size_bytes is
    // only recorded for size(); it is not validated.
    static gpubuf_t make_nonowned(T* p, size_t size_bytes = 0)
    {
        gpubuf_t ret;
        ret.owned             = false;
        ret.buf               = p;
        ret.bsize             = size_bytes;
        ret.is_managed_memory = false; // irrelevant if not owned
        return ret;
    }

    ~gpubuf_t()
    {
        free();
    }

    // True when the ROCFFT_MALLOC_MANAGED environment variable is set
    // (to any value, including an empty one).
    static bool use_alloc_managed()
    {
        return std::getenv("ROCFFT_MALLOC_MANAGED") != nullptr;
    }

    // Allocate 'size' bytes, releasing any buffer currently held.
    // Managed memory is used when make_it_shared is true or
    // use_alloc_managed() says so.
    //
    // Returns the error from hipGetDevice if the current device cannot
    // be queried (the object is left untouched in that case), otherwise
    // the result of the allocation call.  On allocation failure the
    // object is left empty.
    hipError_t alloc(const size_t size, bool make_it_shared = false)
    {
        // Query into a local: 'device' must keep describing the buffer
        // we may still hold until that buffer has been released.
        int  alloc_device = 0;
        auto ret          = hipGetDevice(&alloc_device);
        if(ret != hipSuccess)
            return ret;

        // Release the previous buffer *before* recording the new
        // bookkeeping.  free() releases on 'device' and resets bsize to
        // 0, so running it after those were updated would free on the
        // wrong device and turn this request into a 0-byte allocation.
        free();

        // remember the device that was current as of alloc, so we can
        // free on the correct device
        device            = alloc_device;
        bsize             = size;
        is_managed_memory = use_alloc_managed() || make_it_shared;
        ret = is_managed_memory ? hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize);
        if(ret != hipSuccess)
        {
            buf   = nullptr;
            bsize = 0;
        }
        return ret;
    }

    // Size in bytes recorded by alloc() / make_nonowned().
    size_t size() const
    {
        return bsize;
    }

    // Release the buffer if this object owns it, and reset to the empty
    // state.  Safe to call repeatedly.  The result of hipFree is
    // deliberately ignored.
    void free()
    {
        if(buf != nullptr)
        {
            if(owned)
            {
                // free on the device we allocated on
                rocfft_scoped_device dev(device);
                (void)hipFree(buf);
            }
            buf   = nullptr;
            bsize = 0;
        }
        // an empty object owns whatever it allocates next
        owned = true;
    }

    // return a pointer to the allocated memory, offset by the
    // specified number of bytes
    T* data_offset(size_t offset_bytes = 0) const
    {
        void* ptr = static_cast<char*>(buf) + offset_bytes;
        return static_cast<T*>(ptr);
    }

    T* data() const
    {
        return static_cast<T*>(buf);
    }

    // equality/bool tests
    bool operator==(std::nullptr_t n) const
    {
        return buf == n;
    }
    bool operator!=(std::nullptr_t n) const
    {
        return buf != n;
    }
    operator bool() const
    {
        return buf != nullptr;
    }

private:
    // The GPU buffer
    void* buf = nullptr;
    // whether this object owns the 'buf' pointer (and hence needs to
    // free it)
    bool   owned             = true;
    // whether the most recent alloc() requested managed memory
    bool   is_managed_memory = false;
    // size in bytes associated with 'buf'
    size_t bsize             = 0;
    // device that was current when 'buf' was allocated
    int    device            = 0;
};

// default gpubuf that gives out void* pointers
typedef gpubuf_t<> gpubuf;
#endif
)_PY_EMBED_"};
const char* rtc_kernel_h {
R"_PY_EMBED_(
// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_RTC_H
#define ROCFFT_RTC_H




// Forward declarations, so that the declarations below can refer to
// these types without needing their full definitions.
struct DeviceCallIn;
class TreeNode;
class LeafNode;
struct GridParam;

// Packs kernel arguments into one contiguous byte buffer, inserting
// padding in front of each argument so that it starts at a suitably
// aligned offset.  Arguments are stored in the order they are appended.
class RTCKernelArgs
{
    // packed argument bytes, including any alignment padding
    std::vector<char> buf;

    // Copy nbytes from src to the end of the buffer, first padding the
    // buffer up to a multiple of 'align'.  An 'align' of 0 means "align
    // to the value's own width" (i.e. 8-byte values get 8-byte
    // alignment, 4-byte values get 4-byte alignment).
    void append(const void* src, size_t nbytes, size_t align = 0)
    {
        const size_t alignment = (align == 0) ? nbytes : align;

        // round the current end of the buffer up to the alignment
        const size_t current_end = buf.size();
        const size_t overhang    = current_end % alignment;
        const size_t write_pos
            = (overhang == 0) ? current_end : current_end + (alignment - overhang);

        // resize() value-initializes the new bytes, so padding is zero
        buf.resize(write_pos + nbytes);
        const char* first = static_cast<const char*>(src);
        std::copy_n(first, nbytes, buf.begin() + write_pos);
    }

    // Append the raw bytes of a single value, aligned to its own size.
    template <typename Tvalue>
    void append_value(const Tvalue& value)
    {
        append(&value, sizeof(Tvalue));
    }

public:
    RTCKernelArgs() = default;

    void append_ptr(const void* ptr)
    {
        append_value(ptr);
    }
    void append_size_t(size_t s)
    {
        append_value(s);
    }
    void append_unsigned_int(unsigned int i)
    {
        append_value(i);
    }
    void append_int(int i)
    {
        append_value(i);
    }
    void append_double(double d)
    {
        append_value(d);
    }
    void append_float(float f)
    {
        append_value(f);
    }
    void append_half(rocfft_fp16 f)
    {
        append_value(f);
    }
    // Structs are always placed on an 8-byte boundary, whatever their
    // size.
    template <typename T>
    void append_struct(const T& data)
    {
        append(&data, sizeof(T), 8);
    }

    // number of bytes packed so far, padding included
    size_t size_bytes() const
    {
        return buf.size();
    }
    // start of the packed argument bytes
    void* data()
    {
        return buf.data();
    }
};

// Base class for a runtime compiled kernel.  Subclassed for
// different kernel types that each have their own details about how
// to be launched.
struct RTCKernel
{
    // Try to compile a kernel for node.  The returned future holds
    // nullptr if there is no matching supported scheme + problem
    // size.  kernel_name is an output: it receives the generated
    // kernel name when a generator matches.  Throws runtime_error on
    // error.
    static std::shared_future<std::unique_ptr<RTCKernel>>
        runtime_compile(const LeafNode&    node,
                        const std::string& gpu_arch,
                        std::string&       kernel_name,
                        bool               enable_callbacks = false);

    // take already-compiled code object and prepare to launch the
    // named kernel
    RTCKernel(const std::string&       kernel_name,
              const std::vector<char>& code,
              dim3                     gridDim  = {},
              dim3                     blockDim = {});

    virtual ~RTCKernel()
    {
        kernel = nullptr;
        // module is still null if the constructor returned before
        // loading the code object; only unload a module that was
        // actually loaded
        if(module != nullptr)
            (void)hipModuleUnload(module);
        module = nullptr;
    }

    // disallow copies, since we expect this to be managed by smart ptr
    RTCKernel(const RTCKernel&) = delete;
    RTCKernel(RTCKernel&&)      = delete;

    void operator=(const RTCKernel&) = delete;

    // normal launch from within rocFFT execution plan
    void launch(DeviceCallIn& data, const hipDeviceProp_t& deviceProp);
    // direct launch with kernel args
    void launch(RTCKernelArgs&         kargs,
                dim3                   gridDim,
                dim3                   blockDim,
                unsigned int           lds_bytes,
                const hipDeviceProp_t& deviceProp,
                hipStream_t            stream = nullptr);

    // query the maximum number of active blocks per multiprocessor
    // for this kernel with the given block dimensions and LDS size.
    // returns true and writes the result to occupancy on success,
    // false if the query failed.
    bool get_occupancy(dim3 blockDim, unsigned int lds_bytes, int& occupancy);

#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS
    // Subclasses implement this - each kernel type has different
    // parameters
    virtual RTCKernelArgs get_launch_args(DeviceCallIn& data) = 0;
#endif

    // function to construct the correct RTCKernel object, given a kernel name and its compiled code
    using rtckernel_construct_t = std::function<std::unique_ptr<RTCKernel>(
        const std::string&, const std::vector<char>&, dim3, dim3)>;

    // grid parameters for this kernel.  may be set by runtime
    // compilation, if compilation of this kernel type knows how to.
    // Otherwise, TreeNode::SetupGridParam_internal will do it
    // later.
    dim3 gridDim;
    dim3 blockDim;

    // name of the kernel function inside the code object
    std::string kernel_name;

protected:
#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS
    // Bundle of callbacks that produce the name and source of a
    // kernel and construct the matching RTCKernel subclass once the
    // code has been compiled.
    struct RTCGenerator
    {
        kernel_name_gen_t     generate_name;
        kernel_src_gen_t      generate_src;
        rtckernel_construct_t construct_rtckernel;

        // usable only when all three callbacks are set
        virtual bool valid() const
        {
            return generate_name && generate_src && construct_rtckernel;
        }
        // generator is the correct type, but kernel is already compiled
        virtual bool is_pre_compiled() const
        {
            return false;
        }

        // if known at compile time, the grid parameters of the kernel
        // to launch with
        dim3 gridDim;
        dim3 blockDim;
    };
#endif

    // handles to the loaded code object and the kernel function in
    // it; both stay null until the constructor loads them
    hipModule_t   module = nullptr;
    hipFunction_t kernel = nullptr;
};

#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS

// helper functions to construct pieces of RTC kernel names
// Kernel-name suffix for an array type.  Hermitian layouts generate
// the same code as the corresponding complex layouts, so they share
// a suffix.  Any other value maps to "_UN".
static const char* rtc_array_type_name(rocfft_array_type type)
{
    const bool is_interleaved = type == rocfft_array_type_complex_interleaved
                                || type == rocfft_array_type_hermitian_interleaved;
    if(is_interleaved)
        return "_CI";

    const bool is_planar = type == rocfft_array_type_complex_planar
                           || type == rocfft_array_type_hermitian_planar;
    if(is_planar)
        return "_CP";

    if(type == rocfft_array_type_real)
        return "_R";

    return "_UN";
}

// Kernel-name suffix for a precision: "_sp", "_dp" or "_half".
// A value outside the listed enumerators maps to "_UN", the same
// fallback rtc_array_type_name uses for unknown array types.
static const char* rtc_precision_name(rocfft_precision precision)
{
    // no default label, so the compiler can still warn if a new
    // enumerator is added without a case here
    switch(precision)
    {
    case rocfft_precision_single:
        return "_sp";
    case rocfft_precision_double:
        return "_dp";
    case rocfft_precision_half:
        return "_half";
    }
    // reached only for an out-of-range value; returning here avoids
    // the undefined behaviour of flowing off the end of the function
    return "_UN";
}

// Source line that typedefs scalar_type for the given precision:
// rocfft_complex<T> when is_complex is true, plain T otherwise.
// A value outside the listed enumerators yields an #error line.
static const char* rtc_precision_type_decl(rocfft_precision precision, bool is_complex = true)
{
    // no default label, so the compiler can still warn if a new
    // enumerator is added without a case here
    switch(precision)
    {
    case rocfft_precision_single:
        return is_complex ? "typedef rocfft_complex<float> scalar_type;\n"
                          : "typedef float scalar_type;\n";
    case rocfft_precision_double:
        return is_complex ? "typedef rocfft_complex<double> scalar_type;\n"
                          : "typedef double scalar_type;\n";
    case rocfft_precision_half:
        return is_complex ? "typedef rocfft_complex<rocfft_fp16> scalar_type;\n"
                          : "typedef rocfft_fp16 scalar_type;\n";
    }
    // reached only for an out-of-range value; returning here avoids
    // the undefined behaviour of flowing off the end of the function.
    // NOTE(review): assumes the result is placed at the start of a
    // line in generated source, so the directive fails the kernel
    // compile with a readable message - confirm against callers.
    return "#error unsupported rocfft_precision for scalar_type\n";
}

// Kernel-name suffix for a callback type.  NONE contributes nothing
// to the name.  A value outside the listed enumerators maps to
// "_UN", the same fallback rtc_array_type_name uses for unknown
// array types.
static const char* rtc_cbtype_name(CallbackType cbtype)
{
    // no default label, so the compiler can still warn if a new
    // enumerator is added without a case here
    switch(cbtype)
    {
    case CallbackType::NONE:
        return "";
    case CallbackType::USER_LOAD_STORE:
        return "_CB";
    case CallbackType::USER_LOAD_STORE_R2C:
        return "_CBr2c";
    case CallbackType::USER_LOAD_STORE_C2R:
        return "_CBc2r";
    }
    // reached only for an out-of-range value; returning here avoids
    // the undefined behaviour of flowing off the end of the function
    return "_UN";
}

// Source line that declares the constant "cbtype" with the given
// callback type.  A value outside the listed enumerators yields an
// #error line.
static const std::string rtc_const_cbtype_decl(CallbackType cbtype)
{
    // no default label, so the compiler can still warn if a new
    // enumerator is added without a case here
    switch(cbtype)
    {
    case CallbackType::NONE:
        return "static const CallbackType cbtype = CallbackType::NONE;\n";
    case CallbackType::USER_LOAD_STORE:
        return "static const CallbackType cbtype = CallbackType::USER_LOAD_STORE;\n";
    case CallbackType::USER_LOAD_STORE_R2C:
        return "static const CallbackType cbtype = CallbackType::USER_LOAD_STORE_R2C;\n";
    case CallbackType::USER_LOAD_STORE_C2R:
        return "static const CallbackType cbtype = CallbackType::USER_LOAD_STORE_C2R;\n";
    }
    // reached only for an out-of-range value; returning here avoids
    // the undefined behaviour of flowing off the end of the function.
    // NOTE(review): assumes the result is placed at the start of a
    // line in generated source, so the directive fails the kernel
    // compile with a readable message - confirm against callers.
    return "#error unsupported CallbackType for cbtype\n";
}
#endif

#endif
)_PY_EMBED_"};
const char* rtc_kernel_cpp {
R"_PY_EMBED_(
// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.



// Load an already-compiled code object and look up the named kernel
// in it.  Throws std::runtime_error if the module cannot be loaded or
// the kernel function cannot be found.  Outside of harness builds,
// loading is skipped entirely (module and kernel stay null) when the
// ROCFFT_INTERNAL_COMPILE_ONLY environment variable is "1".
RTCKernel::RTCKernel(const std::string&       kernel_name,
                     const std::vector<char>& code,
                     dim3                     gridDim,
                     dim3                     blockDim)
    : gridDim(gridDim)
    , blockDim(blockDim)
    , kernel_name(kernel_name)
{
#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS
    // if we're only compiling, no need to actually load the code objects
    if(rocfft_getenv("ROCFFT_INTERNAL_COMPILE_ONLY") == "1")
        return;
#endif
    if(hipModuleLoadData(&module, code.data()) != hipSuccess)
        throw std::runtime_error("failed to load module for " + kernel_name);

    if(hipModuleGetFunction(&kernel, module, kernel_name.c_str()) != hipSuccess)
    {
        // the destructor does not run when a constructor throws, so
        // the module loaded above has to be released here or it leaks
        (void)hipModuleUnload(module);
        module = nullptr;
        kernel = nullptr;
        throw std::runtime_error("failed to get function " + kernel_name);
    }
}

#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS
// Launch from within an execution plan: the subclass supplies the
// kernel arguments, and the launch geometry, LDS size and stream are
// read from the call data.
void RTCKernel::launch(DeviceCallIn& data, const hipDeviceProp_t& deviceProp)
{
    RTCKernelArgs launch_args = get_launch_args(data);

    const auto& grid = data.gridParam;

    const dim3 grid_dims{grid.b_x, grid.b_y, grid.b_z};
    const dim3 block_dims{grid.wgs_x, grid.wgs_y, grid.wgs_z};

    launch(launch_args, grid_dims, block_dims, grid.lds_bytes, deviceProp, data.rocfft_stream);
}
#endif

// Direct launch with an explicit argument buffer and launch geometry.
// Runs launch_limits_check first, logs the kernel occupancy when plan
// logging is enabled (non-harness builds only), then launches the
// kernel.  Throws std::runtime_error if the launch call fails.
void RTCKernel::launch(RTCKernelArgs&         kargs,
                       dim3                   gridDim,
                       dim3                   blockDim,
                       unsigned int           lds_bytes,
                       const hipDeviceProp_t& deviceProp,
                       hipStream_t            stream)
{
    launch_limits_check(kernel_name, gridDim, blockDim, deviceProp);

    // arguments are handed over as one packed buffer: pointer and
    // size are passed through the "extra" launch parameter list
    auto  arg_bytes       = kargs.size_bytes();
    void* launch_config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                             kargs.data(),
                             HIP_LAUNCH_PARAM_BUFFER_SIZE,
                             &arg_bytes,
                             HIP_LAUNCH_PARAM_END};

#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS
    if(LOG_PLAN_ENABLED())
    {
        int             blocks_per_sm;
        const bool      have_occupancy = get_occupancy(blockDim, lds_bytes, blocks_per_sm);
        rocfft_ostream* plan_log       = LogSingleton::GetInstance().GetPlanOS();
        if(!have_occupancy)
            *plan_log << "Can not retrieve occupancy info." << std::endl;
        else
            *plan_log << "Kernel occupancy: " << blocks_per_sm << std::endl;
    }
#endif

    const hipError_t launch_status = hipModuleLaunchKernel(kernel,
                                                           gridDim.x,
                                                           gridDim.y,
                                                           gridDim.z,
                                                           blockDim.x,
                                                           blockDim.y,
                                                           blockDim.z,
                                                           lds_bytes,
                                                           stream,
                                                           nullptr,
                                                           launch_config);
    if(launch_status != hipSuccess)
        throw std::runtime_error("hipModuleLaunchKernel failure");
}

// Query the maximum number of active blocks per multiprocessor for
// this kernel, given the block dimensions and LDS size.  Writes the
// result to occupancy and returns true when the query succeeds.
bool RTCKernel::get_occupancy(dim3 blockDim, unsigned int lds_bytes, int& occupancy)
{
    const auto threads_per_block = blockDim.x * blockDim.y * blockDim.z;

    const hipError_t status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
        &occupancy, kernel, threads_per_block, lds_bytes);

    return status == hipSuccess;
}

// Find a generator that can produce a kernel for node and compile it
// on a detached background thread.
//
// kernel_name is an output: it receives the generated kernel name
// when a generator is valid (or reports itself as pre-compiled).
//
// Returns a future that yields the constructed RTCKernel, or an
// exception if compilation/construction failed on the worker thread.
// The future yields nullptr when no generator matches, when the
// kernel is pre-compiled, or when a kernel harness is being
// generated.
//
// Throws std::runtime_error if the current device cannot be queried.
std::shared_future<std::unique_ptr<RTCKernel>>
    RTCKernel::runtime_compile(const LeafNode&    node,
                               const std::string& gpu_arch,
                               std::string&       kernel_name,
                               bool               enable_callbacks)
{
#ifndef ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS
    // remember the caller's device so the worker thread can switch to it
    int deviceId = 0;
    if(hipGetDevice(&deviceId) != hipSuccess)
    {
        throw std::runtime_error("failed to get device");
    }

    RTCGenerator generator;
    // try each type of generator until one is valid
    generator = RTCKernelStockham::generate_from_node(node, gpu_arch, enable_callbacks);
    if(!generator.valid())
        generator = RTCKernelTranspose::generate_from_node(node, gpu_arch, enable_callbacks);
    if(!generator.valid())
        generator = RTCKernelRealComplex::generate_from_node(node, gpu_arch, enable_callbacks);
    if(!generator.valid())
        generator = RTCKernelRealComplexEven::generate_from_node(node, gpu_arch, enable_callbacks);
    if(!generator.valid())
        generator = RTCKernelRealComplexEvenTranspose::generate_from_node(
            node, gpu_arch, enable_callbacks);
    if(!generator.valid())
        generator = RTCKernelBluesteinSingle::generate_from_node(node, gpu_arch, enable_callbacks);
    if(!generator.valid())
        generator = RTCKernelBluesteinMulti::generate_from_node(node, gpu_arch, enable_callbacks);

    if(generator.valid())
    {
        kernel_name = generator.generate_name();

        // everything is captured by value, since the lambda runs on a
        // detached thread that may outlive this call
        auto compile = [=](std::promise<std::unique_ptr<RTCKernel>> compile_promise) {
            if(hipSetDevice(deviceId) != hipSuccess)
            {
                compile_promise.set_exception(
                    std::make_exception_ptr(std::runtime_error("failed to set device")));
                // the promise is satisfied now; touching it again
                // would throw std::future_error, and an exception
                // escaping this detached thread terminates the process
                return;
            }
            try
            {
                std::vector<char> code = RTCCache::cached_compile(
                    kernel_name, gpu_arch, generator.generate_src, generator_sum());
                compile_promise.set_value(generator.construct_rtckernel(
                    kernel_name, code, generator.gridDim, generator.blockDim));
            }
            catch(std::exception& e)
            {
                if(LOG_RTC_ENABLED())
                    (*LogSingleton::GetInstance().GetRTCOS()) << e.what() << std::endl;
                compile_promise.set_exception(std::current_exception());
            }
        };

        // compile to code object
        std::promise<std::unique_ptr<RTCKernel>>       compile_promise;
        std::shared_future<std::unique_ptr<RTCKernel>> compile_future
            = compile_promise.get_future();
        std::thread compile_thread(compile, std::move(compile_promise));
        // we'll wait for the future so the thread can continue
        // without being managed by this object
        compile_thread.detach();
        return compile_future;
    }
    // a pre-compiled rtc-stockham-kernel goes here
    else if(generator.is_pre_compiled())
    {
        kernel_name = generator.generate_name();
    }
#endif
    // kernel harness being generated or no kernel found, return
    // null RTCKernel
    std::promise<std::unique_ptr<RTCKernel>> p;
    p.set_value(nullptr);
    return p.get_future();
}
)_PY_EMBED_"};
const char* rtc_test_harness_helper_cpp {
R"_PY_EMBED_(
// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// utility code to embed into generated test harnesses, to simplify
// allocating and initializing device memory

// Allocate a device buffer large enough for hvec and copy the
// contents of hvec into it.  Throws std::runtime_error if either the
// allocation or the copy fails.
template <typename T>
gpubuf_t<T> host_vec_to_dev(const std::vector<T>& hvec)
{
    gpubuf_t<T> devbuf;

    const auto nbytes = sizeof(T) * hvec.size();

    const bool allocated = devbuf.alloc(nbytes) == hipSuccess;
    if(!allocated)
        throw std::runtime_error("failed to hipMalloc");

    const auto copy_status = hipMemcpy(devbuf.data(), hvec.data(), nbytes, hipMemcpyHostToDevice);
    if(copy_status != hipSuccess)
        throw std::runtime_error("failed to memcpy");

    return devbuf;
}

// Integer division of a by b, rounded up.  The arithmetic is done in
// the common type of the operands and the result is converted to T1.
template <typename T1, typename T2>
T1 ceildiv(T1 a, T2 b)
{
    // adding (b - 1) before dividing bumps any non-zero remainder up
    // to the next whole quotient
    const auto rounded_up = a + b - 1;
    return rounded_up / b;
}

// Fill a host vector of count complex values with random data and
// copy it to the device.  The vector is split into partitions, each
// filled by its own generator seeded with the partition index, and
// both components of every element are drawn from a uniform
// distribution constructed with bounds (0.0, 1.0).
template <typename Tcomplex>
gpubuf_t<Tcomplex> random_complex_device(unsigned int count)
{
    std::vector<Tcomplex> hostBuf(count);

    // use at least 32 partitions, more if the hardware reports more
    // concurrency than that
    const auto num_chunks = std::max<size_t>(std::thread::hardware_concurrency(), 32);
    const auto chunk_len  = ceildiv(count, num_chunks);

#pragma omp parallel for
    for(unsigned int chunk = 0; chunk < num_chunks; ++chunk)
    {
        std::mt19937                           rng(chunk);
        std::uniform_real_distribution<double> unit(0.0, 1.0);

        const auto first = chunk * chunk_len;
        // trailing chunks can start past the end of the data
        if(first < count)
        {
            const auto last = std::min(first + chunk_len, count);
            for(auto idx = first; idx < last; ++idx)
            {
                hostBuf[idx].x = unit(rng);
                hostBuf[idx].y = unit(rng);
            }
        }
    }
    return host_vec_to_dev(hostBuf);
}

// Fill a host vector of count real values with random data and copy
// it to the device.  The vector is split into partitions, each filled
// by its own generator seeded with the partition index, and every
// element is drawn from a uniform distribution constructed with
// bounds (0.0, 1.0).
template <typename Treal>
gpubuf_t<Treal> random_real_device(unsigned int count)
{
    std::vector<Treal> hostBuf(count);

    // use at least 32 partitions, more if the hardware reports more
    // concurrency than that
    const auto num_chunks = std::max<size_t>(std::thread::hardware_concurrency(), 32);
    const auto chunk_len  = ceildiv(count, num_chunks);

#pragma omp parallel for
    for(unsigned int chunk = 0; chunk < num_chunks; ++chunk)
    {
        std::mt19937                           rng(chunk);
        std::uniform_real_distribution<double> unit(0.0, 1.0);

        const auto first = chunk * chunk_len;
        // trailing chunks can start past the end of the data
        if(first < count)
        {
            const auto last = std::min(first + chunk_len, count);
            for(auto idx = first; idx < last; ++idx)
            {
                hostBuf[idx] = unit(rng);
            }
        }
    }
    return host_vec_to_dev(hostBuf);
}

// compile a function using hipRTC
std::unique_ptr<RTCKernel> compile(const std::string& name, const std::string& src)
{
    hiprtcProgram prog;
    if(hiprtcCreateProgram(&prog, src.c_str(), "rtc.cu", 0, nullptr, nullptr) != HIPRTC_SUCCESS)
    {
        throw std::runtime_error("unable to create program");
    }
    std::vector<const char*> options;
    options.reserve(2);
    options.push_back("-O3");
    options.push_back("-mcumode");

    auto compileResult = hiprtcCompileProgram(prog, options.size(), options.data());
    if(compileResult != HIPRTC_SUCCESS)
    {
        size_t logSize = 0;
        hiprtcGetProgramLogSize(prog, &logSize);

        if(logSize)
        {
            std::vector<char> log(logSize, '\0');
            if(hiprtcGetProgramLog(prog, log.data()) == HIPRTC_SUCCESS)
                throw std::runtime_error(log.data());
        }
        throw std::runtime_error("compile failed without log");
    }

    size_t codeSize;
    if(hiprtcGetCodeSize(prog, &codeSize) != HIPRTC_SUCCESS)
        throw std::runtime_error("failed to get code size");

    std::vector<char> code(codeSize);
    if(hiprtcGetCode(prog, code.data()) != HIPRTC_SUCCESS)
        throw std::runtime_error("failed to get code");
    hiprtcDestroyProgram(&prog);

    return std::make_unique<RTCKernel>(name, code);
}
)_PY_EMBED_"};
// Returns a fixed table of 32 bytes.
// NOTE(review): presumably a digest of the embedded generator sources
// in this file (judging by the name, and by the value being passed to
// RTCCache::cached_compile alongside the kernel source generator) -
// confirm.  If so, it has to be regenerated whenever the embedded
// text changes.
const std::array<char,32> generator_sum() { return {'\xa7', '\x8c', '\xc5', '\xd4', '\xf8', '\x52', '\x54', '\xb3', '\x68', '\x89', '\x8f', '\x8f', '\xb0', '\xcd', '\xca', '\xaf', '\xd2', '\x8b', '\x83', '\xdd', '\xef', '\x9e', '\x6d', '\xfa', '\x84', '\x3d', '\x59', '\x7a', '\x3c', '\x95', '\x2e', '\xd2', };}
