1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP 
   /** @brief Appends the OpenCL source of the 'bluestein_post' kernel to a program string.
    *
    * The generated kernel walks i over [0, size) starting at the global id and stepping by the global size.
    * For each i it forms angle = -NUM_PI * ((i*i) mod (2*size)) / size, builds the complex factor
    * (cos(angle), sin(angle)) via sincos(), and writes the complex product Z[i] * factor to out[i].
    *
    * The square i*i is evaluated in 64-bit arithmetic (ulong) inside the kernel. A 32-bit product wraps
    * for i >= 65536, which yields a wrong remainder whenever 2*size does not divide 2^32.
    * For i < 65536 the emitted arithmetic gives the same values as a plain 32-bit product.
    * NOTE(review): requires 64-bit integer support on the device (optional in the OpenCL embedded profile).
    *
    * @param source          String-like object that receives the kernel text through append()
    * @param numeric_string  Name of the OpenCL scalar type spliced into the kernel (e.g. "float"); "2" is appended for the complex vector type
    */
   template<typename StringT>
   void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
   {
     source.append("__kernel void bluestein_post(__global ");
     source.append(numeric_string);
     source.append("2 *Z, \n"
                   "                             __global ");
     source.append(numeric_string);
     source.append("2 *out, \n"
                   "                             unsigned int size) \n"
                   "{ \n"
                   "  unsigned int glb_id = get_global_id(0); \n"
                   "  unsigned int glb_sz = get_global_size(0); \n"
                   "  unsigned int double_size = size << 1; \n"
                   "  ");
     source.append(numeric_string);
     source.append(" sn_a, cs_a; \n"
                   "  const ");
     source.append(numeric_string);
     source.append(" NUM_PI = 3.14159265358979323846; \n"
                   "  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");

     // Widen to ulong before squaring so the remainder stays exact for i >= 65536.
     source.append("    unsigned int rm = (unsigned int)(((ulong)i * (ulong)i) % (ulong)double_size); \n"
                   "    ");
     source.append(numeric_string);
     source.append(" angle = (");
     source.append(numeric_string);
     source.append(")rm / size * (-NUM_PI); \n"
                   "    sn_a = sincos(angle, &cs_a); \n"
                   "    ");
     source.append(numeric_string);
     source.append("2 b_i = (");
     source.append(numeric_string);
     source.append("2)(cs_a, sn_a); \n"
                   "    out[i] = (");
     source.append(numeric_string);
     source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n"
                   "  } \n"
                   "} \n");
   }
 
   /** @brief Appends the OpenCL source of the 'bluestein_pre' kernel to a program string.
    *
    * The generated kernel walks i over [0, size) starting at the global id and stepping by the global size.
    * With angle = NUM_PI * ((i*i) mod (2*size)) / size it evaluates sincos(-angle) and forms
    *   a_i = (cs_a, sn_a)  and  b_i = (cs_a, -sn_a).
    * It then writes A[i] = input[i] * a_i (complex product), B[i] = b_i and, for i != 0, B[ext_size - i] = b_i.
    *
    * The square i*i is evaluated in 64-bit arithmetic (ulong) inside the kernel. A 32-bit product wraps
    * for i >= 65536, which yields a wrong remainder whenever 2*size does not divide 2^32.
    * For i < 65536 the emitted arithmetic gives the same values as a plain 32-bit product.
    * NOTE(review): requires 64-bit integer support on the device (optional in the OpenCL embedded profile).
    *
    * @param source          String-like object that receives the kernel text through append()
    * @param numeric_string  Name of the OpenCL scalar type spliced into the kernel (e.g. "float"); "2" is appended for the complex vector type
    */
   template<typename StringT>
   void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
   {
     source.append("__kernel void bluestein_pre(__global ");
     source.append(numeric_string);
     source.append("2 *input, \n"
                   "  __global ");
     source.append(numeric_string);
     source.append("2 *A, \n"
                   "  __global ");
     source.append(numeric_string);
     source.append("2 *B, \n"
                   "  unsigned int size, \n"
                   "  unsigned int ext_size \n"
                   "  ) { \n"
                   "  unsigned int glb_id = get_global_id(0); \n"
                   "  unsigned int glb_sz = get_global_size(0); \n"
                   "  unsigned int double_size = size << 1; \n"
                   "  ");
     source.append(numeric_string);
     source.append(" sn_a, cs_a; \n"
                   "  const ");
     source.append(numeric_string);
     source.append(" NUM_PI = 3.14159265358979323846; \n"
                   "  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");

     // Widen to ulong before squaring so the remainder stays exact for i >= 65536.
     source.append("    unsigned int rm = (unsigned int)(((ulong)i * (ulong)i) % (ulong)double_size); \n"
                   "    ");
     source.append(numeric_string);
     source.append(" angle = (");
     source.append(numeric_string);
     source.append(")rm / size * NUM_PI; \n"
                   "    sn_a = sincos(-angle, &cs_a); \n"
                   "    ");
     source.append(numeric_string);
     source.append("2 a_i = (");
     source.append(numeric_string);
     source.append("2)(cs_a, sn_a); \n"
                   "    ");
     source.append(numeric_string);
     source.append("2 b_i = (");
     source.append(numeric_string);
     source.append("2)(cs_a, -sn_a); \n"
                   "    A[i] = (");
     source.append(numeric_string);
     source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n"
                   "    B[i] = b_i; \n"
                   "    if (i) \n"
                   "      B[ext_size - i] = b_i; \n"
                   "  } \n"
                   "} \n");
   }
 
  /** @brief Appends the OpenCL source of the 'complex_to_real' kernel to a program string.
   *
   * The generated kernel copies the .x component of every in[i] (i < size) to out[i].
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; used as-is for 'out' and with a "2" suffix for 'in'
   */
  template<typename StringT>
  void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void complex_to_real(__global ");
    source.append(numeric_string);
    source.append("2 *in, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append("  *out, \n"
                  "  unsigned int size) { \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n"
                  "    out[i] = in[i].x; \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'fft_div_vec_scalar' kernel to a program string.
   *
   * The generated kernel divides every two-component entry input1[i] (i < size) in place by the scalar 'factor'.
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; used as-is for 'factor' and with a "2" suffix for 'input1'
   */
  template<typename StringT>
  void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void fft_div_vec_scalar(__global ");
    source.append(numeric_string);
    source.append("2 *input1, \n"
                  "  unsigned int size, \n"
                  "  ");
    source.append(numeric_string);
    source.append(" factor) { \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n"
                  "    input1[i] /= factor; \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'fft_mult_vec' kernel to a program string.
   *
   * The generated kernel writes, for every i < size, the complex product of input1[i] and input2[i]
   * (x = real part, y = imaginary part) to output[i].
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; "2" is appended to obtain the complex vector type
   */
  template<typename StringT>
  void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void fft_mult_vec(__global const ");
    source.append(numeric_string);
    source.append("2 *input1, \n"
                  "  __global const ");
    source.append(numeric_string);
    source.append("2 *input2, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append("2 *output, \n"
                  "  unsigned int size) { \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"
                  "    ");
    source.append(numeric_string);
    source.append("2 in1 = input1[i]; \n"
                  "    ");
    source.append(numeric_string);
    source.append("2 in2 = input2[i]; \n"
                  "    output[i] = (");
    source.append(numeric_string);
    source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n"
                  "  } \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'real_to_complex' kernel to a program string.
   *
   * The generated kernel builds, for every i < size, a two-component value initialised to 0,
   * sets its .x component to in[i], and stores it in out[i].
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; used as-is for 'in' and with a "2" suffix for 'out'
   */
  template<typename StringT>
  void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void real_to_complex(__global ");
    source.append(numeric_string);
    source.append(" *in, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append("2 *out, \n"
                  "  unsigned int size) { \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"
                  "    ");
    source.append(numeric_string);
    source.append("2 val = 0; \n"
                  "    val.x = in[i]; \n"
                  "    out[i] = val; \n"
                  "  } \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'reverse_inplace' kernel to a program string.
   *
   * The generated kernel swaps vec[i] with vec[size - i - 1] for every i below size/2,
   * which reverses the order of the entries in place.
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL element type of 'vec' (spliced in without any suffix)
   */
  template<typename StringT>
  void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void reverse_inplace(__global ");
    source.append(numeric_string);
    source.append(" *vec, uint size) { \n"
                  "  for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n"
                  "    ");
    source.append(numeric_string);
    source.append(" val1 = vec[i]; \n"
                  "    ");
    source.append(numeric_string);
    source.append(" val2 = vec[size - i - 1]; \n"
                  "    vec[i] = val2; \n"
                  "    vec[size - i - 1] = val1; \n"
                  "  } \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'transpose' kernel to a program string.
   *
   * The generated kernel treats 'input' as row_num x col_num entries addressed as row * col_num + col
   * and copies each entry to output[col * row_num + row].
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; "2" is appended to obtain the element type of both buffers
   */
  template<typename StringT>
  void generate_fft_transpose(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void transpose(__global ");
    source.append(numeric_string);
    source.append("2 *input, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append("2 *output, \n"
                  "  unsigned int row_num, \n"
                  "  unsigned int col_num) { \n"
                  "  unsigned int size = row_num * col_num; \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n"
                  "    unsigned int row = i / col_num; \n"
                  "    unsigned int col = i - row*col_num; \n"
                  "    unsigned int new_pos = col * row_num + row; \n"
                  "    output[new_pos] = input[i]; \n"
                  "  } \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'transpose_inplace' kernel to a program string.
   *
   * The generated kernel maps each linear index i = row * col_num + col to new_pos = col * row_num + row
   * and, only when i < new_pos, swaps input[i] with input[new_pos].
   *
   * NOTE(review): a pairwise swap of i and new_pos looks like a complete transpose only when
   * row_num == col_num; confirm that callers pass square inputs only.
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; "2" is appended to obtain the element type of 'input'
   */
  template<typename StringT>
  void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void transpose_inplace(__global ");
    source.append(numeric_string);
    source.append("2* input, \n"
                  "  unsigned int row_num, \n"
                  "  unsigned int col_num) { \n"
                  "  unsigned int size = row_num * col_num; \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n"
                  "    unsigned int row = i / col_num; \n"
                  "    unsigned int col = i - row*col_num; \n"
                  "    unsigned int new_pos = col * row_num + row; \n"
                  "    if (i < new_pos) { \n"
                  "      ");
    source.append(numeric_string);
    source.append("2 val = input[i]; \n"
                  "      input[i] = input[new_pos]; \n"
                  "      input[new_pos] = val; \n"
                  "    } \n"
                  "  } \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'vandermonde_prod' kernel to a program string.
   *
   * For every i < size the generated kernel accumulates the sum over j in [0, size) of vander[i]^j * vector[j]
   * (the power is built up by repeated multiplication, starting from 1) and stores it in result[i].
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL element type of all three buffers (spliced in without any suffix)
   */
  template<typename StringT>
  void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void vandermonde_prod(__global ");
    source.append(numeric_string);
    source.append(" *vander, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append(" *vector, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append(" *result, \n"
                  "  uint size) { \n"
                  "  for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n"
                  "    ");
    source.append(numeric_string);
    source.append(" mul = vander[i]; \n"
                  "    ");
    source.append(numeric_string);
    source.append(" pwr = 1; \n"
                  "    ");
    source.append(numeric_string);
    source.append(" val = 0; \n"
                  "    for (uint j = 0; j < size; j++) { \n"
                  "      val = val + pwr * vector[j]; \n"
                  "      pwr *= mul; \n"
                  "    } \n"
                  "    result[i] = val; \n"
                  "  } \n"
                  "} \n");
  }
 
  /** @brief Appends the OpenCL source of the 'zero2' kernel to a program string.
   *
   * The generated kernel assigns 0 to input1[i] and input2[i] for every i < size, clearing both buffers in one launch.
   *
   * @param source          String-like object that receives the kernel text through append()
   * @param numeric_string  Name of the OpenCL scalar type; "2" is appended to obtain the element type of both buffers
   */
  template<typename StringT>
  void generate_fft_zero2(StringT & source, std::string const & numeric_string)
  {
    source.append("__kernel void zero2(__global ");
    source.append(numeric_string);
    source.append("2 *input1, \n"
                  "  __global ");
    source.append(numeric_string);
    source.append("2 *input2, \n"
                  "  unsigned int size) { \n"
                  "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"
                  "    input1[i] = 0; \n"
                  "    input2[i] = 0; \n"
                  "  } \n"
                  "} \n");
  }
 
  // Excerpt of the FFT program set-up code, templated on the numeric type NumericT.
  // NOTE(review): this listing is incomplete; the enclosing declaration and several intermediate
  // statements are not visible here, so the comments below describe only the lines shown.
  259 template<
typename NumericT>
 
  // Per-context flags keyed by the raw cl_context handle; an entry is set to true at the end of this excerpt.
  269     static std::map<cl_context, bool> init_done;
 
  // Reserve 8192 characters of capacity in the program source string up front.
  276       source.reserve(8192);
 
  // Delegates to the viennacl::ocl helper for NumericT; presumably emits the fp64 pragma when needed (helper not visible here).
  278       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  // Guard for the two supported scalar type names; the guarded statements are not visible in this excerpt.
  281       if (numeric_string == 
"float" || numeric_string == 
"double")
 
  // Diagnostic message, compiled in only when VIENNACL_BUILD_INFO is defined.
  297       #ifdef VIENNACL_BUILD_INFO 
  298       std::cout << 
"Creating program " << prog_name << std::endl;
 
  // Hand the assembled source to the context under the name prog_name.
  300       ctx.add_program(source, prog_name);
 
  // Record that the program has been added for this context's handle.
  301       init_done[ctx.handle().get()] = 
true;
 
void generate_fft_div_vec_scalar(StringT &source, std::string const &numeric_string)
OpenCL kernel generation code for dividing a complex number by a real number. 
Main kernel class for generating OpenCL kernels for the fast Fourier transform. 
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
static std::string program_name()
void generate_fft_bluestein_post(StringT &source, std::string const &numeric_string)
Provides OpenCL-related utilities. 
void generate_fft_reverse_inplace(StringT &source, std::string const &numeric_string)
Reverses the entries in a vector. 
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
void generate_fft_complex_to_real(StringT &source, std::string const &numeric_string)
Extract real part of a complex number array. 
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const 
void generate_fft_zero2(StringT &source, std::string const &numeric_string)
Zero two complex vectors (to avoid kernel launch overhead) 
void generate_fft_mult_vec(StringT &source, std::string const &numeric_string)
Elementwise product of two complex vectors. 
void generate_fft_transpose_inplace(StringT &source, std::string const &numeric_string)
Simplistic inplace matrix transpose function. 
void generate_fft_bluestein_pre(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL. 
void generate_fft_vandermonde_prod(StringT &source, std::string const &numeric_string)
Computes the matrix vector product with a Vandermonde matrix. 
void generate_fft_real_to_complex(StringT &source, std::string const &numeric_string)
Embeds a real-valued vector into a complex one. 
Helper class for converting a type to its string representation. 
static void init(viennacl::ocl::context &ctx)
void generate_fft_transpose(StringT &source, std::string const &numeric_string)
Simplistic matrix transpose function.