1 #ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ 
   80 inline __host__ __device__ float2 
operator+(float2 a, float2 b)
 
   82   return make_float2(a.x + b.x, a.y + b.y);
 
   86 inline __host__ __device__ float2 
operator-(float2 a, float2 b)
 
   88   return make_float2(a.x - b.x, a.y - b.y);
 
   91 template<
typename SCALARTYPE>
 
   92 inline __device__ float2 
operator/(float2 a,SCALARTYPE b)
 
   94   return make_float2(a.x/b, a.y/b);
 
   98 inline __device__ float2 
operator*(float2 in1, float2 in2)
 
  100   return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
 
  104 inline __host__ __device__ double2 
operator+(double2 a, double2 b)
 
  106   return make_double2(a.x + b.x, a.y + b.y);
 
  110 inline __host__ __device__ double2 
operator-(double2 a, double2 b)
 
  112   return make_double2(a.x - b.x, a.y - b.y);
 
  116 template<
typename SCALARTYPE>
 
  117 inline __host__ __device__ double2 
operator/(double2 a,SCALARTYPE b)
 
  119   return make_double2(a.x/b, a.y/b);
 
  123 inline __host__ __device__ double2 
operator*(double2 in1, double2 in2)
 
  125   return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
 
  130   v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
 
  131   v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
 
  132   v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
 
  133   v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
 
  134   v = (v >> 16) | (v << 16);
 
  135   v = v >> (32 - bit_size);
 
  139 template<
typename Numeric2T, 
typename NumericT>
 
  141     const Numeric2T * input,
 
  145     unsigned int batch_num,
 
  150   const NumericT NUM_PI(3.14159265358979323846);
 
  152   for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
 
  154     for (
unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
 
  160       for (
unsigned int n = 0; n < 
size; n++)
 
  164           in = input[batch_id * stride + n];   
 
  166           in = input[n * stride + batch_id];
 
  169         NumericT arg = sign * 2 * NUM_PI * k / size * n;
 
  177         tmp.x = in.x * ex.x - in.y * ex.y;
 
  178         tmp.y = in.x * ex.y + in.y * ex.x;
 
  183         output[batch_id * stride + k] = f; 
 
  185         output[k * stride + batch_id] = f;
 
  196 template<
typename NumericT, 
unsigned int AlignmentV>
 
  205   fft_direct<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(in)),
 
  207                           static_cast<unsigned int>(size),
 
  208                           static_cast<unsigned int>(
stride),
 
  209                           static_cast<unsigned int>(batch_num),
 
  221 template<
typename NumericT, 
unsigned int AlignmentV>
 
  230   fft_direct<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(in)),
 
  232                           static_cast<unsigned int>(size),
 
  233                           static_cast<unsigned int>(
stride),
 
  234                           static_cast<unsigned int>(batch_num),
 
  240 template<
typename NumericT>
 
  242                             unsigned int bit_size,
 
  245                             unsigned int batch_num,
 
  249   unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
 
  250   unsigned int glb_sz = gridDim.x * blockDim.x;
 
  252   for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
 
  254     for (
unsigned int i = glb_id; i < 
size; i += glb_sz)
 
  262           NumericT tmp = input[batch_id * stride + i];    
 
  263           input[batch_id * stride + i] = input[batch_id * stride + v];
 
  264           input[batch_id * stride + v] = tmp;
 
  268           NumericT tmp = input[i * stride + batch_id];
 
  269           input[i * stride + batch_id] = input[v * stride + batch_id];
 
  270           input[v * stride + batch_id] = tmp;
 
  281 template<
typename NumericT, 
unsigned int AlignmentV>
 
  289                            static_cast<unsigned int>(bits_datasize),
 
  290                            static_cast<unsigned int>(
size),
 
  291                            static_cast<unsigned int>(stride),
 
  292                            static_cast<unsigned int>(batch_num),
 
  293                            static_cast<bool>(data_order));
 
  297 template<
typename Numeric2T, 
typename NumericT>
 
  299                                  unsigned int bit_size,
 
  302                                  unsigned int batch_num,
 
  306   __shared__ Numeric2T lcl_input[1024];
 
  307   unsigned int grp_id = blockIdx.x;
 
  308   unsigned int grp_num = gridDim.x;
 
  310   unsigned int lcl_sz = blockDim.x;
 
  311   unsigned int lcl_id = threadIdx.x;
 
  312   const NumericT NUM_PI(3.14159265358979323846);
 
  314   for (
unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
 
  316     for (
unsigned int p = lcl_id; p < 
size; p += lcl_sz)
 
  320         lcl_input[v] = input[batch_id * stride + p];
 
  322         lcl_input[v] = input[p * stride + batch_id];
 
  328     for (
unsigned int s = 0; s < bit_size; s++)
 
  330       unsigned int ss = 1 << s;
 
  332       for (
unsigned int tid = lcl_id; tid < 
size; tid += lcl_sz)
 
  334         unsigned int group = (tid & (ss - 1));
 
  335         unsigned int pos = ((tid >> s) << (s + 1)) + group;
 
  337         Numeric2T in1 = lcl_input[pos];
 
  338         Numeric2T in2 = lcl_input[pos + ss];
 
  340         NumericT arg = group * sign * NUM_PI / ss;
 
  349         tmp.x = in2.x * ex.x - in2.y * ex.y;
 
  350         tmp.y = in2.x * ex.y + in2.y * ex.x;
 
  352         lcl_input[pos + ss] = in1 - tmp;
 
  353         lcl_input[pos]      = in1 + tmp;
 
  359     for (
unsigned int p = lcl_id; p < 
size; p += lcl_sz)
 
  362         input[batch_id * stride + p] = lcl_input[p];   
 
  364         input[p * stride + batch_id] = lcl_input[p];
 
  370 template<
typename Numeric2T, 
typename NumericT>
 
  373                            unsigned int bit_size,
 
  376                            unsigned int batch_num,
 
  381   unsigned int ss = 1 << s;
 
  382   unsigned int half_size = size >> 1;
 
  385   const NumericT NUM_PI(3.14159265358979323846);
 
  387   unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
 
  388   unsigned int glb_sz = gridDim.x * blockDim.x;
 
  390   for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
 
  392     for (
unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
 
  394       unsigned int group = (tid & (ss - 1));
 
  395       unsigned int pos = ((tid >> s) << (s + 1)) + group;
 
  401         offset = batch_id * stride + pos;
 
  403         in2 = input[offset + ss];
 
  407         offset = pos * stride + batch_id;
 
  409         in2 = input[offset + ss * 
stride];
 
  412       NumericT arg = group * sign * NUM_PI / ss;
 
  422       tmp.x = in2.x * ex.x - in2.y * ex.y;
 
  423       tmp.y = in2.x * ex.y + in2.y * ex.x;
 
  426         input[offset + ss] = in1 - tmp;  
 
  428         input[offset + ss * 
stride] = in1 - tmp;  
 
  429       input[offset] = in1 + tmp;  
 
  441 template<
typename NumericT, 
unsigned int AlignmentV>
 
  452     fft_radix2_local<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(in)),
 
  453                                   static_cast<unsigned int>(bit_size),
 
  454                                   static_cast<unsigned int>(
size),
 
  455                                   static_cast<unsigned int>(stride),
 
  456                                   static_cast<unsigned int>(batch_num),
 
  457                                   static_cast<NumericT>(
sign),
 
  458                                   static_cast<bool>(data_order));
 
  464                              static_cast<unsigned int>(bit_size),
 
  465                              static_cast<unsigned int>(
size),
 
  466                              static_cast<unsigned int>(stride),
 
  467                              static_cast<unsigned int>(batch_num),
 
  468                              static_cast<bool>(data_order));
 
  474                               static_cast<unsigned int>(
step),
 
  475                               static_cast<unsigned int>(bit_size),
 
  476                               static_cast<unsigned int>(size),
 
  477                               static_cast<unsigned int>(
stride),
 
  478                               static_cast<unsigned int>(batch_num),
 
  480                               static_cast<bool>(data_order));
 
  493 template<
typename NumericT, 
unsigned int AlignmentV>
 
  504     fft_radix2_local<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(in)),
 
  505                                   static_cast<unsigned int>(bit_size),
 
  506                                   static_cast<unsigned int>(
size),
 
  507                                   static_cast<unsigned int>(stride),
 
  508                                   static_cast<unsigned int>(batch_num),
 
  510                                   static_cast<bool>(data_order));
 
  516                              static_cast<unsigned int>(bit_size),
 
  517                              static_cast<unsigned int>(
size),
 
  518                              static_cast<unsigned int>(stride),
 
  519                              static_cast<unsigned int>(batch_num),
 
  520                              static_cast<bool>(data_order));
 
  525                               static_cast<unsigned int>(
step),
 
  526                               static_cast<unsigned int>(bit_size),
 
  527                               static_cast<unsigned int>(size),
 
  528                               static_cast<unsigned int>(
stride),
 
  529                               static_cast<unsigned int>(batch_num),
 
  531                               static_cast<bool>(data_order));
 
  537 template<
typename Numeric2T, 
typename NumericT>
 
  540   unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
 
  541   unsigned int glb_sz =gridDim.x * blockDim.x;
 
  543   unsigned int double_size = size << 1;
 
  545   const NumericT NUM_PI(3.14159265358979323846);
 
  547   for (
unsigned int i = glb_id; i < 
size; i += glb_sz)
 
  549     unsigned int rm = i * i % (double_size);
 
  558     out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
 
  559     out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
 
  563 template<
typename Numeric2T, 
typename NumericT>
 
  564 __global__ 
void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
 
  565                               unsigned int size, 
unsigned int ext_size, 
NumericT sign)
 
  567   unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
 
  568   unsigned int glb_sz = gridDim.x * blockDim.x;
 
  570   unsigned int double_size = size << 1;
 
  573   const NumericT NUM_PI(3.14159265358979323846);
 
  575   for (
unsigned int i = glb_id; i < 
size; i += glb_sz)
 
  577     unsigned int rm = i * i % (double_size);
 
  591     A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
 
  592     A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
 
  597     B[ext_size - i] = b_i;
 
  601 template<
typename NumericT>
 
  604   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
 
  621 template<
typename NumericT, 
unsigned int AlignmentV>
 
  636                      static_cast<unsigned int>(ext_size));
 
  642                              static_cast<unsigned int>(size),
 
  643                              static_cast<unsigned int>(ext_size),
 
  651                               static_cast<unsigned int>(size),
 
  656 template<
typename NumericT>
 
  662   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
 
  666     output[i] = in1 * in2;
 
  673 template<
typename NumericT, 
unsigned int AlignmentV>
 
  682   fft_mult_vec<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(input1)),
 
  685                             static_cast<unsigned int>(size));
 
  689 template<
typename Numeric2T, 
typename NumericT>
 
  692   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
 
  693     input1[i] = input1[i]/factor;
 
  699 template<
typename NumericT, 
unsigned int AlignmentV>
 
  706   fft_div_vec_scalar<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(input)),
 
  707                                   static_cast<unsigned int>(size),
 
  712 template<
typename NumericT>
 
  715                           unsigned int row_num,
 
  716                           unsigned int col_num)
 
  718   unsigned int size = row_num * col_num;
 
  719   for (
unsigned int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
 
  721     unsigned int row = i / col_num;
 
  722     unsigned int col = i - row*col_num;
 
  723     unsigned int new_pos = col * row_num + 
row;
 
  724     output[new_pos] = input[i];
 
  731 template<
typename NumericT, 
unsigned int AlignmentV>
 
  737   transpose<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(input)),
 
  745 template<
typename NumericT>
 
  748     unsigned int row_num,
 
  749     unsigned int col_num)
 
  751   unsigned int size = row_num * col_num;
 
  752   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
 
  754     unsigned int row = i / col_num;
 
  755     unsigned int col = i - row*col_num;
 
  756     unsigned int new_pos = col * row_num + 
row;
 
  760       input[i] = input[new_pos];
 
  761       input[new_pos] = val;
 
  769 template<
typename NumericT, 
unsigned int AlignmentV>
 
  774   transpose_inplace<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(input)),
 
  781 template<
typename RealT,
typename ComplexT>
 
  784   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
 
  796 template<
typename NumericT>
 
  804                                static_cast<unsigned int>(size));
 
  808 template<
typename ComplexT,
typename RealT>
 
  811   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
 
  818 template<
typename NumericT>
 
  824   complex_to_real<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(in)),
 
  826                                static_cast<unsigned int>(
size));
 
  831 template<
typename NumericT>
 
  834   for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x)
 
  839     vec[size - i - 1] = val1;
 
  846 template<
typename NumericT>
 
Helper class for checking whether a matrix has a row-major layout. 
Implementation of the dense matrix class. 
__global__ void bluestein_post(Numeric2T *Z, Numeric2T *out, unsigned int size, NumericT sign)
__global__ void real_to_complex(const RealT *in, ComplexT *out, unsigned int size)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
__global__ void fft_direct(const Numeric2T *input, Numeric2T *output, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
This file provides the forward declarations for the main types used within ViennaCL. 
__global__ void fft_mult_vec(const NumericT *input1, const NumericT *input2, NumericT *output, unsigned int size)
__host__ __device__ float2 operator+(float2 a, float2 b)
const vcl_size_t MAX_LOCAL_POINTS_NUM
vcl_size_t num_bits(vcl_size_t size)
__global__ void reverse_inplace(NumericT *vec, unsigned int size)
__global__ void transpose(const NumericT *input, NumericT *output, unsigned int row_num, unsigned int col_num)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
void radix2(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 1D algorithm for computing Fourier transformation. 
void bluestein(viennacl::vector< NumericT, AlignmentV > &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t)
Bluestein's algorithm for computing Fourier transformation. 
__global__ void fft_div_vec_scalar(Numeric2T *input1, unsigned int size, NumericT factor)
__global__ void fft_reorder(NumericT *input, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, bool is_row_major)
__device__ float2 operator*(float2 in1, float2 in2)
__device__ float2 operator/(float2 a, SCALARTYPE b)
void normalize(viennacl::vector< NumericT, AlignmentV > &input)
Normalize vector on with his own size. 
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void convolve_i(viennacl::vector< SCALARTYPE, ALIGNMENT > &input1, viennacl::vector< SCALARTYPE, ALIGNMENT > &input2, viennacl::vector< SCALARTYPE, ALIGNMENT > &output)
__global__ void zero2(NumericT *input1, NumericT *input2, unsigned int size)
__global__ void fft_radix2(Numeric2T *input, unsigned int s, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
Common routines for CUDA execution. 
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
__device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
size_type size() const 
Returns the length of the vector (cf. std::vector) 
__global__ void bluestein_pre(Numeric2T *input, Numeric2T *A, Numeric2T *B, unsigned int size, unsigned int ext_size, NumericT sign)
Implementations of Fast Furier Transformation using a plain single-threaded or OpenMP-enabled executi...
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
__global__ void transpose_inplace(NumericT *input, unsigned int row_num, unsigned int col_num)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. 
void reverse(viennacl::vector_base< NumericT > &in)
Reverse vector to oposite order and save it in input vector. 
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
void multiply_complex(viennacl::vector< NumericT, AlignmentV > const &input1, viennacl::vector< NumericT, AlignmentV > const &input2, viennacl::vector< NumericT, AlignmentV > &output)
Mutiply two complex vectors and store result in output. 
__global__ void complex_to_real(const ComplexT *in, RealT *out, unsigned int size)
__host__ __device__ float2 operator-(float2 a, float2 b)
void reorder(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
void direct(viennacl::vector< NumericT, AlignmentV > const &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct 1D algorithm for computing Fourier transformation. 
vcl_size_t next_power_2(vcl_size_t n)
__global__ void fft_radix2_local(Numeric2T *input, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
Implementation of the ViennaCL scalar class. 
ScalarType fft(std::vector< ScalarType > &in, std::vector< ScalarType > &out, unsigned int, unsigned int, unsigned int batch_size)
SCALARTYPE sign(SCALARTYPE val)
Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU...