1 #ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_ 
    2 #define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_ 
   47     namespace FFT_DATA_ORDER
 
   86       v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
 
   87       v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
 
   88       v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
 
   89       v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
 
   90       v = (v >> 16) | (v << 16);
 
   91       v = v >> (32 - bit_size);
 
   95     template<
typename NumericT, 
unsigned int AlignmentV>
 
   99 #ifdef VIENNACL_WITH_OPENMP 
  100       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  102       for (
long i2 = 0; i2 < long(size * 2); i2 += 2)
 
  105         input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
 
  109     template<
typename NumericT>
 
  113 #ifdef VIENNACL_WITH_OPENMP 
  114       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  116       for (
long i2 = 0; i2 < long(size * 2); i2 += 2)
 
  119         input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
 
  123     template<
typename NumericT, 
unsigned int AlignmentV>
 
  127 #ifdef VIENNACL_WITH_OPENMP 
  128       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  130       for (
long i2 = 0; i2 < long(size); i2++)
 
  133         in(i * 2)     = 
static_cast<NumericT>(std::real(input_complex[i]));
 
  134         in(i * 2 + 1) = 
static_cast<NumericT>(std::imag(input_complex[i]));
 
  138     template<
typename NumericT>
 
  142 #ifdef VIENNACL_WITH_OPENMP 
  143       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  145       for (
long i2 = 0; i2 < long(size * 2); i2 += 2)
 
  148         input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
 
  152     template<
typename NumericT>
 
  155 #ifdef VIENNACL_WITH_OPENMP 
  156       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  158       for (
long i2 = 0; i2 < long(size); i2++)
 
  161         in[i * 2]     = 
static_cast<NumericT>(std::real(input_complex[i]));
 
  162         in[i * 2 + 1] = 
static_cast<NumericT>(std::imag(input_complex[i]));
 
  166     template<
typename NumericT>
 
  170       std::vector<NumericT> temp(2 * size);
 
  171 #ifdef VIENNACL_WITH_OPENMP 
  172       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  174       for (
long i2 = 0; i2 < long(size); i2++)
 
  177         temp[i * 2]     = 
static_cast<NumericT>(std::real(input_complex[i]));
 
  178         temp[i * 2 + 1] = 
static_cast<NumericT>(std::imag(input_complex[i]));
 
  183     template<
typename NumericT>
 
  186 #ifdef VIENNACL_WITH_OPENMP 
  187       #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  189       for (
long i2 = 0; i2 < long(size); i2++)
 
  204 template<
typename NumericT>
 
  205 void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
 
  210 #ifdef VIENNACL_WITH_OPENMP 
  213   for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
 
  218       std::complex<NumericT> f = 0;
 
  221         std::complex<NumericT> input;
 
  223           input = input_complex[batch_id * stride + n]; 
 
  225           input = input_complex[n * stride + batch_id];
 
  230         std::complex<NumericT> ex(cs, sn);
 
  231         std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
 
  232                                    input.real() * ex.imag() + input.imag() * ex.real());
 
  236         output[batch_id * stride + k] = f;   
 
  238         output[k * stride + batch_id] = f;
 
  250 template<
typename NumericT, 
unsigned int AlignmentV>
 
  257   std::vector<std::complex<NumericT> > input_complex(size * batch_num);
 
  258   std::vector<std::complex<NumericT> > output(size * batch_num);
 
  262   fft_direct(&input_complex[0], &output[0], size, stride, batch_num, 
sign, data_order);
 
  273 template<
typename NumericT, 
unsigned int AlignmentV>
 
  284   std::vector<std::complex<NumericT> > input_complex(size_mat);
 
  285   std::vector<std::complex<NumericT> > output(size_mat);
 
  287   NumericT const * data_A = detail::extract_raw_pointer<NumericT>(in);
 
  288   NumericT       * data_B = detail::extract_raw_pointer<NumericT>(out);
 
  292   fft_direct(&input_complex[0], &output[0], size, stride, batch_num, 
sign, data_order);
 
  301 template<
typename NumericT, 
unsigned int AlignmentV>
 
  306   std::vector<std::complex<NumericT> > input(size * batch_num);
 
  308 #ifdef VIENNACL_WITH_OPENMP 
  309   #pragma omp parallel for 
  311   for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
 
  321           std::complex<NumericT> tmp   = input[batch_id * stride + i]; 
 
  322           input[batch_id * stride + i] = input[batch_id * stride + v]; 
 
  323           input[batch_id * stride + v] = tmp;      
 
  327           std::complex<NumericT> tmp   = input[i * stride + batch_id]; 
 
  328           input[i * stride + batch_id] = input[v * stride + batch_id]; 
 
  329           input[v * stride + batch_id] = tmp;      
 
  341 template<
typename NumericT, 
unsigned int AlignmentV>
 
  347   NumericT * data = detail::extract_raw_pointer<NumericT>(in);
 
  352   std::vector<std::complex<NumericT> > input(size_mat);
 
  356 #ifdef VIENNACL_WITH_OPENMP 
  357   #pragma omp parallel for 
  359   for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
 
  369           std::complex<NumericT> tmp   = input[batch_id * stride + i]; 
 
  370           input[batch_id * stride + i] = input[batch_id * stride + v]; 
 
  371           input[batch_id * stride + v] = tmp;      
 
  374           std::complex<NumericT> tmp   = input[i * stride + batch_id]; 
 
  375           input[i * stride + batch_id] = input[v * stride + batch_id]; 
 
  376           input[v * stride + batch_id] = tmp;      
 
  388 template<
typename NumericT>
 
  400 #ifdef VIENNACL_WITH_OPENMP 
  401     #pragma omp parallel for 
  403     for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
 
  406       for (
vcl_size_t tid = 0; tid < half_size; tid++)
 
  410         std::complex<NumericT> in1;
 
  411         std::complex<NumericT> in2;
 
  415           offset = batch_id * stride + pos;
 
  416           in1 = input_complex[offset];
 
  417           in2 = input_complex[offset + ss];
 
  421           offset = pos * stride + batch_id;
 
  422           in1 = input_complex[offset];
 
  423           in2 = input_complex[offset + ss * 
stride];
 
  428         std::complex<NumericT> ex(cs, sn);
 
  429         std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
 
  430                                    in2.real() * ex.imag() + in2.imag() * ex.real());
 
  432           input_complex[offset + ss] = in1 - tmp;
 
  434           input_complex[offset + ss * 
stride] = in1 - tmp;
 
  435         input_complex[offset] = in1 + tmp;
 
  446 template<
typename NumericT>
 
  454   for (
vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++)
 
  456 #ifdef VIENNACL_WITH_OPENMP 
  457     #pragma omp parallel for 
  459     for (
long p2 = 0; p2 < long(size); p2 += 1)
 
  465         lcl_input[v] = input_complex[batch_id * stride + p]; 
 
  467         lcl_input[v] = input_complex[p * stride + batch_id];
 
  473 #ifdef VIENNACL_WITH_OPENMP 
  474       #pragma omp parallel for 
  476       for (
long tid2 = 0; tid2 < long(size)/2; tid2++)
 
  480         vcl_size_t pos = ((tid >> s) << (s + 1)) + group;
 
  482         std::complex<NumericT> in1 = lcl_input[pos];
 
  483         std::complex<NumericT> in2 = lcl_input[pos + ss];
 
  489         std::complex<NumericT> ex(cs, sn);
 
  491         std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
 
  492                                    in2.real() * ex.imag() + in2.imag() * ex.real());
 
  494         lcl_input[pos + ss] = in1 - tmp;
 
  495         lcl_input[pos] = in1 + tmp;
 
  499 #ifdef VIENNACL_WITH_OPENMP 
  500     #pragma omp parallel for 
  503     for (
long p2 = 0; p2 < long(size); p2 += 1)
 
  507         input_complex[batch_id * stride + p] = lcl_input[p];
 
  509         input_complex[p * stride + batch_id] = lcl_input[p];
 
  524 template<
typename NumericT, 
unsigned int AlignmentV>
 
  532   std::vector<std::complex<NumericT> > input_complex(size * batch_num);
 
  533   std::vector<std::complex<NumericT> > lcl_input(size * batch_num);
 
  542     viennacl::linalg::host_based::reorder<NumericT>(in, 
size, 
stride, bit_size, batch_num, data_order);
 
  557 template<
typename NumericT, 
unsigned int AlignmentV>
 
  565   NumericT * data = detail::extract_raw_pointer<NumericT>(in);
 
  571   std::vector<std::complex<NumericT> > input_complex(size_mat);
 
  577     std::vector<std::complex<NumericT> > lcl_input(size_mat);
 
  582     viennacl::linalg::host_based::reorder<NumericT>(in, 
size, 
stride, bit_size, batch_num, data_order);
 
  598 template<
typename NumericT, 
unsigned int AlignmentV>
 
  609   std::vector<std::complex<NumericT> > input_complex(size);
 
  610   std::vector<std::complex<NumericT> > output_complex(size);
 
  612   std::vector<std::complex<NumericT> > A_complex(ext_size);
 
  613   std::vector<std::complex<NumericT> > B_complex(ext_size);
 
  614   std::vector<std::complex<NumericT> > Z_complex(ext_size);
 
  617 #ifdef VIENNACL_WITH_OPENMP 
  618   #pragma omp parallel for 
  620   for (
long i2 = 0; i2 < long(ext_size); i2++)
 
  630 #ifdef VIENNACL_WITH_OPENMP 
  631   #pragma omp parallel for 
  633   for (
long i2 = 0; i2 < long(size); i2++)
 
  642     std::complex<NumericT> a_i(cs_a, sn_a);
 
  643     std::complex<NumericT> b_i(cs_a, -sn_a);
 
  645     A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
 
  646                                           input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());
 
  651       B_complex[ext_size - i] = b_i;
 
  662 #ifdef VIENNACL_WITH_OPENMP 
  663   #pragma omp parallel for 
  665   for (
long i2 = 0; i2 < long(size); i2++)
 
  672     std::complex<NumericT> b_i(cs_a, sn_a);
 
  673     output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
 
  674                                                Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());
 
  683 template<
typename NumericT, 
unsigned int AlignmentV>
 
  689     input[i] /= norm_factor;
 
  696 template<
typename NumericT, 
unsigned int AlignmentV>
 
  703   std::vector<std::complex<NumericT> > input1_complex(size);
 
  704   std::vector<std::complex<NumericT> > input2_complex(size);
 
  705   std::vector<std::complex<NumericT> > output_complex(size);
 
  709 #ifdef VIENNACL_WITH_OPENMP 
  710   #pragma omp parallel for 
  712   for (
long i2 = 0; i2 < long(size); i2++)
 
  715     std::complex<NumericT> in1 = input1_complex[i];
 
  716     std::complex<NumericT> in2 = input2_complex[i];
 
  717     output_complex[i] = std::complex<NumericT>(in1.real() * in2.real() - in1.imag() * in2.imag(),
 
  718                                                in1.real() * in2.imag() + in1.imag() * in2.real());
 
  726 template<
typename NumericT, 
unsigned int AlignmentV>
 
  734   NumericT * data = detail::extract_raw_pointer<NumericT>(input);
 
  736   std::vector<std::complex<NumericT> > input_complex(size);
 
  739 #ifdef VIENNACL_WITH_OPENMP 
  740   #pragma omp parallel for 
  742   for (
long i2 = 0; i2 < long(size); i2++)
 
  751       std::complex<NumericT> val = input_complex[i];
 
  752       input_complex[i] = input_complex[new_pos];
 
  753       input_complex[new_pos] = val;
 
  763 template<
typename NumericT, 
unsigned int AlignmentV>
 
  772   NumericT const * data_A = detail::extract_raw_pointer<NumericT>(input);
 
  773   NumericT       * data_B = detail::extract_raw_pointer<NumericT>(output);
 
  775   std::vector<std::complex<NumericT> > input_complex(size);
 
  778   std::vector<std::complex<NumericT> > output_complex(size);
 
  779 #ifdef VIENNACL_WITH_OPENMP 
  780   #pragma omp parallel for 
  782   for (
long i2 = 0; i2 < long(size); i2++)
 
  788     output_complex[new_pos] = input_complex[i];
 
  796 template<
typename NumericT>
 
  800   NumericT const * data_in  = detail::extract_raw_pointer<NumericT>(in);
 
  801   NumericT       * data_out = detail::extract_raw_pointer<NumericT>(out);
 
  803 #ifdef VIENNACL_WITH_OPENMP 
  804   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  806   for (
long i2 = 0; i2 < long(size); i2++)
 
  809     data_out[2*i  ] = data_in[i];
 
  817 template<
typename NumericT>
 
  821   NumericT const * data_in  = detail::extract_raw_pointer<NumericT>(in);
 
  822   NumericT       * data_out = detail::extract_raw_pointer<NumericT>(out);
 
  824 #ifdef VIENNACL_WITH_OPENMP 
  825 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  827   for (
long i = 0; i < long(size); i++)
 
  828     data_out[i] = data_in[2*i];
 
  834 template<
typename NumericT>
 
  839 #ifdef VIENNACL_WITH_OPENMP 
  840   #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) 
  842   for (
long i2 = 0; i2 < long(size); i2++)
 
  848     in[size - i - 1] = val1;
 
void fft_radix2_local(std::complex< NumericT > *input_complex, std::complex< NumericT > *lcl_input, vcl_size_t batch_num, vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 algorithm for computing Fourier transformation. Kernel for computing bigger amount of data...
Implementation of the dense matrix class. 
void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
void reverse(viennacl::vector_base< NumericT > &in)
Reverse vector to opposite order and save it in input vector. 
void radix2(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 1D algorithm for computing Fourier transformation. 
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
void real_to_complex(viennacl::vector_base< NumericT > const &in, viennacl::vector_base< NumericT > &out, vcl_size_t size)
Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imagina...
void multiply_complex(viennacl::vector< NumericT, AlignmentV > const &input1, viennacl::vector< NumericT, AlignmentV > const &input2, viennacl::vector< NumericT, AlignmentV > &output)
Complex multiplikation of two vectors. 
void fft_direct(std::complex< NumericT > *input_complex, std::complex< NumericT > *output, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct algoritm kenrnel. 
vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
void copy_to_complex_array(std::complex< NumericT > *input_complex, viennacl::vector< NumericT, AlignmentV > const &in, vcl_size_t size)
void bluestein(viennacl::vector< NumericT, AlignmentV > &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t)
Bluestein's algorithm for computing Fourier transformation. 
void direct(viennacl::vector< NumericT, AlignmentV > const &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct 1D algorithm for computing Fourier transformation. 
void copy_to_vector(std::complex< NumericT > *input_complex, viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void convolve_i(viennacl::vector< SCALARTYPE, ALIGNMENT > &input1, viennacl::vector< SCALARTYPE, ALIGNMENT > &input2, viennacl::vector< SCALARTYPE, ALIGNMENT > &output)
vcl_size_t next_power_2(vcl_size_t n)
void transpose(viennacl::matrix< NumericT, viennacl::row_major, AlignmentV > &input)
Inplace transpose of matrix. 
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
size_type size() const 
Returns the length of the vector (cf. std::vector) 
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
vcl_size_t num_bits(vcl_size_t size)
void fft_radix2(std::complex< NumericT > *input_complex, vcl_size_t batch_num, vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 algorithm for computing Fourier transformation. Kernel for computing smaller amount of data...
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
void normalize(viennacl::vector< NumericT, AlignmentV > &input)
Normalize vector with his own size. 
const vcl_size_t MAX_LOCAL_POINTS_NUM
void complex_to_real(viennacl::vector_base< NumericT > const &in, viennacl::vector_base< NumericT > &out, vcl_size_t size)
Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imagina...
void reorder(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
ScalarType fft(std::vector< ScalarType > &in, std::vector< ScalarType > &out, unsigned int, unsigned int, unsigned int batch_size)
SCALARTYPE sign(SCALARTYPE val)
Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU...