1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP 
   38 template <
typename StringType>
 
   41   source.append(
"__kernel void scan_1(__global "); source.append(numeric_string); source.append(
"* X, \n");
 
   42   source.append(
"                     unsigned int startX, \n");
 
   43   source.append(
"                     unsigned int incX, \n");
 
   44   source.append(
"                     unsigned int sizeX, \n");
 
   46   source.append(
"                     __global "); source.append(numeric_string); source.append(
"* Y, \n");
 
   47   source.append(
"                     unsigned int startY, \n");
 
   48   source.append(
"                     unsigned int incY, \n");
 
   50   source.append(
"                     unsigned int scan_offset, \n"); 
 
   51   source.append(
"                     __global "); source.append(numeric_string); source.append(
"* carries) { \n");
 
   53   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_buffer[256]; \n");
 
   54   source.append(
"  "); source.append(numeric_string); source.append(
" my_value; \n");
 
   56   source.append(
"  unsigned int work_per_thread = (sizeX - 1) / get_global_size(0) + 1; \n");
 
   57   source.append(
"  unsigned int block_start = work_per_thread * get_local_size(0) *  get_group_id(0); \n");
 
   58   source.append(
"  unsigned int block_stop  = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
 
   59   source.append(
"  unsigned int block_offset = 0; \n");
 
   62   source.append(
"  for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) { \n");
 
   65   source.append(
"    my_value = (i < sizeX) ? X[i * incX + startX] : 0; \n");
 
   68   source.append(
"    for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
 
   69   source.append(
"       barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
   70   source.append(
"       shared_buffer[get_local_id(0)] = my_value;   \n");
 
   71   source.append(
"       barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
   72   source.append(
"       if (get_local_id(0) >= stride)   \n");
 
   73   source.append(
"         my_value += shared_buffer[get_local_id(0) - stride];   \n");
 
   74   source.append(
"    } \n");
 
   75   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
   76   source.append(
"    shared_buffer[get_local_id(0)] = my_value;   \n");
 
   77   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
   80   source.append(
"    if (scan_offset > 0) \n");
 
   81   source.append(
"      my_value = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
 
   83   source.append(
"    if (i < sizeX) \n");
 
   84   source.append(
"      Y[i * incY + startY] = block_offset + my_value; \n");
 
   86   source.append(
"    block_offset += shared_buffer[get_local_size(0)-1]; \n");
 
   87   source.append(
"  } \n");
 
   90   source.append(
"  if (get_local_id(0) == 0) carries[get_group_id(0)] = block_offset; \n");
 
   92   source.append(
"} \n");
 
   95 template <
typename StringType>
 
   98   source.append(
"__kernel void scan_2(__global "); source.append(numeric_string); source.append(
"* carries) { \n");
 
  100   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_buffer[256]; \n");       
 
  103   source.append(
"  "); source.append(numeric_string); source.append(
" my_carry = carries[get_local_id(0)]; \n");
 
  106   source.append(
"  for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
 
  107   source.append(
"     barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
  108   source.append(
"     shared_buffer[get_local_id(0)] = my_carry;   \n");
 
  109   source.append(
"     barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
  110   source.append(
"     if (get_local_id(0) >= stride)   \n");
 
  111   source.append(
"       my_carry += shared_buffer[get_local_id(0) - stride];   \n");
 
  112   source.append(
"  } \n");
 
  113   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
  114   source.append(
"  shared_buffer[get_local_id(0)] = my_carry;   \n");
 
  115   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
  118   source.append(
"  carries[get_local_id(0)] = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0;  \n");
 
  120   source.append(
"} \n");
 
  123 template <
typename StringType>
 
  126   source.append(
"__kernel void scan_3(__global "); source.append(numeric_string); source.append(
" * Y, \n");
 
  127   source.append(
"                     unsigned int startY, \n");
 
  128   source.append(
"                     unsigned int incY, \n");
 
  129   source.append(
"                     unsigned int sizeY, \n");
 
  131   source.append(
"                     __global "); source.append(numeric_string); source.append(
"* carries) { \n");
 
  133   source.append(
"  unsigned int work_per_thread = (sizeY - 1) / get_global_size(0) + 1; \n");
 
  134   source.append(
"  unsigned int block_start = work_per_thread * get_local_size(0) *  get_group_id(0); \n");
 
  135   source.append(
"  unsigned int block_stop  = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
 
  137   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_offset; \n");
 
  139   source.append(
"  if (get_local_id(0) == 0) shared_offset = carries[get_group_id(0)]; \n");
 
  140   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE);   \n");
 
  142   source.append(
"  for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) \n");
 
  143   source.append(
"    if (i < sizeY) \n");
 
  144   source.append(
"      Y[i * incY + startY] += shared_offset; \n");
 
  146   source.append(
"} \n");
 
  154 template<
typename NumericT>
 
  164     static std::map<cl_context, bool> init_done;
 
  171       source.reserve(1024);
 
  173       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  180       #ifdef VIENNACL_BUILD_INFO 
  181       std::cout << 
"Creating program " << prog_name << std::endl;
 
  183       ctx.add_program(source, prog_name);
 
  184       init_done[ctx.handle().get()] = 
true;
 
void generate_scan_kernel_3(StringType &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities. 
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices...
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const 
static std::string program_name()
Representation of an OpenCL kernel in ViennaCL. 
void generate_scan_kernel_1(StringType &source, std::string const &numeric_string)
void generate_scan_kernel_2(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.