1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP 
   66 template <
typename StringType>
 
   69   source.append(
"    for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
 
   72     source.append(
"      vec1[i*size1.y+size1.x] "); source.append(cfg.
assign_op); source.append(
" vec2[i*size2.y+size2.x] ");
 
   74       source.append(
"* alpha ");
 
   76       source.append(
"/ alpha ");
 
   79       source.append(
"+ vec3[i*size3.y+size3.x] ");
 
   81         source.append(
"* beta");
 
   83         source.append(
"/ beta");
 
   88     source.append(
"    vec1[i] "); source.append(cfg.
assign_op); source.append(
" vec2[i] ");
 
   90       source.append(
"* alpha ");
 
   92       source.append(
"/ alpha ");
 
   95       source.append(
"+ vec3[i] ");
 
   97         source.append(
"* beta");
 
   99         source.append(
"/ beta");
 
  102   source.append(
"; \n");
 
  105 template <
typename StringType>
 
  108   source.append(
"__kernel void av");
 
  115     source.append(
"_cpu");
 
  117     source.append(
"_gpu");
 
  120     source.append(
"_cpu");
 
  122     source.append(
"_gpu");
 
  123   source.append(
"( \n");
 
  124   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  125   source.append(
"  uint4 size1, \n");
 
  126   source.append(
" \n");
 
  129     source.append(
"  "); source.append(numeric_string); source.append(
" fac2, \n");
 
  133     source.append(
"  __global "); source.append(numeric_string); source.append(
" * fac2, \n");
 
  135   source.append(
"  unsigned int options2, \n");  
 
  136   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
 
  137   source.append(
"  uint4 size2");
 
  141     source.append(
", \n\n");
 
  144       source.append(
"  "); source.append(numeric_string); source.append(
" fac3, \n");
 
  148       source.append(
"  __global "); source.append(numeric_string); source.append(
" * fac3, \n");
 
  150     source.append(
"  unsigned int options3, \n");  
 
  151     source.append(
"  __global const "); source.append(numeric_string); source.append(
" * vec3, \n");
 
  152     source.append(
"  uint4 size3 \n");
 
  154   source.append(
") { \n");
 
  158     source.append(
"  "); source.append(numeric_string); source.append(
" alpha = fac2; \n");
 
  162     source.append(
"  "); source.append(numeric_string); source.append(
" alpha = fac2[0]; \n");
 
  164   source.append(
"  if (options2 & (1 << 0)) \n");
 
  165   source.append(
"    alpha = -alpha; \n");
 
  166   source.append(
" \n");
 
  170     source.append(
"  "); source.append(numeric_string); source.append(
" beta = fac3; \n");
 
  174     source.append(
"  "); source.append(numeric_string); source.append(
" beta = fac3[0]; \n");
 
  178     source.append(
"  if (options3 & (1 << 0)) \n");
 
  179     source.append(
"    beta = -beta; \n");
 
  180     source.append(
" \n");
 
  182   source.append(
"  if (options2 & (1 << 1)) { \n");
 
  185     source.append(
"    if (options3 & (1 << 1)) {\n");
 
  187     source.append(
"    } else {\n");
 
  189     source.append(
"    } \n");
 
  193   source.append(
"  } else { \n");
 
  196     source.append(
"    if (options3 & (1 << 1)) {\n");
 
  198     source.append(
"    } else {\n");
 
  200     source.append(
"    } \n");
 
  204   source.append(
"  } \n");
 
  205   source.append(
"} \n");
 
  208 template <
typename StringType>
 
  234 template <
typename StringType>
 
  237   source.append(
"__kernel void plane_rotation( \n");
 
  238   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  239   source.append(
"          unsigned int start1, \n");
 
  240   source.append(
"          unsigned int inc1, \n");
 
  241   source.append(
"          unsigned int size1, \n");
 
  242   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec2, \n");
 
  243   source.append(
"          unsigned int start2, \n");
 
  244   source.append(
"          unsigned int inc2, \n");
 
  245   source.append(
"          unsigned int size2, \n");
 
  246   source.append(
"          "); source.append(numeric_string); source.append(
" alpha, \n");
 
  247   source.append(
"          "); source.append(numeric_string); source.append(
" beta) \n");
 
  248   source.append(
"{ \n");
 
  249   source.append(
"  "); source.append(numeric_string); source.append(
" tmp1 = 0; \n");
 
  250   source.append(
"  "); source.append(numeric_string); source.append(
" tmp2 = 0; \n");
 
  251   source.append(
" \n");
 
  252   source.append(
"  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
 
  253   source.append(
" { \n");
 
  254   source.append(
"    tmp1 = vec1[i*inc1+start1]; \n");
 
  255   source.append(
"    tmp2 = vec2[i*inc2+start2]; \n");
 
  256   source.append(
" \n");
 
  257   source.append(
"    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
 
  258   source.append(
"    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
 
  259   source.append(
"  } \n");
 
  260   source.append(
" \n");
 
  261   source.append(
"} \n");
 
  264 template <
typename StringType>
 
  267   source.append(
"__kernel void swap( \n");
 
  268   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  269   source.append(
"          unsigned int start1, \n");
 
  270   source.append(
"          unsigned int inc1, \n");
 
  271   source.append(
"          unsigned int size1, \n");
 
  272   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec2, \n");
 
  273   source.append(
"          unsigned int start2, \n");
 
  274   source.append(
"          unsigned int inc2, \n");
 
  275   source.append(
"          unsigned int size2 \n");
 
  276   source.append(
"          ) \n");
 
  277   source.append(
"{ \n");
 
  278   source.append(
"  "); source.append(numeric_string); source.append(
" tmp; \n");
 
  279   source.append(
"  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
 
  280   source.append(
"  { \n");
 
  281   source.append(
"    tmp = vec2[i*inc2+start2]; \n");
 
  282   source.append(
"    vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
 
  283   source.append(
"    vec1[i*inc1+start1] = tmp; \n");
 
  284   source.append(
"  } \n");
 
  285   source.append(
"} \n");
 
  288 template <
typename StringType>
 
  291   source.append(
"__kernel void assign_cpu( \n");
 
  292   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  293   source.append(
"          unsigned int start1, \n");
 
  294   source.append(
"          unsigned int inc1, \n");
 
  295   source.append(
"          unsigned int size1, \n");
 
  296   source.append(
"          unsigned int internal_size1, \n");
 
  297   source.append(
"          "); source.append(numeric_string); source.append(
" alpha) \n");
 
  298   source.append(
"{ \n");
 
  299   source.append(
"  for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
 
  300   source.append(
"    vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
 
  301   source.append(
"} \n");
 
  305 template <
typename StringType>
 
  308   std::stringstream ss;
 
  310   std::string vector_num_string = ss.str();
 
  312   source.append(
"__kernel void inner_prod"); source.append(vector_num_string); source.append(
"( \n");
 
  313   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * x, \n");
 
  314   source.append(
"          uint4 params_x, \n");
 
  319     source.append(
"          __global const "); source.append(numeric_string); source.append(
" * y"); source.append(ss.str()); source.append(
", \n");
 
  320     source.append(
"          uint4 params_y"); source.append(ss.str()); source.append(
", \n");
 
  322   source.append(
"          __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
 
  323   source.append(
"          __global "); source.append(numeric_string); source.append(
" * group_buffer) \n");
 
  324   source.append(
"{ \n");
 
  325   source.append(
"  unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
 
  326   source.append(
"  unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
 
  327   source.append(
"  unsigned int vec_stop_index  = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
 
  334     source.append(
"  "); source.append(numeric_string); source.append(
" tmp"); source.append(ss.str()); source.append(
" = 0; \n");
 
  336   source.append(
"  for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
 
  337   source.append(
"    ");  source.append(numeric_string); source.append(
" val_x = x[i*params_x.y + params_x.x]; \n");
 
  342     source.append(
"    tmp"); source.append(ss.str()); source.append(
" += val_x * y"); source.append(ss.str()); source.append(
"[i * params_y"); source.append(ss.str()); source.append(
".y + params_y"); source.append(ss.str()); source.append(
".x]; \n");
 
  344   source.append(
"  } \n");
 
  349     source.append(
"  tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(
" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append(
"; \n");
 
  353   source.append(
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  354   source.append(
"  { \n");
 
  355   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  356   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  361     source.append(
"      tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(
" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(
" * get_local_size(0) + stride]; \n");
 
  363   source.append(
"    } \n");
 
  364   source.append(
"  } \n");
 
  365   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  367   source.append(
"  if (get_local_id(0) == 0) { \n");
 
  372     source.append(
"    group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(
" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(
" * get_local_size(0)]; \n");
 
  374   source.append(
"  } \n");
 
  375   source.append(
"} \n");
 
  379 template <
typename StringType>
 
  382   bool is_float_or_double = (numeric_string == 
"float" || numeric_string == 
"double");
 
  384   source.append(numeric_string); source.append(
" impl_norm( \n");
 
  385   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * vec, \n");
 
  386   source.append(
"          unsigned int start1, \n");
 
  387   source.append(
"          unsigned int inc1, \n");
 
  388   source.append(
"          unsigned int size1, \n");
 
  389   source.append(
"          unsigned int norm_selector, \n");
 
  390   source.append(
"          __local "); source.append(numeric_string); source.append(
" * tmp_buffer) \n");
 
  391   source.append(
"{ \n");
 
  392   source.append(
"  "); source.append(numeric_string); source.append(
" tmp = 0; \n");
 
  393   source.append(
"  if (norm_selector == 1) \n"); 
 
  394   source.append(
"  { \n");
 
  395   source.append(
"    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
 
  396   if (is_float_or_double)
 
  397     source.append(
"      tmp += fabs(vec[i*inc1 + start1]); \n");
 
  398   else if (numeric_string[0] == 
'u') 
 
  399     source.append(
"      tmp += vec[i*inc1 + start1]; \n");
 
  401     source.append(
"      tmp += abs(vec[i*inc1 + start1]); \n");
 
  402   source.append(
"  } \n");
 
  403   source.append(
"  else if (norm_selector == 2) \n"); 
 
  404   source.append(
"  { \n");
 
  405   source.append(
"    "); source.append(numeric_string); source.append(
" vec_entry = 0; \n");
 
  406   source.append(
"    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
 
  407   source.append(
"    { \n");
 
  408   source.append(
"      vec_entry = vec[i*inc1 + start1]; \n");
 
  409   source.append(
"      tmp += vec_entry * vec_entry; \n");
 
  410   source.append(
"    } \n");
 
  411   source.append(
"  } \n");
 
  412   source.append(
"  else if (norm_selector == 0) \n"); 
 
  413   source.append(
"  { \n");
 
  414   source.append(
"    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
 
  415   if (is_float_or_double)
 
  416     source.append(
"      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
 
  417   else if (numeric_string[0] == 
'u') 
 
  418     source.append(
"      tmp = max(vec[i*inc1 + start1], tmp); \n");
 
  421     source.append(
"      tmp = max(("); source.append(numeric_string); source.append(
")abs(vec[i*inc1 + start1]), tmp); \n");
 
  423   source.append(
"  } \n");
 
  425   source.append(
"  tmp_buffer[get_local_id(0)] = tmp; \n");
 
  427   source.append(
"  if (norm_selector > 0) \n"); 
 
  428   source.append(
"  { \n");
 
  429   source.append(
"    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  430   source.append(
"    { \n");
 
  431   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  432   source.append(
"      if (get_local_id(0) < stride) \n");
 
  433   source.append(
"        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
 
  434   source.append(
"    } \n");
 
  435   source.append(
"    return tmp_buffer[0]; \n");
 
  436   source.append(
"  } \n");
 
  439   source.append(
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  440   source.append(
"  { \n");
 
  441   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  442   source.append(
"    if (get_local_id(0) < stride) \n");
 
  443   if (is_float_or_double)
 
  444     source.append(
"      tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
 
  446     source.append(
"      tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
 
  447   source.append(
"  } \n");
 
  449   source.append(
"  return tmp_buffer[0]; \n");
 
  450   source.append(
"}; \n");
 
  452   source.append(
"__kernel void norm( \n");
 
  453   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * vec, \n");
 
  454   source.append(
"          unsigned int start1, \n");
 
  455   source.append(
"          unsigned int inc1, \n");
 
  456   source.append(
"          unsigned int size1, \n");
 
  457   source.append(
"          unsigned int norm_selector, \n");
 
  458   source.append(
"          __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
 
  459   source.append(
"          __global "); source.append(numeric_string); source.append(
" * group_buffer) \n");
 
  460   source.append(
"{ \n");
 
  461   source.append(
"  "); source.append(numeric_string); source.append(
" tmp = impl_norm(vec, \n");
 
  462   source.append(
"                        (        get_group_id(0)  * size1) / get_num_groups(0) * inc1 + start1, \n");
 
  463   source.append(
"                        inc1, \n");
 
  464   source.append(
"                        (   (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
 
  465   source.append(
"                      - (        get_group_id(0)  * size1) / get_num_groups(0), \n");
 
  466   source.append(
"                        norm_selector, \n");
 
  467   source.append(
"                        tmp_buffer); \n");
 
  469   source.append(
"  if (get_local_id(0) == 0) \n");
 
  470   source.append(
"    group_buffer[get_group_id(0)] = tmp; \n");
 
  471   source.append(
"} \n");
 
  475 template <
typename StringType>
 
  479   source.append(
"__kernel void sum_inner_prod( \n");
 
  480   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  481   source.append(
"          unsigned int size_per_workgroup, \n");
 
  482   source.append(
"          __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
 
  483   source.append(
"          __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  484   source.append(
"          unsigned int start_result, \n");
 
  485   source.append(
"          unsigned int inc_result) \n");
 
  486   source.append(
"{ \n");
 
  487   source.append(
"  "); source.append(numeric_string); source.append(
" thread_sum = 0; \n");
 
  488   source.append(
"  for (unsigned int i = get_local_id(0); i<size_per_workgroup; i += get_local_size(0)) \n");
 
  489   source.append(
"    thread_sum += vec1[size_per_workgroup * get_group_id(0) + i]; \n");
 
  491   source.append(
"  tmp_buffer[get_local_id(0)] = thread_sum; \n");
 
  493   source.append(
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  494   source.append(
"  { \n");
 
  495   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  496   source.append(
"    if (get_local_id(0) < stride) \n");
 
  497   source.append(
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
 
  498   source.append(
"  } \n");
 
  499   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  501   source.append(
"  if (get_local_id(0) == 0) \n");
 
  502   source.append(
"    result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
 
  503   source.append(
"} \n");
 
  507 template <
typename StringType>
 
  508 void generate_sum(StringType & source, std::string 
const & numeric_string)
 
  511   source.append(
"__kernel void sum( \n");
 
  512   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  513   source.append(
"          unsigned int start1, \n");
 
  514   source.append(
"          unsigned int inc1, \n");
 
  515   source.append(
"          unsigned int size1, \n");
 
  516   source.append(
"          unsigned int option,  \n"); 
 
  517   source.append(
"          __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
 
  518   source.append(
"          __global "); source.append(numeric_string); source.append(
" * result) \n");
 
  519   source.append(
"{ \n");
 
  520   source.append(
"  "); source.append(numeric_string); source.append(
" thread_sum = 0; \n");
 
  521   source.append(
"  "); source.append(numeric_string); source.append(
" tmp = 0; \n");
 
  522   source.append(
"  for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
 
  523   source.append(
"  { \n");
 
  524   source.append(
"    if (option > 0) \n");
 
  525   source.append(
"      thread_sum += vec1[i*inc1+start1]; \n");
 
  526   source.append(
"    else \n");
 
  527   source.append(
"    { \n");
 
  528   source.append(
"      tmp = vec1[i*inc1+start1]; \n");
 
  529   source.append(
"      tmp = (tmp < 0) ? -tmp : tmp; \n");
 
  530   source.append(
"      thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
 
  531   source.append(
"    } \n");
 
  532   source.append(
"  } \n");
 
  534   source.append(
"  tmp_buffer[get_local_id(0)] = thread_sum; \n");
 
  536   source.append(
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  537   source.append(
"  { \n");
 
  538   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  539   source.append(
"    if (get_local_id(0) < stride) \n");
 
  540   source.append(
"    { \n");
 
  541   source.append(
"      if (option > 0) \n");
 
  542   source.append(
"        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
 
  543   source.append(
"      else \n");
 
  544   source.append(
"        tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
 
  545   source.append(
"    } \n");
 
  546   source.append(
"  } \n");
 
  547   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  549   source.append(
"  if (get_global_id(0) == 0) \n");
 
  550   source.append(
"  { \n");
 
  551   if (numeric_string == 
"float" || numeric_string == 
"double")
 
  553     source.append(
"    if (option == 2) \n");
 
  554     source.append(
"      *result = sqrt(tmp_buffer[0]); \n");
 
  555     source.append(
"    else \n");
 
  557   source.append(
"      *result = tmp_buffer[0]; \n");
 
  558   source.append(
"  } \n");
 
  559   source.append(
"} \n");
 
  563 template <
typename StringType>
 
  567   source.append(
"unsigned int index_norm_inf_impl( \n");
 
  568   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * vec, \n");
 
  569   source.append(
"          unsigned int start1, \n");
 
  570   source.append(
"          unsigned int inc1, \n");
 
  571   source.append(
"          unsigned int size1, \n");
 
  572   source.append(
"          __local "); source.append(numeric_string); source.append(
" * entry_buffer, \n");
 
  573   source.append(
"          __local unsigned int * index_buffer) \n");
 
  574   source.append(
"{ \n");
 
  576   source.append(
"  "); source.append(numeric_string); source.append(
" cur_max = 0; \n");
 
  577   source.append(
"  "); source.append(numeric_string); source.append(
" tmp; \n");
 
  578   source.append(
"  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
 
  579   source.append(
"  { \n");
 
  580   if (numeric_string == 
"float" || numeric_string == 
"double")
 
  581     source.append(
"    tmp = fabs(vec[i*inc1+start1]); \n");
 
  582   else if (numeric_string[0] == 
'u') 
 
  583     source.append(
"    tmp = vec[i*inc1+start1]; \n");
 
  585     source.append(
"    tmp = abs(vec[i*inc1+start1]); \n");
 
  586   source.append(
"    if (cur_max < tmp) \n");
 
  587   source.append(
"    { \n");
 
  588   source.append(
"      entry_buffer[get_global_id(0)] = tmp; \n");
 
  589   source.append(
"      index_buffer[get_global_id(0)] = i; \n");
 
  590   source.append(
"      cur_max = tmp; \n");
 
  591   source.append(
"    } \n");
 
  592   source.append(
"  } \n");
 
  595   source.append(
"  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
 
  596   source.append(
"  { \n");
 
  597   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  598   source.append(
"    if (get_global_id(0) < stride) \n");
 
  599   source.append(
"   { \n");
 
  601   source.append(
"      if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
 
  602   source.append(
"      { \n");
 
  603   source.append(
"        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
 
  604   source.append(
"        entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
 
  605   source.append(
"      } \n");
 
  606   source.append(
"    } \n");
 
  607   source.append(
"  } \n");
 
  608   source.append(
" \n");
 
  609   source.append(
"  return index_buffer[0]; \n");
 
  610   source.append(
"} \n");
 
  612   source.append(
"__kernel void index_norm_inf( \n");
 
  613   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec, \n");
 
  614   source.append(
"          unsigned int start1, \n");
 
  615   source.append(
"          unsigned int inc1, \n");
 
  616   source.append(
"          unsigned int size1, \n");
 
  617   source.append(
"          __local "); source.append(numeric_string); source.append(
" * entry_buffer, \n");
 
  618   source.append(
"          __local unsigned int * index_buffer, \n");
 
  619   source.append(
"          __global unsigned int * result) \n");
 
  620   source.append(
"{ \n");
 
  621   source.append(
"  entry_buffer[get_global_id(0)] = 0; \n");
 
  622   source.append(
"  index_buffer[get_global_id(0)] = 0; \n");
 
  623   source.append(
"  unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
 
  624   source.append(
"  if (get_global_id(0) == 0) *result = tmp; \n");
 
  625   source.append(
"} \n");
 
  629 template <
typename StringType>
 
  630 void generate_maxmin(StringType & source, std::string 
const & numeric_string, 
bool is_max)
 
  634     source.append(
"__kernel void max_kernel( \n");
 
  636     source.append(
"__kernel void min_kernel( \n");
 
  637   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  638   source.append(
"          unsigned int start1, \n");
 
  639   source.append(
"          unsigned int inc1, \n");
 
  640   source.append(
"          unsigned int size1, \n");
 
  641   source.append(
"          __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
 
  642   source.append(
"          __global "); source.append(numeric_string); source.append(
" * result) \n");
 
  643   source.append(
"{ \n");
 
  644   source.append(
"  "); source.append(numeric_string); source.append(
" thread_result = vec1[start1]; \n");
 
  645   source.append(
"  for (unsigned int i = get_global_id(0); i<size1; i += get_global_size(0)) \n");
 
  646   source.append(
"  { \n");
 
  647   source.append(
"    "); source.append(numeric_string); source.append(
" tmp = vec1[i*inc1+start1]; \n");
 
  649     source.append(
"      thread_result = thread_result > tmp ? thread_result : tmp; \n");
 
  651     source.append(
"      thread_result = thread_result < tmp ? thread_result : tmp; \n");
 
  652   source.append(
"  } \n");
 
  654   source.append(
"  tmp_buffer[get_local_id(0)] = thread_result; \n");
 
  656   source.append(
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  657   source.append(
"  { \n");
 
  658   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  659   source.append(
"    if (get_local_id(0) < stride) \n");
 
  660   source.append(
"    { \n");
 
  662     source.append(
"        tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
 
  664     source.append(
"        tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] < tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
 
  665   source.append(
"    } \n");
 
  666   source.append(
"  } \n");
 
  667   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  669   source.append(
"  if (get_local_id(0) == 0) \n");
 
  670   source.append(
"    result[get_group_id(0)] = tmp_buffer[0]; \n");
 
  671   source.append(
"} \n");
 
  678 template<
typename NumericT>
 
  691     static std::map<cl_context, bool> init_done;
 
  695       source.reserve(8192);
 
  697       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  715       #ifdef VIENNACL_BUILD_INFO 
  716       std::cout << 
"Creating program " << prog_name << std::endl;
 
  718       ctx.add_program(source, prog_name);
 
  719       init_done[ctx.handle().get()] = 
true;
 
  726 template<
typename NumericT>
 
  739     static std::map<cl_context, bool> init_done;
 
  743       source.reserve(8192);
 
  745       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  755       #ifdef VIENNACL_BUILD_INFO 
  756       std::cout << 
"Creating program " << prog_name << std::endl;
 
  758       ctx.add_program(source, prog_name);
 
  759       init_done[ctx.handle().get()] = 
true;
 
  765 template<
typename StringT>
 
  768  source.append(
" __kernel void convert_" + dest_type + 
"_" + src_type + 
"( \n");
 
  769  source.append(
"  __global " + dest_type + 
" * dest, \n");
 
  770  source.append(
"  unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, \n");
 
  771  source.append(
"  __global const " + src_type + 
" * src, \n");
 
  772  source.append(
"  unsigned int start_src, unsigned int inc_src) \n");
 
  773  source.append(
"  { \n");
 
  774  source.append(
"   for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0)) \n");
 
  775  source.append(
"     dest[start_dest + i * inc_dest] = src[start_src + i * inc_src]; \n");
 
  776  source.append(
"  } \n");
 
  786     return "vector_convert";
 
  791     static std::map<cl_context, bool> init_done;
 
  795       source.reserve(4096);
 
  851       #ifdef VIENNACL_BUILD_INFO 
  852       std::cout << 
"Creating program " << prog_name << std::endl;
 
viennacl::ocl::device const & current_device() const 
Returns the current device. 
void generate_inner_prod(StringType &source, std::string const &numeric_string, vcl_size_t vector_num)
void generate_index_norm_inf(StringType &source, std::string const &numeric_string)
static std::string program_name()
void append_double_precision_pragma< double >(viennacl::ocl::context const &ctx, std::string &source)
void generate_vector_swap(StringType &source, std::string const &numeric_string)
Some helper routines for reading/writing/printing scheduler expressions. 
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<...
void generate_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
Provides OpenCL-related utilities. 
void generate_maxmin(StringType &source, std::string const &numeric_string, bool is_max)
Main kernel class for vector conversion routines (e.g. convert vector to vector). 
void generate_plane_rotation(StringType &source, std::string const &numeric_string)
static std::string program_name()
bool with_stride_and_range
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
static void apply(viennacl::ocl::context const &)
void generate_avbv(StringType &source, std::string const &numeric_string)
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context. 
const OCL_TYPE & get() const 
static std::string program_name()
void generate_sum(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
Configuration struct for generating OpenCL kernels for linear combinations of vectors. 
bool double_support() const 
ViennaCL convenience function: Returns true if the device supports double precision. 
Provides the datastructures for dealing with a single statement such as 'x = y + z;'. 
static void init(viennacl::ocl::context &ctx)
void generate_avbv_impl(StringType &source, std::string const &numeric_string, avbv_config const &cfg)
void generate_avbv_impl2(StringType &source, std::string const &, avbv_config const &cfg, bool mult_alpha, bool mult_beta)
void generate_inner_prod_sum(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL. 
void generate_norm(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation. 
void generate_vector_convert(StringT &source, std::string const &dest_type, std::string const &src_type)
static void init(viennacl::ocl::context &ctx)
avbv_scalar_type
Enumeration for the scalar type in avbv-like operations. 
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...