1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP 
   45 template<
typename StringT>
 
   48   source.append(
"__kernel void cg_vector_update( \n");
 
   49   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
   50   source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n");
 
   51   source.append(
"  __global "); source.append(numeric_string); source.append(
" * p, \n");
 
   52   source.append(
"  __global "); source.append(numeric_string); source.append(
" * r, \n");
 
   53   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
 
   54   source.append(
"  "); source.append(numeric_string); source.append(
" beta, \n");
 
   55   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
   56   source.append(
"  unsigned int size, \n");
 
   57   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
 
   58   source.append(
"{ \n");
 
   59   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
 
   60   source.append(
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
   61   source.append(
"    "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
 
   62   source.append(
"    "); source.append(numeric_string); source.append(
" value_r = r[i]; \n");
 
   64   source.append(
"    result[i] += alpha * value_p; \n");
 
   65   source.append(
"    value_r   -= alpha * Ap[i]; \n");
 
   66   source.append(
"    value_p    = value_r + beta * value_p; \n");
 
   68   source.append(
"    p[i] = value_p; \n");
 
   69   source.append(
"    r[i] = value_r; \n");
 
   70   source.append(
"    inner_prod_contrib += value_r * value_r; \n");
 
   71   source.append(
"  }  \n");
 
   74   source.append(
"  shared_array[get_local_id(0)] = inner_prod_contrib; \n");
 
   75   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
   76   source.append(
"  { \n");
 
   77   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
   78   source.append(
"    if (get_local_id(0) < stride)  \n");
 
   79   source.append(
"      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
 
   83   source.append(
" if (get_local_id(0) == 0) \n ");
 
   84   source.append(
"   inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
 
   86   source.append(
"} \n");
 
   89 template<
typename StringT>
 
   95   source.append(
"__kernel void cg_csr_blocked_prod( \n");
 
   96   source.append(
"    __global const unsigned int * row_indices, \n");
 
   97   source.append(
"    __global const unsigned int * column_indices, \n");
 
   98   source.append(
"    __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
   99   source.append(
"    __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  100   source.append(
"    __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  101   source.append(
"    unsigned int size, \n");
 
  102   source.append(
"    __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  103   source.append(
"    unsigned int buffer_size, \n");
 
  104   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  105   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
  106   source.append(
"{ \n");
 
  107   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_elements[256]; \n");
 
  108   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  109   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  111   source.append(
"  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + 
"; \n");
 
  112   source.append(
"  const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
 
  113   source.append(
"  const unsigned int block_start = get_group_id(0) * block_increment; \n");
 
  114   source.append(
"  const unsigned int block_stop  = min(block_start + block_increment, size); \n");
 
  116   source.append(
"  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + 
"; \n");
 
  117   source.append(
"                    row  < block_stop; \n");
 
  118   source.append(
"                    row += get_local_size(0) / " + ss.str() + 
") \n");
 
  119   source.append(
"  { \n");
 
  120   source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  121   source.append(
"    unsigned int row_end = row_indices[row+1]; \n");
 
  122   source.append(
"    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + 
") \n");
 
  123   source.append(
"      dot_prod += elements[i] * p[column_indices[i]]; \n");
 
  125   source.append(
"    shared_elements[get_local_id(0)] = dot_prod; \n");
 
  126   source.append(
"    #pragma unroll \n");
 
  127   source.append(
"    for (unsigned int k = 1; k < " + ss.str() + 
"; k *= 2) \n");
 
  128   source.append(
"      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
 
  130   source.append(
"    if (id_in_row == 0) { \n");
 
  131   source.append(
"      Ap[row] = shared_elements[get_local_id(0)]; \n");
 
  132   source.append(
"      inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
 
  133   source.append(
"      inner_prod_pAp  +=                           p[row] * shared_elements[get_local_id(0)]; \n");
 
  134   source.append(
"    } \n");
 
  135   source.append(
"  } \n");
 
  138   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  139   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  140   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  141   source.append(
"  { \n");
 
  142   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  143   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  144   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  145   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  146   source.append(
"    } ");
 
  147   source.append(
"  } ");
 
  150   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  151   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  152   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  153   source.append(
"  } \n");
 
  155   source.append(
"} \n");
 
  158 template<
typename StringT>
 
  161   source.append(
"__kernel void cg_csr_prod( \n");
 
  162   source.append(
"  __global const unsigned int * row_indices, \n");
 
  163   source.append(
"  __global const unsigned int * column_indices, \n");
 
  164   source.append(
"  __global const unsigned int * row_blocks, \n");
 
  165   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  166   source.append(
"  unsigned int num_blocks, \n");
 
  167   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  168   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  169   source.append(
"  unsigned int size, \n");
 
  170   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  171   source.append(
"  unsigned int buffer_size, \n");
 
  172   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  173   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
  174   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_elements) \n");
 
  175   source.append(
"{ \n");
 
  177   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  178   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  180   source.append(
"  for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
 
  181   source.append(
"    unsigned int row_start = row_blocks[block_id]; \n");
 
  182   source.append(
"    unsigned int row_stop  = row_blocks[block_id + 1]; \n");
 
  183   source.append(
"    unsigned int rows_to_process = row_stop - row_start; \n");
 
  184   source.append(
"    unsigned int element_start = row_indices[row_start]; \n");
 
  185   source.append(
"    unsigned int element_stop = row_indices[row_stop]; \n");
 
  187   source.append(
"    if (rows_to_process > 1) { \n"); 
 
  189   source.append(
"      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
 
  190   source.append(
"        shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
 
  192   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  195   source.append(
"      for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
 
  196   source.append(
"        "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  197   source.append(
"        unsigned int thread_row_start = row_indices[row]     - element_start; \n");
 
  198   source.append(
"        unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
 
  199   source.append(
"        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
 
  200   source.append(
"          dot_prod += shared_elements[i]; \n");
 
  201   source.append(
"        Ap[row] = dot_prod; \n");
 
  202   source.append(
"        inner_prod_ApAp += dot_prod * dot_prod; \n");
 
  203   source.append(
"        inner_prod_pAp  +=   p[row] * dot_prod; \n");
 
  204   source.append(
"      } \n");
 
  205   source.append(
"    } \n");
 
  207   source.append(
"    else  \n"); 
 
  208   source.append(
"    { \n");
 
  210   source.append(
"      shared_elements[get_local_id(0)] = 0; \n");
 
  211   source.append(
"      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
 
  212   source.append(
"        shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
 
  215   source.append(
"      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
 
  216   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  217   source.append(
"        if (get_local_id(0) < stride) \n");
 
  218   source.append(
"          shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
 
  219   source.append(
"      } \n");
 
  221   source.append(
"      if (get_local_id(0) == 0) { \n");
 
  222   source.append(
"        Ap[row_start] = shared_elements[0]; \n");
 
  223   source.append(
"        inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
 
  224   source.append(
"        inner_prod_pAp  +=       p[row_start] * shared_elements[0]; \n");
 
  225   source.append(
"      } \n");
 
  226   source.append(
"    } \n");
 
  227   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  228   source.append(
"  } \n");
 
  231   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  232   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  233   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  234   source.append(
"  { \n");
 
  235   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  236   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  237   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  238   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  239   source.append(
"    } ");
 
  240   source.append(
"  } ");
 
  243   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  244   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  245   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  246   source.append(
"  } \n");
 
  248   source.append(
"} \n");
 
  253 template<
typename StringT>
 
  256   source.append(
"__kernel void cg_coo_prod( \n");
 
  257   source.append(
"  __global const uint2 * coords,  \n");
 
  258   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  259   source.append(
"  __global const uint  * group_boundaries, \n");
 
  260   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  261   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  262   source.append(
"  unsigned int size, \n");
 
  263   source.append(
"  __local unsigned int * shared_rows, \n");
 
  264   source.append(
"  __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
 
  265   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  266   source.append(
"  unsigned int buffer_size, \n");
 
  267   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  268   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
  269   source.append(
"{ \n");
 
  270   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  271   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  274   source.append(
"  uint2 tmp; \n");
 
  275   source.append(
"  "); source.append(numeric_string); source.append(
" val; \n");
 
  276   source.append(
"  uint group_start = group_boundaries[get_group_id(0)]; \n");
 
  277   source.append(
"  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
 
  278   source.append(
"  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   
 
  280   source.append(
"  uint local_index = 0; \n");
 
  282   source.append(
"  for (uint k = 0; k < k_end; ++k) { \n");
 
  283   source.append(
"    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
 
  285   source.append(
"    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
 
  286   source.append(
"    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
 
  289   source.append(
"    if (get_local_id(0) == 0 && k > 0) { \n");
 
  290   source.append(
"      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
 
  291   source.append(
"        val += inter_results[get_local_size(0)-1]; \n");
 
  292   source.append(
"      else {\n");
 
  293   source.append(
"        "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
 
  294   source.append(
"        Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
 
  295   source.append(
"        inner_prod_ApAp += Ap_entry * Ap_entry; \n");
 
  296   source.append(
"        inner_prod_pAp  += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
 
  297   source.append(
"      } \n");
 
  298   source.append(
"    } \n");
 
  301   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  302   source.append(
"    shared_rows[get_local_id(0)] = tmp.x; \n");
 
  303   source.append(
"    inter_results[get_local_id(0)] = val; \n");
 
  304   source.append(
"    "); source.append(numeric_string); source.append(
" left = 0; \n");
 
  305   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  307   source.append(
"    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
 
  308   source.append(
"      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
 
  309   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  310   source.append(
"      inter_results[get_local_id(0)] += left; \n");
 
  311   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  312   source.append(
"    } \n");
 
  315   source.append(
"    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
 
  316   source.append(
"      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
 
  317   source.append(
"      "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
 
  318   source.append(
"      Ap[tmp.x] = Ap_entry; \n");
 
  319   source.append(
"      inner_prod_ApAp += Ap_entry * Ap_entry; \n");
 
  320   source.append(
"      inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
 
  321   source.append(
"    } \n");
 
  323   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  324   source.append(
"  }  \n"); 
 
  326   source.append(
"  if (local_index + 1 == group_end) {\n");  
 
  327   source.append(
"    "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
 
  328   source.append(
"    Ap[tmp.x] = Ap_entry; \n");
 
  329   source.append(
"    inner_prod_ApAp += Ap_entry * Ap_entry; \n");
 
  330   source.append(
"    inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
 
  331   source.append(
"  }  \n");
 
  334   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  335   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  336   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  337   source.append(
"  { \n");
 
  338   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  339   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  340   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  341   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  342   source.append(
"    } ");
 
  343   source.append(
"  } ");
 
  346   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  347   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  348   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  349   source.append(
"  } \n");
 
  351   source.append(
"} \n \n");
 
  356 template<
typename StringT>
 
  359   source.append(
"__kernel void cg_ell_prod( \n");
 
  360   source.append(
"  __global const unsigned int * coords, \n");
 
  361   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  362   source.append(
"  unsigned int internal_row_num, \n");
 
  363   source.append(
"  unsigned int items_per_row, \n");
 
  364   source.append(
"  unsigned int aligned_items_per_row, \n");
 
  365   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  366   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  367   source.append(
"  unsigned int size, \n");
 
  368   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  369   source.append(
"  unsigned int buffer_size, \n");
 
  370   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  371   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
  372   source.append(
"{ \n");
 
  373   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  374   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  375   source.append(
"  uint glb_id = get_global_id(0); \n");
 
  376   source.append(
"  uint glb_sz = get_global_size(0); \n");
 
  378   source.append(
"  for (uint row = glb_id; row < size; row += glb_sz) { \n");
 
  379   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  381   source.append(
"    uint offset = row; \n");
 
  382   source.append(
"    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
 
  383   source.append(
"      "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
 
  384   source.append(
"      sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
 
  385   source.append(
"    } \n");
 
  387   source.append(
"    Ap[row] = sum; \n");
 
  388   source.append(
"    inner_prod_ApAp += sum * sum; \n");
 
  389   source.append(
"    inner_prod_pAp  += p[row] * sum; \n");
 
  390   source.append(
"  }  \n");
 
  393   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  394   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  395   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  396   source.append(
"  { \n");
 
  397   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  398   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  399   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  400   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  401   source.append(
"    } ");
 
  402   source.append(
"  } ");
 
  405   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  406   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  407   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  408   source.append(
"  } \n");
 
  409   source.append(
"} \n \n");
 
  412 template<
typename StringT>
 
  415   source.append(
"__kernel void cg_sliced_ell_prod( \n");
 
  416   source.append(
"  __global const unsigned int * columns_per_block, \n");
 
  417   source.append(
"  __global const unsigned int * column_indices, \n");
 
  418   source.append(
"  __global const unsigned int * block_start, \n");
 
  419   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  420   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  421   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  422   source.append(
"  unsigned int size, \n");
 
  423   source.append(
"  unsigned int block_size, \n");
 
  424   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  425   source.append(
"  unsigned int buffer_size, \n");
 
  426   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  427   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
  428   source.append(
"{ \n");
 
  429   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  430   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  431   source.append(
"  uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
 
  432   source.append(
"  uint id_in_block = get_local_id(0) % block_size; \n");
 
  433   source.append(
"  uint num_blocks  = (size - 1) / block_size + 1; \n");
 
  434   source.append(
"  uint global_warp_count  = blocks_per_workgroup * get_num_groups(0); \n");
 
  435   source.append(
"  uint global_warp_id     = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
 
  437   source.append(
"  for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
 
  438   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  440   source.append(
"    uint row    = block_idx * block_size + id_in_block; \n");
 
  441   source.append(
"    uint offset = block_start[block_idx]; \n");
 
  442   source.append(
"    uint num_columns = columns_per_block[block_idx]; \n");
 
  443   source.append(
"    for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
 
  444   source.append(
"      uint index = offset + item_id * block_size + id_in_block; \n");
 
  445   source.append(
"      "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
 
  446   source.append(
"      sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
 
  447   source.append(
"    } \n");
 
  449   source.append(
"    if (row < size) {\n");
 
  450   source.append(
"      Ap[row] = sum; \n");
 
  451   source.append(
"      inner_prod_ApAp += sum * sum; \n");
 
  452   source.append(
"      inner_prod_pAp  += p[row] * sum; \n");
 
  453   source.append(
"    }  \n");
 
  454   source.append(
"  }  \n");
 
  457   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  458   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  459   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  460   source.append(
"  { \n");
 
  461   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  462   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  463   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  464   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  465   source.append(
"    } ");
 
  466   source.append(
"  } ");
 
  469   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  470   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  471   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  472   source.append(
"  } \n");
 
  473   source.append(
"} \n \n");
 
  476 template<
typename StringT>
 
  479   source.append(
"__kernel void cg_hyb_prod( \n");
 
  480   source.append(
"  const __global int* ell_coords, \n");
 
  481   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
 
  482   source.append(
"  const __global uint* csr_rows, \n");
 
  483   source.append(
"  const __global uint* csr_cols, \n");
 
  484   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
 
  485   source.append(
"  unsigned int internal_row_num, \n");
 
  486   source.append(
"  unsigned int items_per_row, \n");
 
  487   source.append(
"  unsigned int aligned_items_per_row, \n");
 
  488   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  489   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  490   source.append(
"  unsigned int size, \n");
 
  491   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  492   source.append(
"  unsigned int buffer_size, \n");
 
  493   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  494   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
  495   source.append(
"{ \n");
 
  496   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  497   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  498   source.append(
"  uint glb_id = get_global_id(0); \n");
 
  499   source.append(
"  uint glb_sz = get_global_size(0); \n");
 
  501   source.append(
"  for (uint row = glb_id; row < size; row += glb_sz) { \n");
 
  502   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  504   source.append(
"    uint offset = row; \n");
 
  505   source.append(
"    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
 
  506   source.append(
"      "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
 
  507   source.append(
"      sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
 
  508   source.append(
"    } \n");
 
  510   source.append(
"    uint col_begin = csr_rows[row]; \n");
 
  511   source.append(
"    uint col_end   = csr_rows[row + 1]; \n");
 
  513   source.append(
"    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
 
  514   source.append(
"      sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
 
  515   source.append(
"    } \n");
 
  517   source.append(
"    Ap[row] = sum; \n");
 
  518   source.append(
"    inner_prod_ApAp += sum * sum; \n");
 
  519   source.append(
"    inner_prod_pAp  += p[row] * sum; \n");
 
  520   source.append(
"  }  \n");
 
  523   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  524   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  525   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  526   source.append(
"  { \n");
 
  527   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  528   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  529   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  530   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  531   source.append(
"    } ");
 
  532   source.append(
"  } ");
 
  535   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  536   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  537   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  538   source.append(
"  } \n");
 
  539   source.append(
"} \n \n");
 
  546 template<
typename StringT>
 
  549   source.append(
"__kernel void bicgstab_update_s( \n");
 
  550   source.append(
"  __global "); source.append(numeric_string); source.append(
" * s, \n");
 
  551   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * r, \n");
 
  552   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
 
  553   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  554   source.append(
"  unsigned int chunk_size, \n");
 
  555   source.append(
"  unsigned int chunk_offset, \n");
 
  556   source.append(
"  unsigned int size, \n");
 
  557   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array, \n");
 
  558   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_Ap_in_r0) \n");
 
  559   source.append(
"{ \n");
 
  561   source.append(
"  "); source.append(numeric_string); source.append(
" alpha = 0; \n");
 
  564   source.append(
"  shared_array[get_local_id(0)]  = inner_prod_buffer[get_local_id(0)]; \n");
 
  565   source.append(
"  shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
 
  566   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  567   source.append(
"  { \n");
 
  568   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  569   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  570   source.append(
"      shared_array[get_local_id(0)]  += shared_array[get_local_id(0) + stride];  \n");
 
  571   source.append(
"      shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride];  \n");
 
  572   source.append(
"    } ");
 
  573   source.append(
"  } ");
 
  576   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  577   source.append(
"  alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");
 
  579   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
 
  580   source.append(
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
  581   source.append(
"    "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
 
  582   source.append(
"     \n");
 
  583   source.append(
"    value_s = r[i] - alpha * Ap[i]; \n");
 
  584   source.append(
"    inner_prod_contrib += value_s * value_s; \n");
 
  585   source.append(
"     \n");
 
  586   source.append(
"    s[i] = value_s; \n");
 
  587   source.append(
"  }  \n");
 
  588   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  591   source.append(
"  shared_array[get_local_id(0)] = inner_prod_contrib; \n");
 
  592   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  593   source.append(
"  { \n");
 
  594   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  595   source.append(
"    if (get_local_id(0) < stride)  \n");
 
  596   source.append(
"      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
 
  597   source.append(
"  } ");
 
  600   source.append(
" if (get_local_id(0) == 0) \n ");
 
  601   source.append(
"   inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
 
  603   source.append(
"} \n");
 
  609 template<
typename StringT>
 
  612   source.append(
"__kernel void bicgstab_vector_update( \n");
 
  613   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  614   source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n");
 
  615   source.append(
"  __global "); source.append(numeric_string); source.append(
" * p, \n");
 
  616   source.append(
"  "); source.append(numeric_string); source.append(
" omega, \n");
 
  617   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * s, \n");
 
  618   source.append(
"  __global "); source.append(numeric_string); source.append(
" * residual, \n");
 
  619   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * As, \n");
 
  620   source.append(
"  "); source.append(numeric_string); source.append(
" beta, \n");
 
  621   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
 
  622   source.append(
"  __global "); source.append(numeric_string); source.append(
" const * r0star, \n");
 
  623   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  624   source.append(
"  unsigned int size, \n");
 
  625   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
 
  626   source.append(
"{ \n");
 
  627   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r_r0star = 0; \n");
 
  628   source.append(
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
  629   source.append(
"    "); source.append(numeric_string); source.append(
" value_result = result[i]; \n");
 
  630   source.append(
"    "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
 
  631   source.append(
"    "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
 
  632   source.append(
"    "); source.append(numeric_string); source.append(
" value_residual = residual[i]; \n");
 
  633   source.append(
"    "); source.append(numeric_string); source.append(
" value_As = As[i]; \n");
 
  634   source.append(
"    "); source.append(numeric_string); source.append(
" value_Ap = Ap[i]; \n");
 
  635   source.append(
"    "); source.append(numeric_string); source.append(
" value_r0star = r0star[i]; \n");
 
  636   source.append(
"     \n");
 
  637   source.append(
"    value_result += alpha * value_p + omega * value_s; \n");
 
  638   source.append(
"    value_residual  = value_s - omega * value_As; \n");
 
  639   source.append(
"    value_p         = value_residual + beta * (value_p - omega * value_Ap); \n");
 
  640   source.append(
"     \n");
 
  641   source.append(
"    result[i]   = value_result; \n");
 
  642   source.append(
"    residual[i] = value_residual; \n");
 
  643   source.append(
"    p[i]        = value_p; \n");
 
  644   source.append(
"    inner_prod_r_r0star += value_residual * value_r0star; \n");
 
  645   source.append(
"  }  \n");
 
  648   source.append(
"  shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
 
  649   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  650   source.append(
"  { \n");
 
  651   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  652   source.append(
"    if (get_local_id(0) < stride)  \n");
 
  653   source.append(
"      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
 
  654   source.append(
"  } ");
 
  657   source.append(
" if (get_local_id(0) == 0) \n ");
 
  658   source.append(
"   inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
 
  660   source.append(
"} \n");
 
  663 template<
typename StringT>
 
  666   std::stringstream ss;
 
  669   source.append(
"__kernel void bicgstab_csr_blocked_prod( \n");
 
  670   source.append(
"    __global const unsigned int * row_indices, \n");
 
  671   source.append(
"    __global const unsigned int * column_indices, \n");
 
  672   source.append(
"    __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  673   source.append(
"    __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  674   source.append(
"    __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  675   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
 
  676   source.append(
"    unsigned int size, \n");
 
  677   source.append(
"    __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  678   source.append(
"  unsigned int buffer_size, \n");
 
  679   source.append(
"  unsigned int buffer_offset, \n");
 
  680   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  681   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
  682   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
 
  683   source.append(
"{ \n");
 
  684   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_elements[256]; \n");
 
  685   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  686   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  687   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
 
  689   source.append(
"  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + 
"; \n");
 
  690   source.append(
"  const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
 
  691   source.append(
"  const unsigned int block_start = get_group_id(0) * block_increment; \n");
 
  692   source.append(
"  const unsigned int block_stop  = min(block_start + block_increment, size); \n");
 
  694   source.append(
"  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + 
"; \n");
 
  695   source.append(
"                    row  < block_stop; \n");
 
  696   source.append(
"                    row += get_local_size(0) / " + ss.str() + 
") \n");
 
  697   source.append(
"  { \n");
 
  698   source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  699   source.append(
"    unsigned int row_end = row_indices[row+1]; \n");
 
  700   source.append(
"    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + 
") \n");
 
  701   source.append(
"      dot_prod += elements[i] * p[column_indices[i]]; \n");
 
  703   source.append(
"    shared_elements[get_local_id(0)] = dot_prod; \n");
 
  704   source.append(
"    #pragma unroll \n");
 
  705   source.append(
"    for (unsigned int k = 1; k < " + ss.str() + 
"; k *= 2) \n");
 
  706   source.append(
"      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
 
  708   source.append(
"    if (id_in_row == 0) { \n");
 
  709   source.append(
"      Ap[row] = shared_elements[get_local_id(0)]; \n");
 
  710   source.append(
"      inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
 
  711   source.append(
"      inner_prod_pAp  +=                           p[row] * shared_elements[get_local_id(0)]; \n");
 
  712   source.append(
"      inner_prod_r0Ap +=                      r0star[row] * shared_elements[get_local_id(0)]; \n");
 
  713   source.append(
"    } \n");
 
  714   source.append(
"  } \n");
 
  717   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  718   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  719   source.append(
"  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
 
  720   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  721   source.append(
"  { \n");
 
  722   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  723   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  724   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  725   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  726   source.append(
"      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
 
  727   source.append(
"    } ");
 
  728   source.append(
"  } ");
 
  731   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  732   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  733   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  734   source.append(
"    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
 
  735   source.append(
"  } \n");
 
  737   source.append(
"} \n");
 
  740 template<
typename StringT>
 
  743   source.append(
"__kernel void bicgstab_csr_prod( \n");
 
  744   source.append(
"  __global const unsigned int * row_indices, \n");
 
  745   source.append(
"  __global const unsigned int * column_indices, \n");
 
  746   source.append(
"  __global const unsigned int * row_blocks, \n");
 
  747   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  748   source.append(
"  unsigned int num_blocks, \n");
 
  749   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  750   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  751   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
 
  752   source.append(
"  unsigned int size, \n");
 
  753   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  754   source.append(
"  unsigned int buffer_size, \n");
 
  755   source.append(
"  unsigned int buffer_offset, \n");
 
  756   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  757   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
  758   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
 
  759   source.append(
"{ \n");
 
  760   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_elements[1024]; \n");
 
  761   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  762   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  763   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
 
  765   source.append(
"  for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
 
  766   source.append(
"    unsigned int row_start = row_blocks[block_id]; \n");
 
  767   source.append(
"    unsigned int row_stop  = row_blocks[block_id + 1]; \n");
 
  768   source.append(
"    unsigned int rows_to_process = row_stop - row_start; \n");
 
  769   source.append(
"    unsigned int element_start = row_indices[row_start]; \n");
 
  770   source.append(
"    unsigned int element_stop = row_indices[row_stop]; \n");
 
  772   source.append(
"    if (rows_to_process > 1) { \n"); 
 
  774   source.append(
"      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
 
  775   source.append(
"        shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
 
  777   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  780   source.append(
"      for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
 
  781   source.append(
"        "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  782   source.append(
"        unsigned int thread_row_start = row_indices[row]     - element_start; \n");
 
  783   source.append(
"        unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
 
  784   source.append(
"        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
 
  785   source.append(
"          dot_prod += shared_elements[i]; \n");
 
  786   source.append(
"        Ap[row] = dot_prod; \n");
 
  787   source.append(
"        inner_prod_ApAp += dot_prod * dot_prod; \n");
 
  788   source.append(
"        inner_prod_pAp  +=   p[row] * dot_prod; \n");
 
  789   source.append(
"        inner_prod_r0Ap  += r0star[row] * dot_prod; \n");
 
  790   source.append(
"      } \n");
 
  791   source.append(
"    } \n");
 
  793   source.append(
"    else  \n"); 
 
  794   source.append(
"    { \n");
 
  796   source.append(
"      shared_elements[get_local_id(0)] = 0; \n");
 
  797   source.append(
"      for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
 
  798   source.append(
"        shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
 
  801   source.append(
"      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
 
  802   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  803   source.append(
"        if (get_local_id(0) < stride) \n");
 
  804   source.append(
"          shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
 
  805   source.append(
"      } \n");
 
  807   source.append(
"      if (get_local_id(0) == 0) { \n");
 
  808   source.append(
"        Ap[row_start] = shared_elements[0]; \n");
 
  809   source.append(
"        inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
 
  810   source.append(
"        inner_prod_pAp  +=       p[row_start] * shared_elements[0]; \n");
 
  811   source.append(
"        inner_prod_r0Ap +=  r0star[row_start] * shared_elements[0]; \n");
 
  812   source.append(
"      } \n");
 
  813   source.append(
"    } \n");
 
  814   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  815   source.append(
"  } \n");
 
  818   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  819   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  820   source.append(
"  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
 
  821   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  822   source.append(
"  { \n");
 
  823   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  824   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  825   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  826   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  827   source.append(
"      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
 
  828   source.append(
"    } ");
 
  829   source.append(
"  } ");
 
  832   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  833   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  834   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  835   source.append(
"    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
 
  836   source.append(
"  } \n");
 
  838   source.append(
"} \n \n");
 
  842 template<
typename StringT>
 
  845   source.append(
"__kernel void bicgstab_coo_prod( \n");
 
  846   source.append(
"  __global const uint2 * coords,  \n");
 
  847   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  848   source.append(
"  __global const uint  * group_boundaries, \n");
 
  849   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  850   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  851   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
 
  852   source.append(
"  unsigned int size, \n");
 
  853   source.append(
"  __local unsigned int * shared_rows, \n");
 
  854   source.append(
"  __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
 
  855   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  856   source.append(
"  unsigned int buffer_size, \n");
 
  857   source.append(
"  unsigned int buffer_offset, \n");
 
  858   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  859   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
  860   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
 
  861   source.append(
"{ \n");
 
  862   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  863   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  864   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
 
  867   source.append(
"  uint2 tmp; \n");
 
  868   source.append(
"  "); source.append(numeric_string); source.append(
" val; \n");
 
  869   source.append(
"  uint group_start = group_boundaries[get_group_id(0)]; \n");
 
  870   source.append(
"  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
 
  871   source.append(
"  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   
 
  873   source.append(
"  uint local_index = 0; \n");
 
  875   source.append(
"  for (uint k = 0; k < k_end; ++k) { \n");
 
  876   source.append(
"    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
 
  878   source.append(
"    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
 
  879   source.append(
"    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
 
  882   source.append(
"    if (get_local_id(0) == 0 && k > 0) { \n");
 
  883   source.append(
"      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
 
  884   source.append(
"        val += inter_results[get_local_size(0)-1]; \n");
 
  885   source.append(
"      else {\n");
 
  886   source.append(
"        "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
 
  887   source.append(
"        Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
 
  888   source.append(
"        inner_prod_ApAp += Ap_entry * Ap_entry; \n");
 
  889   source.append(
"        inner_prod_pAp  += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
 
  890   source.append(
"        inner_prod_r0Ap  += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
 
  891   source.append(
"      } \n");
 
  892   source.append(
"    } \n");
 
  895   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  896   source.append(
"    shared_rows[get_local_id(0)] = tmp.x; \n");
 
  897   source.append(
"    inter_results[get_local_id(0)] = val; \n");
 
  898   source.append(
"    "); source.append(numeric_string); source.append(
" left = 0; \n");
 
  899   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  901   source.append(
"    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
 
  902   source.append(
"      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
 
  903   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  904   source.append(
"      inter_results[get_local_id(0)] += left; \n");
 
  905   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  906   source.append(
"    } \n");
 
  909   source.append(
"    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
 
  910   source.append(
"      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
 
  911   source.append(
"      "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
 
  912   source.append(
"      Ap[tmp.x] = Ap_entry; \n");
 
  913   source.append(
"      inner_prod_ApAp += Ap_entry * Ap_entry; \n");
 
  914   source.append(
"      inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
 
  915   source.append(
"      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
 
  916   source.append(
"    } \n");
 
  918   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  919   source.append(
"  }  \n"); 
 
  921   source.append(
"  if (local_index + 1 == group_end) {\n");  
 
  922   source.append(
"    "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
 
  923   source.append(
"    Ap[tmp.x] = Ap_entry; \n");
 
  924   source.append(
"    inner_prod_ApAp += Ap_entry * Ap_entry; \n");
 
  925   source.append(
"    inner_prod_pAp  += p[tmp.x] * Ap_entry; \n");
 
  926   source.append(
"    inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
 
  927   source.append(
"  }  \n");
 
  930   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  931   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  932   source.append(
"  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
 
  933   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
  934   source.append(
"  { \n");
 
  935   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  936   source.append(
"    if (get_local_id(0) < stride) { \n");
 
  937   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
  938   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
  939   source.append(
"      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
 
  940   source.append(
"    } ");
 
  941   source.append(
"  } ");
 
  944   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
  945   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
  946   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
  947   source.append(
"    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
 
  948   source.append(
"  } \n");
 
  950   source.append(
"} \n \n");
 
  955 template<
typename StringT>
 
  958   source.append(
"__kernel void bicgstab_ell_prod( \n");
 
  959   source.append(
"  __global const unsigned int * coords, \n");
 
  960   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  961   source.append(
"  unsigned int internal_row_num, \n");
 
  962   source.append(
"  unsigned int items_per_row, \n");
 
  963   source.append(
"  unsigned int aligned_items_per_row, \n");
 
  964   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
  965   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
  966   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
 
  967   source.append(
"  unsigned int size, \n");
 
  968   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
  969   source.append(
"  unsigned int buffer_size, \n");
 
  970   source.append(
"  unsigned int buffer_offset, \n");
 
  971   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
  972   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
  973   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
 
  974   source.append(
"{ \n");
 
  975   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
  976   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
  977   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
 
  978   source.append(
"  uint glb_id = get_global_id(0); \n");
 
  979   source.append(
"  uint glb_sz = get_global_size(0); \n");
 
  981   source.append(
"  for (uint row = glb_id; row < size; row += glb_sz) { \n");
 
  982   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  984   source.append(
"    uint offset = row; \n");
 
  985   source.append(
"    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
 
  986   source.append(
"      "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
 
  987   source.append(
"      sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
 
  988   source.append(
"    } \n");
 
  990   source.append(
"    Ap[row] = sum; \n");
 
  991   source.append(
"    inner_prod_ApAp += sum * sum; \n");
 
  992   source.append(
"    inner_prod_pAp  += p[row] * sum; \n");
 
  993   source.append(
"    inner_prod_r0Ap += r0star[row] * sum; \n");
 
  994   source.append(
"  }  \n");
 
  997   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
  998   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
  999   source.append(
"  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
 
 1000   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1001   source.append(
"  { \n");
 
 1002   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1003   source.append(
"    if (get_local_id(0) < stride) { \n");
 
 1004   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
 1005   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
 1006   source.append(
"      shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride];  \n");
 
 1007   source.append(
"    } ");
 
 1008   source.append(
"  } ");
 
 1011   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
 1012   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
 1013   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
 1014   source.append(
"    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
 
 1015   source.append(
"  } \n");
 
 1016   source.append(
"} \n \n");
 
 1019 template<
typename StringT>
 
 1022   source.append(
"__kernel void bicgstab_sliced_ell_prod( \n");
 
 1023   source.append(
"  __global const unsigned int * columns_per_block, \n");
 
 1024   source.append(
"  __global const unsigned int * column_indices, \n");
 
 1025   source.append(
"  __global const unsigned int * block_start, \n");
 
 1026   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1027   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1028   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1029   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
 
 1030   source.append(
"  unsigned int size, \n");
 
 1031   source.append(
"  unsigned int block_size, \n");
 
 1032   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1033   source.append(
"  unsigned int buffer_size, \n");
 
 1034   source.append(
"  unsigned int buffer_offset, \n");
 
 1035   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1036   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
 1037   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
 
 1038   source.append(
"{ \n");
 
 1039   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
 1040   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
 1041   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
 
 1042   source.append(
"  uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
 
 1043   source.append(
"  uint id_in_block = get_local_id(0) % block_size; \n");
 
 1044   source.append(
"  uint num_blocks  = (size - 1) / block_size + 1; \n");
 
 1045   source.append(
"  uint global_warp_count  = blocks_per_workgroup * get_num_groups(0); \n");
 
 1046   source.append(
"  uint global_warp_id     = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
 
 1048   source.append(
"  for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
 
 1049   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
 1051   source.append(
"    uint row    = block_idx * block_size + id_in_block; \n");
 
 1052   source.append(
"    uint offset = block_start[block_idx]; \n");
 
 1053   source.append(
"    uint num_columns = columns_per_block[block_idx]; \n");
 
 1054   source.append(
"    for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
 
 1055   source.append(
"      uint index = offset + item_id * block_size + id_in_block; \n");
 
 1056   source.append(
"      "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
 
 1057   source.append(
"      sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
 
 1058   source.append(
"    } \n");
 
 1060   source.append(
"    if (row < size) {\n");
 
 1061   source.append(
"      Ap[row] = sum; \n");
 
 1062   source.append(
"      inner_prod_ApAp += sum * sum; \n");
 
 1063   source.append(
"      inner_prod_pAp  += p[row] * sum; \n");
 
 1064   source.append(
"      inner_prod_r0Ap += r0star[row] * sum; \n");
 
 1065   source.append(
"    }  \n");
 
 1066   source.append(
"  }  \n");
 
 1069   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
 1070   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
 1071   source.append(
"  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
 
 1072   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1073   source.append(
"  { \n");
 
 1074   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1075   source.append(
"    if (get_local_id(0) < stride) { \n");
 
 1076   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
 1077   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
 1078   source.append(
"      shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride];  \n");
 
 1079   source.append(
"    } ");
 
 1080   source.append(
"  } ");
 
 1083   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
 1084   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
 1085   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
 1086   source.append(
"    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
 
 1087   source.append(
"  } \n");
 
 1088   source.append(
"} \n \n");
 
 1091 template<
typename StringT>
 
 1094   source.append(
"__kernel void bicgstab_hyb_prod( \n");
 
 1095   source.append(
"  const __global int* ell_coords, \n");
 
 1096   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
 
 1097   source.append(
"  const __global uint* csr_rows, \n");
 
 1098   source.append(
"  const __global uint* csr_cols, \n");
 
 1099   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
 
 1100   source.append(
"  unsigned int internal_row_num, \n");
 
 1101   source.append(
"  unsigned int items_per_row, \n");
 
 1102   source.append(
"  unsigned int aligned_items_per_row, \n");
 
 1103   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1104   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1105   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
 
 1106   source.append(
"  unsigned int size, \n");
 
 1107   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1108   source.append(
"  unsigned int buffer_size, \n");
 
 1109   source.append(
"  unsigned int buffer_offset, \n");
 
 1110   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1111   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
 1112   source.append(
"   __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
 
 1113   source.append(
"{ \n");
 
 1114   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
 
 1115   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
 
 1116   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
 
 1117   source.append(
"  uint glb_id = get_global_id(0); \n");
 
 1118   source.append(
"  uint glb_sz = get_global_size(0); \n");
 
 1120   source.append(
"  for (uint row = glb_id; row < size; row += glb_sz) { \n");
 
 1121   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
 1123   source.append(
"    uint offset = row; \n");
 
 1124   source.append(
"    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
 
 1125   source.append(
"      "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
 
 1126   source.append(
"      sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
 
 1127   source.append(
"    } \n");
 
 1129   source.append(
"    uint col_begin = csr_rows[row]; \n");
 
 1130   source.append(
"    uint col_end   = csr_rows[row + 1]; \n");
 
 1132   source.append(
"    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
 
 1133   source.append(
"      sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
 
 1134   source.append(
"    } \n");
 
 1136   source.append(
"    Ap[row] = sum; \n");
 
 1137   source.append(
"    inner_prod_ApAp += sum * sum; \n");
 
 1138   source.append(
"    inner_prod_pAp  += p[row] * sum; \n");
 
 1139   source.append(
"    inner_prod_r0Ap += r0star[row] * sum; \n");
 
 1140   source.append(
"  }  \n");
 
 1143   source.append(
"  shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
 
 1144   source.append(
"  shared_array_pAp[get_local_id(0)]  = inner_prod_pAp; \n");
 
 1145   source.append(
"  shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
 
 1146   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1147   source.append(
"  { \n");
 
 1148   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1149   source.append(
"    if (get_local_id(0) < stride) { \n");
 
 1150   source.append(
"      shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride];  \n");
 
 1151   source.append(
"      shared_array_pAp[get_local_id(0)]  += shared_array_pAp[get_local_id(0) + stride];  \n");
 
 1152   source.append(
"      shared_array_r0Ap[get_local_id(0)]  += shared_array_r0Ap[get_local_id(0) + stride];  \n");
 
 1153   source.append(
"    } ");
 
 1154   source.append(
"  } ");
 
 1157   source.append(
"  if (get_local_id(0) == 0) { \n ");
 
 1158   source.append(
"    inner_prod_buffer[  buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
 
 1159   source.append(
"    inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
 
 1160   source.append(
"    inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
 
 1161   source.append(
"  } \n");
 
 1162   source.append(
"} \n \n");
 
 1168 template <
typename StringType>
 
 1171   source.append(
"__kernel void gmres_gram_schmidt_1( \n");
 
 1172   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
 
 1173   source.append(
"          unsigned int size, \n");
 
 1174   source.append(
"          unsigned int internal_size, \n");
 
 1175   source.append(
"          unsigned int k, \n");
 
 1176   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vi_in_vk_buffer, \n");
 
 1177   source.append(
"          unsigned int chunk_size) \n");
 
 1178   source.append(
"{ \n");
 
 1180   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_array[7*128]; \n");
 
 1183     source.append(
"  "); source.append(numeric_string); source.append(
" vi_in_vk[7]; \n");
 
 1185   source.append(
"  "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
 
 1187   source.append(
"  unsigned int k_base = 0;   \n");
 
 1188   source.append(
"  while (k_base < k) {   \n");
 
 1189   source.append(
"    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);   \n");
 
 1193     source.append(
"    for (uint j=0; j<vecs_in_iteration; ++j) \n");
 
 1194     source.append(
"      shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
 
 1198     source.append(
"    vi_in_vk[0] = 0;\n");
 
 1199     source.append(
"    vi_in_vk[1] = 0;\n");
 
 1200     source.append(
"    vi_in_vk[2] = 0;\n");
 
 1201     source.append(
"    vi_in_vk[3] = 0;\n");
 
 1202     source.append(
"    vi_in_vk[4] = 0;\n");
 
 1203     source.append(
"    vi_in_vk[5] = 0;\n");
 
 1204     source.append(
"    vi_in_vk[6] = 0;\n");
 
 1206   source.append(
"    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
 1207   source.append(
"      value_vk = krylov_basis[i + k * internal_size]; \n");
 
 1208   source.append(
"       \n");
 
 1209   source.append(
"      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
 
 1211     source.append(
"        shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
 
 1213     source.append(
"        vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
 
 1214   source.append(
"    }  \n");
 
 1219     source.append(
"    for (uint j=0; j<vecs_in_iteration; ++j) \n");
 
 1220     source.append(
"      shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
 
 1222   source.append(
"    for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1223   source.append(
"    { \n");
 
 1224   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1225   source.append(
"      if (get_local_id(0) < stride) { \n");
 
 1226   source.append(
"        for (uint j=0; j<vecs_in_iteration; ++j) \n");
 
 1227   source.append(
"          shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride];  \n");
 
 1228   source.append(
"      } ");
 
 1229   source.append(
"    } ");
 
 1232   source.append(
"    if (get_local_id(0) == 0) \n ");
 
 1233   source.append(
"      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
 
 1234   source.append(
"        vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
 
 1236   source.append(
"    k_base += vecs_in_iteration;   \n");
 
 1237   source.append(
"  }  \n");
 
 1239   source.append(
"} \n");
 
 1243 template <
typename StringType>
 
 1246   source.append(
"__kernel void gmres_gram_schmidt_2( \n");
 
 1247   source.append(
"          __global "); source.append(numeric_string); source.append(
" * krylov_basis, \n");
 
 1248   source.append(
"          unsigned int size, \n");
 
 1249   source.append(
"          unsigned int internal_size, \n");
 
 1250   source.append(
"          unsigned int k, \n");
 
 1251   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * vi_in_vk_buffer, \n");
 
 1252   source.append(
"          unsigned int chunk_size, \n");
 
 1253   source.append(
"          __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
 
 1254   source.append(
"          unsigned int krylov_dim, \n");
 
 1255   source.append(
"          __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1256   source.append(
"         __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
 
 1257   source.append(
"{ \n");
 
 1259   source.append(
"  "); source.append(numeric_string); source.append(
" vk_dot_vk = 0; \n");
 
 1260   source.append(
"  "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
 
 1262   source.append(
"  unsigned int k_base = 0;   \n");
 
 1263   source.append(
"  while (k_base < k) {   \n");
 
 1264   source.append(
"    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);   \n");
 
 1267   source.append(
"    for (uint j=0; j<vecs_in_iteration; ++j) \n");
 
 1268   source.append(
"      shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
 
 1269   source.append(
"    for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1270   source.append(
"    { \n");
 
 1271   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1272   source.append(
"      if (get_local_id(0) < stride) { \n");
 
 1273   source.append(
"        for (uint j=0; j<vecs_in_iteration; ++j) \n");
 
 1274   source.append(
"          shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride];  \n");
 
 1275   source.append(
"      } ");
 
 1276   source.append(
"    } ");
 
 1277   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1280   source.append(
"    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
 1281   source.append(
"      value_vk = krylov_basis[i + k * internal_size]; \n");
 
 1282   source.append(
"       \n");
 
 1283   source.append(
"      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
 
 1284   source.append(
"        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
 
 1285   source.append(
"      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;  \n");
 
 1286   source.append(
"      krylov_basis[i + k * internal_size] = value_vk;  \n");
 
 1287   source.append(
"    }  \n");
 
 1290   source.append(
"    if (get_group_id(0) == 0) \n");
 
 1291   source.append(
"      for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
 
 1292   source.append(
"        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
 
 1293   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1295   source.append(
"    k_base += vecs_in_iteration;   \n");
 
 1296   source.append(
"  }  \n");
 
 1299   source.append(
"  shared_array[get_local_id(0)] = vk_dot_vk; \n");
 
 1300   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1301   source.append(
"  { \n");
 
 1302   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1303   source.append(
"    if (get_local_id(0) < stride) \n");
 
 1304   source.append(
"      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
 
 1305   source.append(
"  } ");
 
 1308   source.append(
"  if (get_local_id(0) == 0) \n ");
 
 1309   source.append(
"    inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
 
 1311   source.append(
"} \n");
 
 1314 template <
typename StringType>
 
 1317   source.append(
"__kernel void gmres_normalize_vk( \n");
 
 1318   source.append(
"          __global "); source.append(numeric_string); source.append(
" * vk, \n");
 
 1319   source.append(
"          unsigned int vk_offset, \n");
 
 1320   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * residual, \n");
 
 1321   source.append(
"          __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
 
 1322   source.append(
"          unsigned int R_offset, \n");
 
 1323   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * inner_prod_buffer, \n");
 
 1324   source.append(
"          unsigned int chunk_size, \n");
 
 1325   source.append(
"          __global "); source.append(numeric_string); source.append(
" * r_dot_vk_buffer, \n");
 
 1326   source.append(
"          unsigned int chunk_offset, \n");
 
 1327   source.append(
"          unsigned int size, \n");
 
 1328   source.append(
"         __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
 
 1329   source.append(
"{ \n");
 
 1331   source.append(
"  "); source.append(numeric_string); source.append(
" norm_vk = 0; \n");
 
 1334   source.append(
"  shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
 
 1335   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1336   source.append(
"  { \n");
 
 1337   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1338   source.append(
"    if (get_local_id(0) < stride) \n");
 
 1339   source.append(
"      shared_array[get_local_id(0)]  += shared_array[get_local_id(0) + stride];  \n");
 
 1340   source.append(
"  } ");
 
 1343   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1344   source.append(
"  norm_vk = sqrt(shared_array[0]); \n");
 
 1346   source.append(
"  "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
 
 1347   source.append(
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
 1348   source.append(
"    "); source.append(numeric_string); source.append(
" value_vk = vk[i + vk_offset] / norm_vk; \n");
 
 1349   source.append(
"     \n");
 
 1350   source.append(
"    inner_prod_contrib += residual[i] * value_vk; \n");
 
 1351   source.append(
"     \n");
 
 1352   source.append(
"    vk[i + vk_offset] = value_vk; \n");
 
 1353   source.append(
"  }  \n");
 
 1354   source.append(
"  barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1357   source.append(
"  shared_array[get_local_id(0)] = inner_prod_contrib; \n");
 
 1358   source.append(
"  for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
 
 1359   source.append(
"  { \n");
 
 1360   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1361   source.append(
"    if (get_local_id(0) < stride)  \n");
 
 1362   source.append(
"      shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride];  \n");
 
 1363   source.append(
"  } ");
 
 1366   source.append(
"  if (get_local_id(0) == 0) \n ");
 
 1367   source.append(
"    r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
 
 1368   source.append(
"  if (get_global_id(0) == 0) \n ");
 
 1369   source.append(
"    R_buffer[R_offset] = norm_vk; \n");
 
 1371   source.append(
"} \n");
 
 1375 template <
typename StringType>
 
 1378   source.append(
"__kernel void gmres_update_result( \n");
 
 1379   source.append(
"          __global "); source.append(numeric_string); source.append(
" * result, \n");
 
 1380   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * residual, \n");
 
 1381   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
 
 1382   source.append(
"          unsigned int size, \n");
 
 1383   source.append(
"          unsigned int internal_size, \n");
 
 1384   source.append(
"          __global "); source.append(numeric_string); source.append(
" const * coefficients, \n");
 
 1385   source.append(
"          unsigned int k) \n");
 
 1386   source.append(
"{ \n");
 
 1388   source.append(
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 
 1389   source.append(
"    "); source.append(numeric_string); source.append(
" value_result = result[i] + coefficients[0] * residual[i]; \n");
 
 1390   source.append(
"     \n");
 
 1391   source.append(
"    for (unsigned int j = 1; j < k; ++j) \n");
 
 1392   source.append(
"      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
 
 1393   source.append(
"     \n");
 
 1394   source.append(
"    result[i] = value_result; \n");
 
 1395   source.append(
"  }  \n");
 
 1397   source.append(
"} \n");
 
 1401 template <
typename StringType>
 
 1404   source.append(
"__kernel void gmres_csr_blocked_prod( \n");
 
 1405   source.append(
"  __global const unsigned int * row_indices, \n");
 
 1406   source.append(
"  __global const unsigned int * column_indices, \n");
 
 1407   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1408   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1409   source.append(
"  unsigned int offset_p, \n");
 
 1410   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1411   source.append(
"  unsigned int offset_Ap, \n");
 
 1412   source.append(
"  unsigned int size, \n");
 
 1413   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1414   source.append(
"  unsigned int buffer_size, \n");
 
 1415   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1416   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
 1417   source.append(
"{ \n");
 
 1418   source.append(
"  cg_csr_blocked_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
 
 1419   source.append(
"} \n \n");
 
 1423 template <
typename StringType>
 
 1426   source.append(
"__kernel void gmres_csr_prod( \n");
 
 1427   source.append(
"  __global const unsigned int * row_indices, \n");
 
 1428   source.append(
"  __global const unsigned int * column_indices, \n");
 
 1429   source.append(
"  __global const unsigned int * row_blocks, \n");
 
 1430   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1431   source.append(
"  unsigned int num_blocks, \n");
 
 1432   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1433   source.append(
"  unsigned int offset_p, \n");
 
 1434   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1435   source.append(
"  unsigned int offset_Ap, \n");
 
 1436   source.append(
"  unsigned int size, \n");
 
 1437   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1438   source.append(
"  unsigned int buffer_size, \n");
 
 1439   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1440   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
 
 1441   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_elements) \n");
 
 1442   source.append(
"{ \n");
 
 1443   source.append(
"  cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
 
 1444   source.append(
"} \n \n");
 
 1448 template <
typename StringType>
 
 1451   source.append(
"__kernel void gmres_coo_prod( \n");
 
 1452   source.append(
"          __global const uint2 * coords,  \n");
 
 1453   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1454   source.append(
"          __global const uint  * group_boundaries, \n");
 
 1455   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1456   source.append(
"          unsigned int offset_p, \n");
 
 1457   source.append(
"          __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1458   source.append(
"          unsigned int offset_Ap, \n");
 
 1459   source.append(
"          unsigned int size, \n");
 
 1460   source.append(
"          __local unsigned int * shared_rows, \n");
 
 1461   source.append(
"          __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
 
 1462   source.append(
"          __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1463   source.append(
"          unsigned int buffer_size, \n");
 
 1464   source.append(
"          __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1465   source.append(
"          __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
 1466   source.append(
"{ \n");
 
 1467   source.append(
"  cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
 
 1468   source.append(
"} \n \n");
 
 1473 template <
typename StringType>
 
 1476   source.append(
"__kernel void gmres_ell_prod( \n");
 
 1477   source.append(
"  __global const unsigned int * coords, \n");
 
 1478   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1479   source.append(
"  unsigned int internal_row_num, \n");
 
 1480   source.append(
"  unsigned int items_per_row, \n");
 
 1481   source.append(
"  unsigned int aligned_items_per_row, \n");
 
 1482   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1483   source.append(
"  unsigned int offset_p, \n");
 
 1484   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1485   source.append(
"  unsigned int offset_Ap, \n");
 
 1486   source.append(
"  unsigned int size, \n");
 
 1487   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1488   source.append(
"  unsigned int buffer_size, \n");
 
 1489   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1490   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
 1491   source.append(
"{ \n");
 
 1492   source.append(
"  cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
 
 1493   source.append(
"} \n \n");
 
 1496 template <
typename StringType>
 
 1499   source.append(
"__kernel void gmres_sliced_ell_prod( \n");
 
 1500   source.append(
"  __global const unsigned int * columns_per_block, \n");
 
 1501   source.append(
"  __global const unsigned int * column_indices, \n");
 
 1502   source.append(
"  __global const unsigned int * block_start, \n");
 
 1503   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1504   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1505   source.append(
"  unsigned int offset_p, \n");
 
 1506   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1507   source.append(
"  unsigned int offset_Ap, \n");
 
 1508   source.append(
"  unsigned int size, \n");
 
 1509   source.append(
"  unsigned int block_size, \n");
 
 1510   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1511   source.append(
"  unsigned int buffer_size, \n");
 
 1512   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1513   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
 1514   source.append(
"{ \n");
 
 1515   source.append(
"  cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, block_size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
 
 1516   source.append(
"} \n \n");
 
 1519 template <
typename StringType>
 
 1522   source.append(
"__kernel void gmres_hyb_prod( \n");
 
 1523   source.append(
"  const __global int* ell_coords, \n");
 
 1524   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
 
 1525   source.append(
"  const __global uint* csr_rows, \n");
 
 1526   source.append(
"  const __global uint* csr_cols, \n");
 
 1527   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
 
 1528   source.append(
"  unsigned int internal_row_num, \n");
 
 1529   source.append(
"  unsigned int items_per_row, \n");
 
 1530   source.append(
"  unsigned int aligned_items_per_row, \n");
 
 1531   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * p, \n");
 
 1532   source.append(
"  unsigned int offset_p, \n");
 
 1533   source.append(
"  __global "); source.append(numeric_string); source.append(
" * Ap, \n");
 
 1534   source.append(
"  unsigned int offset_Ap, \n");
 
 1535   source.append(
"  unsigned int size, \n");
 
 1536   source.append(
"  __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
 
 1537   source.append(
"  unsigned int buffer_size, \n");
 
 1538   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
 
 1539   source.append(
"  __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
 
 1540   source.append(
"{ \n");
 
 1541   source.append(
"  cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
 
 1542   source.append(
"} \n \n");
 
 1552 template<
typename NumericT>
 
 1562     static std::map<cl_context, bool> init_done;
 
 1569       source.reserve(1024);
 
 1571       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
 1605       #ifdef VIENNACL_BUILD_INFO 
 1606       std::cout << 
"Creating program " << prog_name << std::endl;
 
 1608       ctx.add_program(source, prog_name);
 
 1609       init_done[ctx.handle().get()] = 
true;
 
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers. 
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_gmres_blocked_prod(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
Some helper routines for reading/writing/printing scheduler expressions. 
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities. 
void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_pipelined_gmres_gram_schmidt_stage1(StringType &source, std::string const &numeric_string, bool is_nvidia)
void generate_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
void generate_hyb_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_normalize_vk(StringType &source, std::string const &numeric_string)
void generate_pipelined_gmres_gram_schmidt_stage2(StringType &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_cg_blocked_prod(StringT &source, std::string const &numeric_string, unsigned int subwarp_size)
void generate_compressed_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
static void apply(viennacl::ocl::context const &)
void generate_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
const OCL_TYPE & get() const 
void generate_compressed_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_vector_update(StringT &source, std::string const &numeric_string)
void generate_coordinate_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_bicgstab_blocked_prod(StringT &source, std::string const &numeric_string, unsigned int subwarp_size)
void generate_coordinate_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'. 
Proxy classes for vectors. 
void generate_hyb_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
void generate_hyb_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL. 
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_update_s(StringT &source, std::string const &numeric_string)
void generate_pipelined_cg_vector_update(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_update_result(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation. 
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)