1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP 
   41 template<
typename StringT>
 
   44   source.append(
"__kernel void block_trans_lu_backward( \n");
 
   45   source.append(
"  __global const unsigned int * row_jumper_U,  \n");     
 
   46   source.append(
"  __global const unsigned int * column_indices_U, \n");
 
   47   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements_U, \n");
 
   48   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * diagonal_U, \n");
 
   49   source.append(
"  __global const unsigned int * block_offsets, \n");
 
   50   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
   51   source.append(
"  unsigned int size) \n");
 
   52   source.append(
"{ \n");
 
   53   source.append(
"  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
 
   54   source.append(
"  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
 
   55   source.append(
"  unsigned int row_start; \n");
 
   56   source.append(
"  unsigned int row_stop; \n");
 
   57   source.append(
"  "); source.append(numeric_string); source.append(
" result_entry = 0; \n");
 
   59   source.append(
"  if (col_start >= col_stop) \n");
 
   60   source.append(
"    return; \n");
 
   63   source.append(
"  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
 
   64   source.append(
"  { \n");
 
   65   source.append(
"    unsigned int col = (col_stop - iter) - 1; \n");
 
   66   source.append(
"    result_entry = result[col] / diagonal_U[col]; \n");
 
   67   source.append(
"    row_start = row_jumper_U[col]; \n");
 
   68   source.append(
"    row_stop  = row_jumper_U[col + 1]; \n");
 
   69   source.append(
"    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
 
   70   source.append(
"      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
 
   71   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
   72   source.append(
"  } \n");
 
   75   source.append(
"  for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
 
   76   source.append(
"    result[col] /= diagonal_U[col]; \n");
 
   77   source.append(
"} \n");
 
   80 template<
typename StringT>
 
   83   source.append(
"__kernel void block_trans_unit_lu_forward( \n");
 
   84   source.append(
"  __global const unsigned int * row_jumper_L,  \n");     
 
   85   source.append(
"  __global const unsigned int * column_indices_L, \n");
 
   86   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements_L, \n");
 
   87   source.append(
"  __global const unsigned int * block_offsets, \n");
 
   88   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
   89   source.append(
"  unsigned int size) \n");
 
   90   source.append(
"{ \n");
 
   91   source.append(
"  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
 
   92   source.append(
"  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
 
   93   source.append(
"  unsigned int row_start = row_jumper_L[col_start]; \n");
 
   94   source.append(
"  unsigned int row_stop; \n");
 
   95   source.append(
"  "); source.append(numeric_string); source.append(
" result_entry = 0; \n");
 
   97   source.append(
"  if (col_start >= col_stop) \n");
 
   98   source.append(
"    return; \n");
 
  101   source.append(
"  for (unsigned int col = col_start; col < col_stop; ++col) \n");
 
  102   source.append(
"  { \n");
 
  103   source.append(
"    result_entry = result[col]; \n");
 
  104   source.append(
"    row_stop = row_jumper_L[col + 1]; \n");
 
  105   source.append(
"    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
 
  106   source.append(
"      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
 
  107   source.append(
"    row_start = row_stop; \n"); 
 
  108   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  109   source.append(
"  } \n");
 
  111   source.append(
"}; \n");
 
  117   template<
typename StringT>
 
  119                                                     bool B_transposed, 
bool B_row_major, 
bool C_row_major)
 
  121     source.append(
"__kernel void ");
 
  123     source.append(
"( \n");
 
  124     source.append(
"  __global const unsigned int * sp_mat_row_indices, \n");
 
  125     source.append(
"  __global const unsigned int * sp_mat_col_indices, \n");
 
  126     source.append(
"  __global const "); source.append(numeric_string); source.append(
" * sp_mat_elements, \n");
 
  127     source.append(
"  __global const "); source.append(numeric_string); source.append(
" * d_mat, \n");
 
  128     source.append(
"  unsigned int d_mat_row_start, \n");
 
  129     source.append(
"  unsigned int d_mat_col_start, \n");
 
  130     source.append(
"  unsigned int d_mat_row_inc, \n");
 
  131     source.append(
"  unsigned int d_mat_col_inc, \n");
 
  132     source.append(
"  unsigned int d_mat_row_size, \n");
 
  133     source.append(
"  unsigned int d_mat_col_size, \n");
 
  134     source.append(
"  unsigned int d_mat_internal_rows, \n");
 
  135     source.append(
"  unsigned int d_mat_internal_cols, \n");
 
  136     source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  137     source.append(
"  unsigned int result_row_start, \n");
 
  138     source.append(
"  unsigned int result_col_start, \n");
 
  139     source.append(
"  unsigned int result_row_inc, \n");
 
  140     source.append(
"  unsigned int result_col_inc, \n");
 
  141     source.append(
"  unsigned int result_row_size, \n");
 
  142     source.append(
"  unsigned int result_col_size, \n");
 
  143     source.append(
"  unsigned int result_internal_rows, \n");
 
  144     source.append(
"  unsigned int result_internal_cols) { \n");
 
  147     source.append(
"  for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
 
  149     source.append(
"    unsigned int row_start = sp_mat_row_indices[row]; \n");
 
  150     source.append(
"    unsigned int row_end = sp_mat_row_indices[row+1]; \n");
 
  153     source.append(
"    for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
 
  155     source.append(
"      "); source.append(numeric_string); source.append(
" r = 0; \n");
 
  157     source.append(
"      for (unsigned int k = row_start; k < row_end; k ++) { \n");
 
  159     source.append(
"        unsigned int j = sp_mat_col_indices[k]; \n");
 
  160     source.append(
"        "); source.append(numeric_string); source.append(
" x = sp_mat_elements[k]; \n");
 
  162     source.append(
"        "); source.append(numeric_string);
 
  163     if (B_transposed && B_row_major)
 
  164       source.append(
" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +   j * d_mat_col_inc ]; \n");
 
  165     else if (B_transposed && !B_row_major)
 
  166       source.append(
" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start +  j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
 
  167     else if (!B_transposed && B_row_major)
 
  168       source.append(
" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
 
  170       source.append(
" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
 
  171     source.append(
"        r += x * y; \n");
 
  172     source.append(
"      } \n");
 
  175       source.append(
"      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
 
  177       source.append(
"      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
 
  178     source.append(
"    } \n");
 
  179     source.append(
"  } \n");
 
  181     source.append(
"} \n");
 
  185 template<
typename StringT>
 
  199 template<
typename StringT>
 
  203  source.append(
" __kernel void jacobi( \n");
 
  204  source.append(
"  __global const unsigned int * row_indices, \n");
 
  205  source.append(
"  __global const unsigned int * column_indices, \n");
 
  206  source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  207  source.append(
"  "); source.append(numeric_string); source.append(
" weight, \n");
 
  208  source.append(
"  __global const "); source.append(numeric_string); source.append(
" * old_result, \n");
 
  209  source.append(
"  __global "); source.append(numeric_string); source.append(
" * new_result, \n");
 
  210  source.append(
"  __global const "); source.append(numeric_string); source.append(
" * rhs, \n");
 
  211  source.append(
"  unsigned int size) \n");
 
  212  source.append(
"  { \n");
 
  213  source.append(
"   "); source.append(numeric_string); source.append(
" sum, diag=1; \n");
 
  214  source.append(
"   int col; \n");
 
  215  source.append(
"   for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
 
  216  source.append(
"   { \n");
 
  217  source.append(
"     sum = 0; \n");
 
  218  source.append(
"     for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
 
  219  source.append(
"     { \n");
 
  220  source.append(
"       col = column_indices[j]; \n");
 
  221  source.append(
"       if (i == col) \n");
 
  222  source.append(
"         diag = elements[j]; \n");
 
  223  source.append(
"       else \n");
 
  224  source.append(
"         sum += elements[j] * old_result[col]; \n");
 
  225  source.append(
"     } \n");
 
  226  source.append(
"     new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
 
  227  source.append(
"    } \n");
 
  228  source.append(
"  } \n");
 
  233 template<
typename StringT>
 
  237   source.append(
"__kernel void lu_backward( \n");
 
  238   source.append(
"  __global const unsigned int * row_indices, \n");
 
  239   source.append(
"  __global const unsigned int * column_indices, \n");
 
  240   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  241   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  242   source.append(
"  unsigned int size) \n");
 
  243   source.append(
"{ \n");
 
  244   source.append(
"  __local unsigned int col_index_buffer[128]; \n");
 
  245   source.append(
"  __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
 
  246   source.append(
"  __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
 
  248   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  249   source.append(
"  unsigned int current_row = size-1; \n");
 
  250   source.append(
"  unsigned int row_at_window_start = size-1; \n");
 
  251   source.append(
"  "); source.append(numeric_string); source.append(
" current_vector_entry = vector[size-1]; \n");
 
  252   source.append(
"  "); source.append(numeric_string); source.append(
" diagonal_entry = 0; \n");
 
  253   source.append(
"  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
 
  254   source.append(
"  unsigned int next_row = row_indices[size-1]; \n");
 
  256   source.append(
"  unsigned int i = loop_end + get_local_id(0); \n");
 
  257   source.append(
"  while (1) \n");
 
  258   source.append(
"  { \n");
 
  260   source.append(
"    if (i < nnz) \n");
 
  261   source.append(
"    { \n");
 
  262   source.append(
"      element_buffer[get_local_id(0)] = elements[i]; \n");
 
  263   source.append(
"      unsigned int tmp = column_indices[i]; \n");
 
  264   source.append(
"      col_index_buffer[get_local_id(0)] = tmp; \n");
 
  265   source.append(
"      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
 
  266   source.append(
"    } \n");
 
  268   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  271   source.append(
"    if (get_local_id(0) == 0) \n");
 
  272   source.append(
"    { \n");
 
  274   source.append(
"      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
 
  275   source.append(
"      { \n");
 
  276   source.append(
"        unsigned int k = (get_local_size(0) - k2) - 1; \n");
 
  278   source.append(
"        if (i+k >= nnz) \n");
 
  279   source.append(
"          continue; \n");
 
  281   source.append(
"        if (col_index_buffer[k] > row_at_window_start) \n"); 
 
  282   source.append(
"          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
 
  283   source.append(
"        else if (col_index_buffer[k] > current_row) \n"); 
 
  284   source.append(
"          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
 
  285   source.append(
"        else if (col_index_buffer[k] == current_row) \n");
 
  286   source.append(
"          diagonal_entry = element_buffer[k]; \n");
 
  288   source.append(
"        if (i+k == next_row) \n"); 
 
  289   source.append(
"        { \n");
 
  290   source.append(
"          vector[current_row] = current_vector_entry / diagonal_entry; \n");
 
  291   source.append(
"          if (current_row > 0) //load next row's data \n");
 
  292   source.append(
"          { \n");
 
  293   source.append(
"            --current_row; \n");
 
  294   source.append(
"            next_row = row_indices[current_row]; \n");
 
  295   source.append(
"            current_vector_entry = vector[current_row]; \n");
 
  296   source.append(
"          } \n");
 
  297   source.append(
"        } \n");
 
  300   source.append(
"      } \n"); 
 
  302   source.append(
"      row_at_window_start = current_row; \n");
 
  303   source.append(
"    } \n"); 
 
  305   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  307   source.append(
"    if (i < get_local_size(0)) \n");
 
  308   source.append(
"      break; \n");
 
  310   source.append(
"    i -= get_local_size(0); \n");
 
  311   source.append(
"  } \n"); 
 
  312   source.append(
"} \n");
 
  316 template<
typename StringT>
 
  321   source.append(
"__kernel void lu_forward( \n");
 
  322   source.append(
"  __global const unsigned int * row_indices, \n");
 
  323   source.append(
"  __global const unsigned int * column_indices, \n");
 
  324   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  325   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  326   source.append(
"  unsigned int size) \n");
 
  327   source.append(
"{ \n");
 
  328   source.append(
"  __local unsigned int col_index_buffer[128]; \n");
 
  329   source.append(
"  __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
 
  330   source.append(
"  __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
 
  332   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  333   source.append(
"  unsigned int current_row = 0; \n");
 
  334   source.append(
"  unsigned int row_at_window_start = 0; \n");
 
  335   source.append(
"  "); source.append(numeric_string); source.append(
" current_vector_entry = vector[0]; \n");
 
  336   source.append(
"  "); source.append(numeric_string); source.append(
" diagonal_entry; \n");
 
  337   source.append(
"  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
 
  338   source.append(
"  unsigned int next_row = row_indices[1]; \n");
 
  340   source.append(
"  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
 
  341   source.append(
"  { \n");
 
  343   source.append(
"    if (i < nnz) \n");
 
  344   source.append(
"    { \n");
 
  345   source.append(
"      element_buffer[get_local_id(0)] = elements[i]; \n");
 
  346   source.append(
"      unsigned int tmp = column_indices[i]; \n");
 
  347   source.append(
"      col_index_buffer[get_local_id(0)] = tmp; \n");
 
  348   source.append(
"      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
 
  349   source.append(
"    } \n");
 
  351   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  354   source.append(
"    if (get_local_id(0) == 0) \n");
 
  355   source.append(
"    { \n");
 
  357   source.append(
"      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
 
  358   source.append(
"      { \n");
 
  359   source.append(
"        if (current_row < size && i+k == next_row) \n"); 
 
  360   source.append(
"        { \n");
 
  361   source.append(
"          vector[current_row] = current_vector_entry / diagonal_entry; \n");
 
  362   source.append(
"          ++current_row; \n");
 
  363   source.append(
"          if (current_row < size) \n"); 
 
  364   source.append(
"          { \n");
 
  365   source.append(
"            next_row = row_indices[current_row+1]; \n");
 
  366   source.append(
"            current_vector_entry = vector[current_row]; \n");
 
  367   source.append(
"          } \n");
 
  368   source.append(
"        } \n");
 
  370   source.append(
"        if (current_row < size && col_index_buffer[k] < current_row) \n"); 
 
  371   source.append(
"        { \n");
 
  372   source.append(
"          if (col_index_buffer[k] < row_at_window_start) \n"); 
 
  373   source.append(
"            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
 
  374   source.append(
"          else if (col_index_buffer[k] < current_row) \n"); 
 
  375   source.append(
"            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
 
  376   source.append(
"        } \n");
 
  377   source.append(
"        else if (col_index_buffer[k] == current_row) \n");
 
  378   source.append(
"          diagonal_entry = element_buffer[k]; \n");
 
  380   source.append(
"      } \n"); 
 
  382   source.append(
"      row_at_window_start = current_row; \n");
 
  383   source.append(
"    } \n"); 
 
  385   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  386   source.append(
"  } \n"); 
 
  387   source.append(
"} \n");
 
  391 template<
typename StringT>
 
  394   source.append(
"__kernel void row_info_extractor( \n");
 
  395   source.append(
"  __global const unsigned int * row_indices, \n");
 
  396   source.append(
"  __global const unsigned int * column_indices, \n");
 
  397   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  398   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  399   source.append(
"  unsigned int size, \n");
 
  400   source.append(
"  unsigned int option \n");
 
  401   source.append(
"  ) \n");
 
  402   source.append(
"{ \n");
 
  403   source.append(
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
 
  404   source.append(
"  { \n");
 
  405   source.append(
"    "); source.append(numeric_string); source.append(
" value = 0; \n");
 
  406   source.append(
"    unsigned int row_end = row_indices[row+1]; \n");
 
  408   source.append(
"    switch (option) \n");
 
  409   source.append(
"    { \n");
 
  410   source.append(
"      case 0: \n"); 
 
  411   source.append(
"        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
 
  412   source.append(
"          value = max(value, fabs(elements[i])); \n");
 
  413   source.append(
"        break; \n");
 
  415   source.append(
"      case 1: \n"); 
 
  416   source.append(
"        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
 
  417   source.append(
"          value += fabs(elements[i]); \n");
 
  418   source.append(
"        break; \n");
 
  420   source.append(
"      case 2: \n"); 
 
  421   source.append(
"        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
 
  422   source.append(
"          value += elements[i] * elements[i]; \n");
 
  423   source.append(
"        value = sqrt(value); \n");
 
  424   source.append(
"        break; \n");
 
  426   source.append(
"      case 3: \n"); 
 
  427   source.append(
"        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
 
  428   source.append(
"        { \n");
 
  429   source.append(
"          if (column_indices[i] == row) \n");
 
  430   source.append(
"          { \n");
 
  431   source.append(
"            value = elements[i]; \n");
 
  432   source.append(
"            break; \n");
 
  433   source.append(
"          } \n");
 
  434   source.append(
"        } \n");
 
  435   source.append(
"        break; \n");
 
  437   source.append(
"      default: \n");
 
  438   source.append(
"        break; \n");
 
  439   source.append(
"    } \n");
 
  440   source.append(
"    result[row] = value; \n");
 
  441   source.append(
"  } \n");
 
  442   source.append(
"} \n");
 
  446 template<
typename StringT>
 
  451   source.append(
"__kernel void trans_lu_backward( \n");
 
  452   source.append(
"  __global const unsigned int * row_indices, \n");
 
  453   source.append(
"  __global const unsigned int * column_indices, \n");
 
  454   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  455   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * diagonal_entries, \n");
 
  456   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  457   source.append(
"  unsigned int size) \n");
 
  458   source.append(
"{ \n");
 
  459   source.append(
"  __local unsigned int row_index_lookahead[256]; \n");
 
  460   source.append(
"  __local unsigned int row_index_buffer[256]; \n");
 
  462   source.append(
"  unsigned int row_index; \n");
 
  463   source.append(
"  unsigned int col_index; \n");
 
  464   source.append(
"  "); source.append(numeric_string); source.append(
" matrix_entry; \n");
 
  465   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  466   source.append(
"  unsigned int row_at_window_start = size; \n");
 
  467   source.append(
"  unsigned int row_at_window_end; \n");
 
  468   source.append(
"  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
 
  470   source.append(
"  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
 
  471   source.append(
"  { \n");
 
  472   source.append(
"    unsigned int i = (nnz - i2) - 1; \n");
 
  473   source.append(
"    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
 
  474   source.append(
"    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
 
  475   source.append(
"    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
 
  477   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  479   source.append(
"    if (i2 < nnz) \n");
 
  480   source.append(
"    { \n");
 
  481   source.append(
"      unsigned int row_index_dec = 0; \n");
 
  482   source.append(
"      while (row_index_lookahead[row_index_dec] > i) \n");
 
  483   source.append(
"        ++row_index_dec; \n");
 
  484   source.append(
"      row_index = row_at_window_start - row_index_dec; \n");
 
  485   source.append(
"      row_index_buffer[get_local_id(0)] = row_index; \n");
 
  486   source.append(
"    } \n");
 
  487   source.append(
"    else \n");
 
  488   source.append(
"    { \n");
 
  489   source.append(
"      row_index = size+1; \n");
 
  490   source.append(
"      row_index_buffer[get_local_id(0)] = 0; \n");
 
  491   source.append(
"    } \n");
 
  493   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  495   source.append(
"    row_at_window_start = row_index_buffer[0]; \n");
 
  496   source.append(
"    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
 
  499   source.append(
"    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
 
  500   source.append(
"    { \n");
 
  501   source.append(
"      unsigned int row = row_at_window_start - row2; \n");
 
  502   source.append(
"      "); source.append(numeric_string); source.append(
" result_entry = vector[row] / diagonal_entries[row]; \n");
 
  504   source.append(
"      if ( (row_index == row) && (col_index < row) ) \n");
 
  505   source.append(
"        vector[col_index] -= result_entry * matrix_entry; \n");
 
  507   source.append(
"      barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  508   source.append(
"    } \n");
 
  510   source.append(
"    row_at_window_start = row_at_window_end; \n");
 
  511   source.append(
"  } \n");
 
  514   source.append(
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
 
  515   source.append(
"    vector[i] /= diagonal_entries[i]; \n");
 
  516   source.append(
"} \n");
 
  520 template<
typename StringT>
 
  525   source.append(
"__kernel void trans_lu_forward( \n");
 
  526   source.append(
"  __global const unsigned int * row_indices, \n");
 
  527   source.append(
"  __global const unsigned int * column_indices, \n");
 
  528   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  529   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * diagonal_entries, \n");
 
  530   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  531   source.append(
"  unsigned int size) \n");
 
  532   source.append(
"{ \n");
 
  533   source.append(
"  __local unsigned int row_index_lookahead[256]; \n");
 
  534   source.append(
"  __local unsigned int row_index_buffer[256]; \n");
 
  536   source.append(
"  unsigned int row_index; \n");
 
  537   source.append(
"  unsigned int col_index; \n");
 
  538   source.append(
"  "); source.append(numeric_string); source.append(
" matrix_entry; \n");
 
  539   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  540   source.append(
"  unsigned int row_at_window_start = 0; \n");
 
  541   source.append(
"  unsigned int row_at_window_end = 0; \n");
 
  542   source.append(
"  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
 
  544   source.append(
"  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
 
  545   source.append(
"  { \n");
 
  546   source.append(
"    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
 
  547   source.append(
"    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
 
  548   source.append(
"    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
 
  550   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  552   source.append(
"    if (i < nnz) \n");
 
  553   source.append(
"    { \n");
 
  554   source.append(
"      unsigned int row_index_inc = 0; \n");
 
  555   source.append(
"      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
 
  556   source.append(
"        ++row_index_inc; \n");
 
  557   source.append(
"      row_index = row_at_window_start + row_index_inc; \n");
 
  558   source.append(
"      row_index_buffer[get_local_id(0)] = row_index; \n");
 
  559   source.append(
"    } \n");
 
  560   source.append(
"    else \n");
 
  561   source.append(
"    { \n");
 
  562   source.append(
"      row_index = size+1; \n");
 
  563   source.append(
"      row_index_buffer[get_local_id(0)] = size - 1; \n");
 
  564   source.append(
"    } \n");
 
  566   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  568   source.append(
"    row_at_window_start = row_index_buffer[0]; \n");
 
  569   source.append(
"    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
 
  572   source.append(
"    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
 
  573   source.append(
"    { \n");
 
  574   source.append(
"      "); source.append(numeric_string); source.append(
" result_entry = vector[row] / diagonal_entries[row]; \n");
 
  576   source.append(
"      if ( (row_index == row) && (col_index > row) ) \n");
 
  577   source.append(
"        vector[col_index] -= result_entry * matrix_entry; \n");
 
  579   source.append(
"      barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  580   source.append(
"    } \n");
 
  582   source.append(
"    row_at_window_start = row_at_window_end; \n");
 
  583   source.append(
"  } \n");
 
  586   source.append(
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
 
  587   source.append(
"    vector[i] /= diagonal_entries[i]; \n");
 
  588   source.append(
"} \n");
 
  592 template<
typename StringT>
 
  597   source.append(
"__kernel void trans_unit_lu_backward( \n");
 
  598   source.append(
"  __global const unsigned int * row_indices, \n");
 
  599   source.append(
"  __global const unsigned int * column_indices, \n");
 
  600   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  601   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  602   source.append(
"  unsigned int size) \n");
 
  603   source.append(
"{ \n");
 
  604   source.append(
"  __local unsigned int row_index_lookahead[256]; \n");
 
  605   source.append(
"  __local unsigned int row_index_buffer[256]; \n");
 
  607   source.append(
"  unsigned int row_index; \n");
 
  608   source.append(
"  unsigned int col_index; \n");
 
  609   source.append(
"  "); source.append(numeric_string); source.append(
" matrix_entry; \n");
 
  610   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  611   source.append(
"  unsigned int row_at_window_start = size; \n");
 
  612   source.append(
"  unsigned int row_at_window_end; \n");
 
  613   source.append(
"  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
 
  615   source.append(
"  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
 
  616   source.append(
"  { \n");
 
  617   source.append(
"    unsigned int i = (nnz - i2) - 1; \n");
 
  618   source.append(
"    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
 
  619   source.append(
"    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
 
  620   source.append(
"    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
 
  622   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  624   source.append(
"    if (i2 < nnz) \n");
 
  625   source.append(
"    { \n");
 
  626   source.append(
"      unsigned int row_index_dec = 0; \n");
 
  627   source.append(
"      while (row_index_lookahead[row_index_dec] > i) \n");
 
  628   source.append(
"        ++row_index_dec; \n");
 
  629   source.append(
"      row_index = row_at_window_start - row_index_dec; \n");
 
  630   source.append(
"      row_index_buffer[get_local_id(0)] = row_index; \n");
 
  631   source.append(
"    } \n");
 
  632   source.append(
"    else \n");
 
  633   source.append(
"    { \n");
 
  634   source.append(
"      row_index = size+1; \n");
 
  635   source.append(
"      row_index_buffer[get_local_id(0)] = 0; \n");
 
  636   source.append(
"    } \n");
 
  638   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  640   source.append(
"    row_at_window_start = row_index_buffer[0]; \n");
 
  641   source.append(
"    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
 
  644   source.append(
"    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
 
  645   source.append(
"    { \n");
 
  646   source.append(
"      unsigned int row = row_at_window_start - row2; \n");
 
  647   source.append(
"      "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
 
  649   source.append(
"      if ( (row_index == row) && (col_index < row) ) \n");
 
  650   source.append(
"        vector[col_index] -= result_entry * matrix_entry; \n");
 
  652   source.append(
"      barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  653   source.append(
"    } \n");
 
  655   source.append(
"    row_at_window_start = row_at_window_end; \n");
 
  656   source.append(
"  } \n");
 
  657   source.append(
"} \n");
 
  662 template<
typename StringT>
 
  667   source.append(
"__kernel void trans_unit_lu_forward( \n");
 
  668   source.append(
"  __global const unsigned int * row_indices, \n");
 
  669   source.append(
"  __global const unsigned int * column_indices, \n");
 
  670   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  671   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  672   source.append(
"  unsigned int size) \n");
 
  673   source.append(
"{ \n");
 
  674   source.append(
"  __local unsigned int row_index_lookahead[256]; \n");
 
  675   source.append(
"  __local unsigned int row_index_buffer[256]; \n");
 
  677   source.append(
"  unsigned int row_index; \n");
 
  678   source.append(
"  unsigned int col_index; \n");
 
  679   source.append(
"  "); source.append(numeric_string); source.append(
" matrix_entry; \n");
 
  680   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  681   source.append(
"  unsigned int row_at_window_start = 0; \n");
 
  682   source.append(
"  unsigned int row_at_window_end = 0; \n");
 
  683   source.append(
"  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
 
  685   source.append(
"  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
 
  686   source.append(
"  { \n");
 
  687   source.append(
"    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
 
  688   source.append(
"    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
 
  689   source.append(
"    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
 
  691   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  693   source.append(
"    if (i < nnz) \n");
 
  694   source.append(
"    { \n");
 
  695   source.append(
"      unsigned int row_index_inc = 0; \n");
 
  696   source.append(
"      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
 
  697   source.append(
"        ++row_index_inc; \n");
 
  698   source.append(
"      row_index = row_at_window_start + row_index_inc; \n");
 
  699   source.append(
"      row_index_buffer[get_local_id(0)] = row_index; \n");
 
  700   source.append(
"    } \n");
 
  701   source.append(
"    else \n");
 
  702   source.append(
"    { \n");
 
  703   source.append(
"      row_index = size+1; \n");
 
  704   source.append(
"      row_index_buffer[get_local_id(0)] = size - 1; \n");
 
  705   source.append(
"    } \n");
 
  707   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  709   source.append(
"    row_at_window_start = row_index_buffer[0]; \n");
 
  710   source.append(
"    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
 
  713   source.append(
"    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
 
  714   source.append(
"    { \n");
 
  715   source.append(
"      "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
 
  717   source.append(
"      if ( (row_index == row) && (col_index > row) ) \n");
 
  718   source.append(
"        vector[col_index] -= result_entry * matrix_entry; \n");
 
  720   source.append(
"      barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  721   source.append(
"    } \n");
 
  723   source.append(
"    row_at_window_start = row_at_window_end; \n");
 
  724   source.append(
"  } \n");
 
  725   source.append(
"} \n");
 
  729 template<
typename StringT>
 
  734   source.append(
"__kernel void trans_unit_lu_forward_slow( \n");
 
  735   source.append(
"  __global const unsigned int * row_indices, \n");
 
  736   source.append(
"  __global const unsigned int * column_indices, \n");
 
  737   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  738   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  739   source.append(
"  unsigned int size) \n");
 
  740   source.append(
"{ \n");
 
  741   source.append(
"  for (unsigned int row = 0; row < size; ++row) \n");
 
  742   source.append(
"  { \n");
 
  743   source.append(
"    "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
 
  745   source.append(
"    unsigned int row_start = row_indices[row]; \n");
 
  746   source.append(
"    unsigned int row_stop  = row_indices[row + 1]; \n");
 
  747   source.append(
"    for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
 
  748   source.append(
"    { \n");
 
  749   source.append(
"      unsigned int col_index = column_indices[entry_index]; \n");
 
  750   source.append(
"      if (col_index > row) \n");
 
  751   source.append(
"        vector[col_index] -= result_entry * elements[entry_index]; \n");
 
  752   source.append(
"    } \n");
 
  754   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  755   source.append(
"  } \n");
 
  756   source.append(
"} \n");
 
  760 template<
typename StringT>
 
  765   source.append(
"__kernel void unit_lu_backward( \n");
 
  766   source.append(
"  __global const unsigned int * row_indices, \n");
 
  767   source.append(
"  __global const unsigned int * column_indices, \n");
 
  768   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  769   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  770   source.append(
"  unsigned int size) \n");
 
  771   source.append(
"{ \n");
 
  772   source.append(
"  __local  unsigned int col_index_buffer[128]; \n");
 
  773   source.append(
"  __local  "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
 
  774   source.append(
"  __local  "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
 
  776   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  777   source.append(
"  unsigned int current_row = size-1; \n");
 
  778   source.append(
"  unsigned int row_at_window_start = size-1; \n");
 
  779   source.append(
"  "); source.append(numeric_string); source.append(
" current_vector_entry = vector[size-1]; \n");
 
  780   source.append(
"  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
 
  781   source.append(
"  unsigned int next_row = row_indices[size-1]; \n");
 
  783   source.append(
"  unsigned int i = loop_end + get_local_id(0); \n");
 
  784   source.append(
"  while (1) \n");
 
  785   source.append(
"  { \n");
 
  787   source.append(
"    if (i < nnz) \n");
 
  788   source.append(
"    { \n");
 
  789   source.append(
"      element_buffer[get_local_id(0)] = elements[i]; \n");
 
  790   source.append(
"      unsigned int tmp = column_indices[i]; \n");
 
  791   source.append(
"      col_index_buffer[get_local_id(0)] = tmp; \n");
 
  792   source.append(
"      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
 
  793   source.append(
"    } \n");
 
  795   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  798   source.append(
"    if (get_local_id(0) == 0) \n");
 
  799   source.append(
"    { \n");
 
  801   source.append(
"      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
 
  802   source.append(
"      { \n");
 
  803   source.append(
"        unsigned int k = (get_local_size(0) - k2) - 1; \n");
 
  805   source.append(
"        if (i+k >= nnz) \n");
 
  806   source.append(
"          continue; \n");
 
  808   source.append(
"        if (col_index_buffer[k] > row_at_window_start) \n"); 
 
  809   source.append(
"          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
 
  810   source.append(
"        else if (col_index_buffer[k] > current_row) \n"); 
 
  811   source.append(
"          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
 
  813   source.append(
"        if (i+k == next_row) \n"); 
 
  814   source.append(
"        { \n");
 
  815   source.append(
"          vector[current_row] = current_vector_entry; \n");
 
  816   source.append(
"          if (current_row > 0) \n"); 
 
  817   source.append(
"          { \n");
 
  818   source.append(
"            --current_row; \n");
 
  819   source.append(
"            next_row = row_indices[current_row]; \n");
 
  820   source.append(
"            current_vector_entry = vector[current_row]; \n");
 
  821   source.append(
"          } \n");
 
  822   source.append(
"        } \n");
 
  825   source.append(
"      } \n"); 
 
  827   source.append(
"      row_at_window_start = current_row; \n");
 
  828   source.append(
"    } \n"); 
 
  830   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  832   source.append(
"    if (i < get_local_size(0)) \n");
 
  833   source.append(
"      break; \n");
 
  835   source.append(
"    i -= get_local_size(0); \n");
 
  836   source.append(
"  } \n"); 
 
  837   source.append(
"} \n");
 
  841 template<
typename StringT>
 
  846   source.append(
"__kernel void unit_lu_forward( \n");
 
  847   source.append(
"  __global const unsigned int * row_indices, \n");
 
  848   source.append(
"  __global const unsigned int * column_indices, \n");
 
  849   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  850   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vector, \n");
 
  851   source.append(
"  unsigned int size) \n");
 
  852   source.append(
"{ \n");
 
  853   source.append(
"  __local  unsigned int col_index_buffer[128]; \n");
 
  854   source.append(
"  __local  "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
 
  855   source.append(
"  __local  "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
 
  857   source.append(
"  unsigned int nnz = row_indices[size]; \n");
 
  858   source.append(
"  unsigned int current_row = 0; \n");
 
  859   source.append(
"  unsigned int row_at_window_start = 0; \n");
 
  860   source.append(
"  "); source.append(numeric_string); source.append(
" current_vector_entry = vector[0]; \n");
 
  861   source.append(
"  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
 
  862   source.append(
"  unsigned int next_row = row_indices[1]; \n");
 
  864   source.append(
"  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
 
  865   source.append(
"  { \n");
 
  867   source.append(
"    if (i < nnz) \n");
 
  868   source.append(
"    { \n");
 
  869   source.append(
"      element_buffer[get_local_id(0)] = elements[i]; \n");
 
  870   source.append(
"      unsigned int tmp = column_indices[i]; \n");
 
  871   source.append(
"      col_index_buffer[get_local_id(0)] = tmp; \n");
 
  872   source.append(
"      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
 
  873   source.append(
"    } \n");
 
  875   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  878   source.append(
"    if (get_local_id(0) == 0) \n");
 
  879   source.append(
"    { \n");
 
  881   source.append(
"      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
 
  882   source.append(
"      { \n");
 
  883   source.append(
"        if (i+k == next_row) \n"); 
 
  884   source.append(
"        { \n");
 
  885   source.append(
"          vector[current_row] = current_vector_entry; \n");
 
  886   source.append(
"          ++current_row; \n");
 
  887   source.append(
"          if (current_row < size) //load next row's data \n");
 
  888   source.append(
"          { \n");
 
  889   source.append(
"            next_row = row_indices[current_row+1]; \n");
 
  890   source.append(
"            current_vector_entry = vector[current_row]; \n");
 
  891   source.append(
"          } \n");
 
  892   source.append(
"        } \n");
 
  894   source.append(
"        if (current_row < size && col_index_buffer[k] < current_row) \n"); 
 
  895   source.append(
"        { \n");
 
  896   source.append(
"          if (col_index_buffer[k] < row_at_window_start) \n"); 
 
  897   source.append(
"            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
 
  898   source.append(
"          else if (col_index_buffer[k] < current_row) \n"); 
 
  899   source.append(
"            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
 
  900   source.append(
"        } \n");
 
  902   source.append(
"      } \n"); 
 
  904   source.append(
"      row_at_window_start = current_row; \n");
 
  905   source.append(
"    } \n"); 
 
  907   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  908   source.append(
"  } //for i \n");
 
  909   source.append(
"} \n");
 
  913 template<
typename StringT>
 
  916   std::stringstream ss;
 
  920     source.append(
"__kernel void vec_mul_nvidia_alpha_beta( \n");
 
  922     source.append(
"__kernel void vec_mul_nvidia( \n");
 
  923   source.append(
"    __global const unsigned int * row_indices, \n");
 
  924   source.append(
"    __global const unsigned int * column_indices, \n");
 
  925   source.append(
"  __global const unsigned int * row_blocks, \n");
 
  926   source.append(
"    __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  927   source.append(
"  unsigned int num_blocks, \n");
 
  928   source.append(
"    __global const "); source.append(numeric_string); source.append(
" * x, \n");
 
  929   source.append(
"    uint4 layout_x, \n");
 
  930   if (with_alpha_beta) { source.append(
"    "); source.append(numeric_string); source.append(
" alpha, \n"); }
 
  931   source.append(
"    __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  932   source.append(
"    uint4 layout_result \n");
 
  933   if (with_alpha_beta) { source.append(
"    , "); source.append(numeric_string); source.append(
" beta \n"); }
 
  934   source.append(
") { \n");
 
  935   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_elements[256]; \n");
 
  937   source.append(
"  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + 
"; \n");
 
  938   source.append(
"  const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
 
  939   source.append(
"  const unsigned int block_start = get_group_id(0) * block_increment; \n");
 
  940   source.append(
"  const unsigned int block_stop  = min(block_start + block_increment, layout_result.z); \n");
 
  942   source.append(
"  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + 
"; \n");
 
  943   source.append(
"                    row  < block_stop; \n");
 
  944   source.append(
"                    row += get_local_size(0) / " + ss.str() + 
") \n");
 
  945   source.append(
"  { \n");
 
  946   source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  947   source.append(
"    unsigned int row_end = row_indices[row+1]; \n");
 
  948   source.append(
"    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + 
") \n");
 
  949   source.append(
"      dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
 
  951   source.append(
"    shared_elements[get_local_id(0)] = dot_prod; \n");
 
  952   source.append(
"    #pragma unroll \n");
 
  953   source.append(
"    for (unsigned int k = 1; k < " + ss.str() + 
"; k *= 2) \n");
 
  954   source.append(
"      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
 
  956   source.append(
"    if (id_in_row == 0) \n");
 
  958     source.append(
"      result[row * layout_result.y + layout_result.x] = alpha * shared_elements[get_local_id(0)] + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
 
  960     source.append(
"      result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
 
  961   source.append(
"  } \n");
 
  962   source.append(
"} \n");
 
  966 template<
typename StringT>
 
  970     source.append(
"__kernel void vec_mul_alpha_beta( \n");
 
  972     source.append(
"__kernel void vec_mul( \n");
 
  973   source.append(
"  __global const unsigned int * row_indices, \n");
 
  974   source.append(
"  __global const unsigned int * column_indices, \n");
 
  975   source.append(
"  __global const unsigned int * row_blocks, \n");
 
  976   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
  977   source.append(
"  unsigned int num_blocks, \n");
 
  978   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * x, \n");
 
  979   source.append(
"  uint4 layout_x, \n");
 
  980   if (with_alpha_beta) { source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n"); }
 
  981   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  982   source.append(
"  uint4 layout_result \n");
 
  983   if (with_alpha_beta) { source.append(
"  , "); source.append(numeric_string); source.append(
" beta \n"); }
 
  984   source.append(
") { \n");
 
  985   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_elements[1024]; \n");
 
  987   source.append(
"  unsigned int row_start = row_blocks[get_group_id(0)]; \n");
 
  988   source.append(
"  unsigned int row_stop  = row_blocks[get_group_id(0) + 1]; \n");
 
  989   source.append(
"  unsigned int rows_to_process = row_stop - row_start; \n");
 
  990   source.append(
"  unsigned int element_start = row_indices[row_start]; \n");
 
  991   source.append(
"  unsigned int element_stop = row_indices[row_stop]; \n");
 
  993   source.append(
"  if (rows_to_process > 4) { \n"); 
 
  995   source.append(
"    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
 
  996   source.append(
"      shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
 
  998   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1001   source.append(
"    for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
 
 1002   source.append(
"      "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
 1003   source.append(
"      unsigned int thread_row_start = row_indices[row]     - element_start; \n");
 
 1004   source.append(
"      unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
 
 1005   source.append(
"      for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
 
 1006   source.append(
"        dot_prod += shared_elements[i]; \n");
 
 1007   if (with_alpha_beta)
 
 1008     source.append(
"      result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
 
 1010     source.append(
"      result[row * layout_result.y + layout_result.x] = dot_prod; \n");
 
 1011   source.append(
"    } \n");
 
 1012   source.append(
"  } \n");
 
 1015   source.append(
"  else if (rows_to_process > 1) \n"); 
 
 1016   source.append(
"  {\n");
 
 1018   source.append(
"    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
 
 1019   source.append(
"      shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");
 
 1021   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1024   source.append(
"    for (unsigned int row = row_start; row < row_stop; ++row)\n");
 
 1025   source.append(
"    {\n");
 
 1026   source.append(
"      unsigned int current_row_start = row_indices[row]     - element_start;\n");
 
 1027   source.append(
"      unsigned int current_row_stop  = row_indices[row + 1] - element_start;\n");
 
 1028   source.append(
"      unsigned int thread_base_id  = current_row_start + get_local_id(0);\n");
 
 1031   source.append(
"      for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
 
 1032   source.append(
"        shared_elements[thread_base_id] += shared_elements[j];\n");
 
 1035   source.append(
"      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
 
 1036   source.append(
"      {\n");
 
 1037   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE);\n");
 
 1038   source.append(
"        if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
 
 1039   source.append(
"          shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
 
 1040   source.append(
"      }\n");
 
 1041   source.append(
"      "); source.append(numeric_string); source.append(
" row_result = 0; \n");
 
 1042   source.append(
"      if (current_row_stop > current_row_start)\n");
 
 1043   source.append(
"        row_result = shared_elements[current_row_start]; \n");
 
 1044   source.append(
"      if (get_local_id(0) == 0)\n");
 
 1045   if (with_alpha_beta)
 
 1046     source.append(
"        result[row * layout_result.y + layout_result.x] = alpha * row_result + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0);\n");
 
 1048     source.append(
"        result[row * layout_result.y + layout_result.x] = row_result;\n");
 
 1049   source.append(
"    }\n");
 
 1050   source.append(
"  }\n");
 
 1053   source.append(
"  else  \n"); 
 
 1054   source.append(
"  { \n");
 
 1056   source.append(
"    shared_elements[get_local_id(0)] = 0; \n");
 
 1057   source.append(
"    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
 
 1058   source.append(
"      shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
 
 1061   source.append(
"    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
 
 1062   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1063   source.append(
"      if (get_local_id(0) < stride) \n");
 
 1064   source.append(
"        shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
 
 1065   source.append(
"    } \n");
 
 1067   source.append(
"    if (get_local_id(0) == 0) \n");
 
 1068   if (with_alpha_beta)
 
 1069     source.append(
"      result[row_start * layout_result.y + layout_result.x] = alpha * shared_elements[0] + ((beta != 0) ? beta * result[row_start * layout_result.y + layout_result.x] : 0); \n");
 
 1071     source.append(
"      result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
 
 1072   source.append(
"  } \n");
 
 1074   source.append(
"} \n");
 
 1079 template<
typename StringT>
 
 1082   if (with_alpha_beta)
 
 1083     source.append(
"__kernel void vec_mul4_alpha_beta( \n");
 
 1085     source.append(
"__kernel void vec_mul4( \n");
 
 1086   source.append(
"  __global const unsigned int * row_indices, \n");
 
 1087   source.append(
"  __global const uint4 * column_indices, \n");
 
 1088   source.append(
"  __global const "); source.append(numeric_string); source.append(
"4 * elements, \n");
 
 1089   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * x, \n");
 
 1090   source.append(
"  uint4 layout_x, \n");
 
 1091   if (with_alpha_beta) { source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n"); }
 
 1092   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
 1093   source.append(
"  uint4 layout_result \n");
 
 1094   if (with_alpha_beta) { source.append(
"  , "); source.append(numeric_string); source.append(
" beta \n"); }
 
 1095   source.append(
") { \n");
 
 1096   source.append(
"  "); source.append(numeric_string); source.append(
" dot_prod; \n");
 
 1097   source.append(
"  unsigned int start, next_stop; \n");
 
 1098   source.append(
"  uint4 col_idx; \n");
 
 1099   source.append(
"  "); source.append(numeric_string); source.append(
"4 tmp_vec; \n");
 
 1100   source.append(
"  "); source.append(numeric_string); source.append(
"4 tmp_entries; \n");
 
 1102   source.append(
"  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
 
 1103   source.append(
"  { \n");
 
 1104   source.append(
"    dot_prod = 0; \n");
 
 1105   source.append(
"    start = row_indices[row] / 4; \n");
 
 1106   source.append(
"    next_stop = row_indices[row+1] / 4; \n");
 
 1108   source.append(
"    for (unsigned int i = start; i < next_stop; ++i) \n");
 
 1109   source.append(
"    { \n");
 
 1110   source.append(
"      col_idx = column_indices[i]; \n");
 
 1112   source.append(
"      tmp_entries = elements[i]; \n");
 
 1113   source.append(
"      tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
 
 1114   source.append(
"      tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
 
 1115   source.append(
"      tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
 
 1116   source.append(
"      tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
 
 1118   source.append(
"      dot_prod += dot(tmp_entries, tmp_vec); \n");
 
 1119   source.append(
"    } \n");
 
 1120   if (with_alpha_beta)
 
 1121     source.append(
"    result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
 
 1123     source.append(
"    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
 
 1124   source.append(
"  } \n");
 
 1125   source.append(
"} \n");
 
 1128 template<
typename StringT>
 
 1131   if (with_alpha_beta)
 
 1132     source.append(
"__kernel void vec_mul8_alpha_beta( \n");
 
 1134     source.append(
"__kernel void vec_mul8( \n");
 
 1135   source.append(
"  __global const unsigned int * row_indices, \n");
 
 1136   source.append(
"  __global const uint8 * column_indices, \n");
 
 1137   source.append(
"  __global const "); source.append(numeric_string); source.append(
"8 * elements, \n");
 
 1138   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * x, \n");
 
 1139   source.append(
"  uint4 layout_x, \n");
 
 1140   if (with_alpha_beta) { source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n"); }
 
 1141   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
 1142   source.append(
"  uint4 layout_result \n");
 
 1143   if (with_alpha_beta) { source.append(
" , "); source.append(numeric_string); source.append(
" beta \n"); }
 
 1144   source.append(
") { \n");
 
 1145   source.append(
"  "); source.append(numeric_string); source.append(
" dot_prod; \n");
 
 1146   source.append(
"  unsigned int start, next_stop; \n");
 
 1147   source.append(
"  uint8 col_idx; \n");
 
 1148   source.append(
"  "); source.append(numeric_string); source.append(
"8 tmp_vec; \n");
 
 1149   source.append(
"  "); source.append(numeric_string); source.append(
"8 tmp_entries; \n");
 
 1151   source.append(
"  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
 
 1152   source.append(
"  { \n");
 
 1153   source.append(
"    dot_prod = 0; \n");
 
 1154   source.append(
"    start = row_indices[row] / 8; \n");
 
 1155   source.append(
"    next_stop = row_indices[row+1] / 8; \n");
 
 1157   source.append(
"    for (unsigned int i = start; i < next_stop; ++i) \n");
 
 1158   source.append(
"    { \n");
 
 1159   source.append(
"      col_idx = column_indices[i]; \n");
 
 1161   source.append(
"      tmp_entries = elements[i]; \n");
 
 1162   source.append(
"      tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
 
 1163   source.append(
"      tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
 
 1164   source.append(
"      tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
 
 1165   source.append(
"      tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
 
 1166   source.append(
"      tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
 
 1167   source.append(
"      tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
 
 1168   source.append(
"      tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
 
 1169   source.append(
"      tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
 
 1171   source.append(
"      dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
 
 1172   source.append(
"      dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
 
 1173   source.append(
"    } \n");
 
 1174   if (with_alpha_beta)
 
 1175     source.append(
"    result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
 
 1177     source.append(
"    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
 
 1178   source.append(
"  } \n");
 
 1179   source.append(
"} \n");
 
 1182 template<
typename StringT>
 
 1185   source.append(
"__kernel void vec_mul_cpu( \n");
 
 1186   source.append(
"  __global const unsigned int * row_indices, \n");
 
 1187   source.append(
"  __global const unsigned int * column_indices, \n");
 
 1188   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1189   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * vector, \n");
 
 1190   source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n");
 
 1191   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
 1192   source.append(
"  "); source.append(numeric_string); source.append(
" beta, \n");
 
 1193   source.append(
"  unsigned int size) \n");
 
 1194   source.append(
"{ \n");
 
 1195   source.append(
"  unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
 
 1196   source.append(
"  unsigned int row_start = get_global_id(0) * work_per_item; \n");
 
 1197   source.append(
"  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
 
 1198   source.append(
"  for (unsigned int row = row_start; row < row_stop; ++row) \n");
 
 1199   source.append(
"  { \n");
 
 1200   source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = ("); source.append(numeric_string); source.append(
")0; \n");
 
 1201   source.append(
"    unsigned int row_end = row_indices[row+1]; \n");
 
 1202   source.append(
"    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
 
 1203   source.append(
"      dot_prod += elements[i] * vector[column_indices[i]]; \n");
 
 1204   source.append(
"    result[row] = alpha * dot_prod + ((beta != 0) ? beta * result[row] : 0); \n");
 
 1205   source.append(
"  } \n");
 
 1206   source.append(
"} \n");
 
 1216 template<
typename StringT>
 
 1219   source.append(
"__kernel void spgemm_stage1( \n");
 
 1220   source.append(
"  __global const unsigned int * A_row_indices, \n");
 
 1221   source.append(
"  __global const unsigned int * A_column_indices, \n");
 
 1222   source.append(
"  unsigned int A_size1, \n");
 
 1223   source.append(
"  __global unsigned int * group_nnz_array) \n");
 
 1224   source.append(
"{ \n");
 
 1225   source.append(
"  unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
 
 1226   source.append(
"  unsigned int row_start = get_global_id(0) * work_per_item; \n");
 
 1227   source.append(
"  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
 
 1228   source.append(
"  unsigned int max_A_nnz = 0; \n");
 
 1229   source.append(
"  for (unsigned int row = row_start; row < row_stop; ++row) \n");
 
 1230   source.append(
"    max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
 
 1233   source.append(
"  __local unsigned int shared_nnz[256]; \n");
 
 1234   source.append(
"  shared_nnz[get_local_id(0)] = max_A_nnz; \n");
 
 1237   source.append(
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
 
 1238   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1239   source.append(
"    if (get_local_id(0) < stride) \n");
 
 1240   source.append(
"      shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
 
 1241   source.append(
"  } \n");
 
 1243   source.append(
"  if (get_local_id(0) == 0) \n");
 
 1244   source.append(
"    group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
 
 1245   source.append(
"} \n");
 
 1253 template<
typename StringT>
 
 1256   source.append(
"__kernel void spgemm_decompose_1( \n");
 
 1257   source.append(
"  __global const unsigned int * A_row_indices, \n");
 
 1258   source.append(
"  unsigned int A_size1, \n");
 
 1259   source.append(
"  unsigned int max_per_row, \n");
 
 1260   source.append(
"  __global unsigned int * chunks_per_row) \n");
 
 1261   source.append(
"{ \n");
 
 1262   source.append(
"  for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
 
 1263   source.append(
"    unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
 
 1264   source.append(
"    chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
 
 1265   source.append(
"  } \n");
 
 1266   source.append(
"} \n");
 
 1274 template<
typename StringT>
 
 1277   source.append(
"__kernel void spgemm_A2( \n");
 
 1278   source.append(
"  __global unsigned int *A2_row_indices, \n");
 
 1279   source.append(
"  __global unsigned int *A2_col_indices, \n");
 
 1280   source.append(
"  __global "); source.append(numeric_string); source.append(
" *A2_elements, \n");
 
 1281   source.append(
"  unsigned int A2_size1, \n");
 
 1282   source.append(
"  __global const unsigned int *new_row_buffer) \n");
 
 1283   source.append(
"{ \n");
 
 1284   source.append(
"  for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
 
 1285   source.append(
"    unsigned int index_start = new_row_buffer[i]; \n");
 
 1286   source.append(
"    unsigned int index_stop  = new_row_buffer[i+1]; \n");
 
 1288   source.append(
"    A2_row_indices[i] = index_start; \n");
 
 1290   source.append(
"    for (unsigned int j = index_start; j < index_stop; ++j) { \n");
 
 1291   source.append(
"      A2_col_indices[j] = j; \n");
 
 1292   source.append(
"      A2_elements[j] = 1; \n");
 
 1293   source.append(
"    } \n");
 
 1294   source.append(
"  } \n");
 
 1296   source.append(
"  if (get_global_id(0) == 0) \n");
 
 1297   source.append(
"    A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
 
 1298   source.append(
"} \n");
 
 1305 template<
typename StringT>
 
 1308   source.append(
"__kernel void spgemm_G1( \n");
 
 1309   source.append(
"  __global unsigned int *G1_row_indices, \n");
 
 1310   source.append(
"  __global unsigned int *G1_col_indices, \n");
 
 1311   source.append(
"  __global "); source.append(numeric_string); source.append(
" *G1_elements, \n");
 
 1312   source.append(
"  unsigned int G1_size1, \n");
 
 1313   source.append(
"  __global const unsigned int *A_row_indices, \n");
 
 1314   source.append(
"  __global const unsigned int *A_col_indices, \n");
 
 1315   source.append(
"  __global const "); source.append(numeric_string); source.append(
" *A_elements, \n");
 
 1316   source.append(
"  unsigned int A_size1, \n");
 
 1317   source.append(
"  unsigned int A_nnz, \n");
 
 1318   source.append(
"  unsigned int max_per_row, \n");
 
 1319   source.append(
"  __global const unsigned int *new_row_buffer) \n");
 
 1320   source.append(
"{ \n");
 
 1323   source.append(
"  for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
 
 1324   source.append(
"    G1_col_indices[i] = A_col_indices[i]; \n");
 
 1325   source.append(
"    G1_elements[i]    = A_elements[i]; \n");
 
 1326   source.append(
"  } \n");
 
 1329   source.append(
"  for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
 
 1330   source.append(
"    unsigned int old_start = A_row_indices[i]; \n");
 
 1331   source.append(
"    unsigned int new_start = new_row_buffer[i]; \n");
 
 1332   source.append(
"    unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
 
 1334   source.append(
"    for (unsigned int j=0; j<row_chunks; ++j) \n");
 
 1335   source.append(
"      G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
 
 1336   source.append(
"  } \n");
 
 1339   source.append(
"  if (get_global_id(0) == 0) \n");
 
 1340   source.append(
"    G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
 
 1341   source.append(
"} \n");
 
 1351 template<
typename StringT>
 
 1354   source.append(
"__attribute__((reqd_work_group_size(32, 1, 1))) \n");
 
 1355   source.append(
"__kernel void spgemm_stage2( \n");
 
 1356   source.append(
"  __global const unsigned int * A_row_indices, \n");
 
 1357   source.append(
"  __global const unsigned int * A_col_indices, \n");
 
 1358   source.append(
"  unsigned int A_size1, \n");
 
 1359   source.append(
"  __global const unsigned int * B_row_indices, \n");
 
 1360   source.append(
"  __global const unsigned int * B_col_indices, \n");
 
 1361   source.append(
"  unsigned int B_size2, \n");
 
 1362   source.append(
"  __global unsigned int * C_row_indices) \n");
 
 1363   source.append(
"{ \n");
 
 1364   source.append(
"  unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
 
 1365   source.append(
"  unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
 
 1366   source.append(
"  unsigned int row_C_stop  = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
 
 1367   source.append(
"  __local unsigned int shared_front[32]; \n");
 
 1369   source.append(
"  for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
 
 1370   source.append(
"  { \n");
 
 1371   source.append(
"    unsigned int row_A_start = A_row_indices[row_C]; \n");
 
 1372   source.append(
"    unsigned int row_A_end   = A_row_indices[row_C+1]; \n");
 
 1374   source.append(
"    unsigned int my_row_B = row_A_start + get_local_id(0); \n");
 
 1375   source.append(
"    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
 
 1376   source.append(
"    unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
 
 1377   source.append(
"    unsigned int row_B_end   = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
 
 1379   source.append(
"    unsigned int num_nnz = 0; \n");
 
 1380   source.append(
"    if (row_A_end - row_A_start > 1) { \n"); 
 
 1382   source.append(
"      unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
 
 1383   source.append(
"      while (1) { \n");
 
 1386   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1387   source.append(
"        shared_front[get_local_id(0)] = current_front_index; \n");
 
 1388   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1389   source.append(
"        if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
 
 1390   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1391   source.append(
"        if (get_local_id(0) < 8)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
 
 1392   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1393   source.append(
"        if (get_local_id(0) < 4)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
 
 1394   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1395   source.append(
"        if (get_local_id(0) < 2)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
 
 1396   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1397   source.append(
"        if (get_local_id(0) < 1)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
 
 1398   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1400   source.append(
"        if (shared_front[0] == B_size2) break; \n");
 
 1403   source.append(
"        if (current_front_index == shared_front[0]) { \n");
 
 1404   source.append(
"          ++row_B_start; \n");
 
 1405   source.append(
"          current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
 
 1406   source.append(
"        } \n");
 
 1408   source.append(
"        ++num_nnz;  \n");
 
 1409   source.append(
"      }  \n");
 
 1410   source.append(
"    } else { num_nnz = row_B_end - row_B_start; }\n");
 
 1413   source.append(
"    if (get_local_id(0) == 0) \n");
 
 1414   source.append(
"      C_row_indices[row_C] = num_nnz; \n");
 
 1416   source.append(
"  } \n");
 
 1418   source.append(
"} \n");
 
 1427 template<
typename StringT>
 
 1430   source.append(
"__attribute__((reqd_work_group_size(32, 1, 1))) \n");
 
 1431   source.append(
"__kernel void spgemm_stage3( \n");
 
 1432   source.append(
"  __global const unsigned int * A_row_indices, \n");
 
 1433   source.append(
"  __global const unsigned int * A_col_indices, \n");
 
 1434   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * A_elements, \n");
 
 1435   source.append(
"  unsigned int A_size1, \n");
 
 1436   source.append(
"  __global const unsigned int * B_row_indices, \n");
 
 1437   source.append(
"  __global const unsigned int * B_col_indices, \n");
 
 1438   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * B_elements, \n");
 
 1439   source.append(
"  unsigned int B_size2, \n");
 
 1440   source.append(
"  __global unsigned int * C_row_indices, \n");
 
 1441   source.append(
"  __global unsigned int * C_col_indices, \n");
 
 1442   source.append(
"  __global "); source.append(numeric_string); source.append(
" * C_elements) \n");
 
 1443   source.append(
"{ \n");
 
 1444   source.append(
"  unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
 
 1445   source.append(
"  unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
 
 1446   source.append(
"  unsigned int row_C_stop  = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
 
 1447   source.append(
"  __local unsigned int shared_front[32]; \n");
 
 1448   source.append(
"  __local "); source.append(numeric_string); source.append(
" shared_front_values[32]; \n");
 
 1449   source.append(
"  unsigned int local_id = get_local_id(0); \n");
 
 1451   source.append(
"  for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
 
 1452   source.append(
"  { \n");
 
 1453   source.append(
"    unsigned int row_A_start = A_row_indices[row_C]; \n");
 
 1454   source.append(
"    unsigned int row_A_end   = A_row_indices[row_C+1]; \n");
 
 1456   source.append(
"    unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n"); 
 
 1457   source.append(
"    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B]        : 0; \n");
 
 1458   source.append(
"    unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index]     : 0; \n");
 
 1459   source.append(
"    unsigned int row_B_end   = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
 
 1461   source.append(
"    "); source.append(numeric_string); source.append(
" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
 
 1462   source.append(
"    unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
 
 1464   source.append(
"    if (row_A_end - row_A_start > 1) { \n"); 
 
 1466   source.append(
"      unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
 
 1467   source.append(
"      "); source.append(numeric_string); source.append(
" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start]    : 0; \n");
 
 1469   source.append(
"      unsigned int index_buffer = 0; \n");
 
 1470   source.append(
"      "); source.append(numeric_string); source.append(
" value_buffer = 0; \n");
 
 1471   source.append(
"      unsigned int buffer_size = 0; \n");
 
 1473   source.append(
"      while (1) { \n");
 
 1476   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1477   source.append(
"        shared_front[local_id] = current_front_index; \n");
 
 1478   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1479   source.append(
"        if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
 
 1480   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1481   source.append(
"        if (local_id < 8)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
 
 1482   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1483   source.append(
"        if (local_id < 4)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
 
 1484   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1485   source.append(
"        if (local_id < 2)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
 
 1486   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1487   source.append(
"        if (local_id < 1)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
 
 1488   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1490   source.append(
"        if (shared_front[0] == B_size2) break; \n");
 
 1493   source.append(
"        shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
 
 1494   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1495   source.append(
"        if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
 
 1496   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1497   source.append(
"        if (local_id < 8)  shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
 
 1498   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1499   source.append(
"        if (local_id < 4)  shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
 
 1500   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1501   source.append(
"        if (local_id < 2)  shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
 
 1502   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1503   source.append(
"        if (local_id < 1)  shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
 
 1504   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
 1507   source.append(
"        if (current_front_index == shared_front[0]) { \n");
 
 1508   source.append(
"          ++row_B_start; \n");
 
 1509   source.append(
"          current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
 
 1510   source.append(
"          current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start]    : 0; \n");
 
 1511   source.append(
"        } \n");
 
 1514   source.append(
"        index_buffer = (local_id == buffer_size) ? shared_front[0]        : index_buffer;  \n");
 
 1515   source.append(
"        value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer;  \n");
 
 1516   source.append(
"        ++buffer_size;  \n");
 
 1519   source.append(
"        if (buffer_size == get_local_size(0)) {  \n");
 
 1520   source.append(
"          C_col_indices[index_in_C] = index_buffer; \n");
 
 1521   source.append(
"          C_elements[index_in_C]    = value_buffer; \n");
 
 1522   source.append(
"        } \n");
 
 1525   source.append(
"          index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
 
 1526   source.append(
"          buffer_size = (buffer_size == get_local_size(0)) ?                0  : buffer_size; \n");
 
 1528   source.append(
"      }  \n");
 
 1531   source.append(
"      if (local_id < buffer_size) {  \n");
 
 1532   source.append(
"        C_col_indices[index_in_C] = index_buffer; \n");
 
 1533   source.append(
"        C_elements[index_in_C]    = value_buffer; \n");
 
 1534   source.append(
"      } \n");
 
 1537   source.append(
"    } else { \n");
 
 1538   source.append(
"      for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
 
 1539   source.append(
"        C_col_indices[index_in_C] = B_col_indices[i]; \n");
 
 1540   source.append(
"        C_elements[index_in_C]    = val_A * B_elements[i]; \n");
 
 1541   source.append(
"        index_in_C += get_local_size(0); \n");
 
 1542   source.append(
"      } \n");
 
 1543   source.append(
"    } \n");
 
 1545   source.append(
"  } \n");
 
 1547   source.append(
"} \n");
 
 1552 template<
typename StringT>
 
 1563 template<
typename StringT>
 
 1567  source.append(
" __kernel void assign_to_dense( \n");
 
 1568  source.append(
"  __global const unsigned int * row_indices, \n");
 
 1569  source.append(
"  __global const unsigned int * column_indices, \n");
 
 1570  source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
 1571  source.append(
"  __global "); source.append(numeric_string); source.append(
" * B, \n");
 
 1572  source.append(
"  unsigned int B_row_start, \n");
 
 1573  source.append(
"  unsigned int B_col_start, \n");
 
 1574  source.append(
"  unsigned int B_row_inc, \n");
 
 1575  source.append(
"  unsigned int B_col_inc, \n");
 
 1576  source.append(
"  unsigned int B_row_size, \n");
 
 1577  source.append(
"  unsigned int B_col_size, \n");
 
 1578  source.append(
"  unsigned int B_internal_rows, \n");
 
 1579  source.append(
"  unsigned int B_internal_cols) { \n");
 
 1581  source.append(
"   for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
 
 1582  source.append(
"   { \n");
 
 1583  source.append(
"     unsigned int row_end = row_indices[i+1]; \n");
 
 1584  source.append(
"     for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
 
 1585  source.append(
"     { \n");
 
 1586  source.append(
"       B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
 
 1587  source.append(
"     } \n");
 
 1588  source.append(
"   } \n");
 
 1589  source.append(
"  } \n");
 
 1598 template<
typename NumericT>
 
 1608     static std::map<cl_context, bool> init_done;
 
 1615       source.reserve(1024);
 
 1617       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
 1619       if (numeric_string == 
"float" || numeric_string == 
"double")
 
 1641       #ifdef VIENNACL_BUILD_INFO 
 1642       std::cout << 
"Creating program " << prog_name << std::endl;
 
 1644       ctx.add_program(source, prog_name);
 
 1645       init_done[ctx.handle().get()] = 
true;
 
 1652 template<
typename NumericT>
 
 1662     static std::map<cl_context, bool> init_done;
 
 1669       source.reserve(1024);
 
 1671       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
 1673       if (numeric_string == 
"float" || numeric_string == 
"double")
 
 1689       #ifdef VIENNACL_BUILD_INFO 
 1690       std::cout << 
"Creating program " << prog_name << std::endl;
 
 1692       ctx.add_program(source, prog_name);
 
 1693       init_done[ctx.handle().get()] = 
true;
 
void generate_compressed_matrix_row_info_extractor(StringT &source, std::string const &numeric_string)
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_compressed_matrix_trans_lu_forward(StringT &source, std::string const &numeric_string)
Provides OpenCL-related utilities. 
void generate_compressed_matrix_block_trans_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_dense_matrix_mult(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Generate kernel for C = A * B with A being a compressed_matrix, B and C dense. 
void generate_compressed_matrix_vec_mul_cpu(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_jacobi(StringT &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
Common implementations shared by OpenCL-based operations. 
void generate_compressed_matrix_assign_to_dense(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_compressed_matrix_block_trans_unit_lu_forward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_vec_mul_nvidia(StringT &source, std::string const &numeric_string, unsigned int subwarp_size, bool with_alpha_beta)
void generate_compressed_matrix_compressed_matrix_prod_G1(StringT &source, std::string const &numeric_string)
OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonz...
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const 
void generate_compressed_matrix_compressed_matrix_prod_3(StringT &source, std::string const &numeric_string)
OpenCL kernel for the third stage of sparse matrix-matrix multiplication. 
void generate_compressed_matrix_trans_unit_lu_forward(StringT &source, std::string const &numeric_string)
Main kernel class for generating OpenCL kernels for compressed_matrix (except solvers). 
void generate_compressed_matrix_lu_forward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_compressed_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
Main kernel class for triangular solver OpenCL kernels for compressed_matrix. 
void generate_compressed_matrix_compressed_matrix_prod_2(StringT &source)
OpenCL kernel for the second stage of sparse matrix-matrix multiplication. 
void generate_compressed_matrix_trans_unit_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_1(StringT &source)
OpenCL kernel for the first stage of sparse matrix-matrix multiplication. 
void generate_compressed_matrix_vec_mul8(StringT &source, std::string const &numeric_string, bool with_alpha_beta)
void generate_compressed_matrix_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_A2(StringT &source, std::string const &numeric_string)
OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonz...
void generate_compressed_matrix_vec_mul4(StringT &source, std::string const &numeric_string, bool with_alpha_beta)
static std::string program_name()
Representation of an OpenCL kernel in ViennaCL. 
void generate_compressed_matrix_vec_mul(StringT &source, std::string const &numeric_string, bool with_alpha_beta)
static void init(viennacl::ocl::context &ctx)
void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation. 
void generate_compressed_matrix_trans_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT &source)
OpenCL kernel for decomposing A in C = A * B, such that A = A_2 * G_1 with G_1 containing at most 32 ...
void generate_compressed_matrix_unit_lu_forward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_unit_lu_backward(StringT &source, std::string const &numeric_string)