1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP 
   39 template<
typename StringT>
 
   42   source.append(
"float get_element(__global const unsigned int * row_indices, \n");
 
   43   source.append(
"           __global const unsigned int * column_indices, \n");
 
   44   source.append(
"           __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
   45   source.append(
"           unsigned int row, \n");
 
   46   source.append(
"           unsigned int col) \n");
 
   47   source.append(
"{ \n");
 
   48   source.append(
"  unsigned int row_end = row_indices[row+1]; \n");
 
   49   source.append(
"  for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
 
   50   source.append(
"    if (column_indices[i] == col) \n");
 
   51   source.append(
"      return elements[i]; \n");
 
   52   source.append(
"    if (column_indices[i] > col) \n");
 
   53   source.append(
"      return 0; \n");
 
   54   source.append(
"  } \n");
 
   55   source.append(
"  return 0; \n");
 
   56   source.append(
"} \n");
 
   58   source.append(
"void block_assembly(__global const unsigned int * row_indices, \n");
 
   59   source.append(
"          __global const unsigned int * column_indices, \n");
 
   60   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
   61   source.append(
"          __global const unsigned int * matrix_dimensions, \n");
 
   62   source.append(
"          __global const unsigned int * set_I, \n");
 
   63   source.append(
"          __global const unsigned int * set_J, \n");
 
   64   source.append(
"          unsigned int matrix_ind, \n");
 
   65   source.append(
"          __global "); source.append(numeric_string); source.append(
" * com_A_I_J) \n");
 
   66   source.append(
"{ \n");
 
   67   source.append(
"  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
 
   68   source.append(
"  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
 
   70   source.append(
"  for (unsigned int i = 0; i < col_n; ++i){ \n");
 
   72   source.append(
"        for (unsigned int j = 0; j < row_n; j++){ \n");
 
   73   source.append(
"          com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
 
   74   source.append(
"        } \n");
 
   75   source.append(
"      } \n");
 
   76   source.append(
"} \n");
 
   78   source.append(
"__kernel void assemble_blocks( \n");
 
   79   source.append(
"          __global const unsigned int * row_indices, \n");
 
   80   source.append(
"          __global const unsigned int * column_indices, \n");
 
   81   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
   82   source.append(
"          __global const unsigned int * set_I, \n");
 
   83   source.append(
"        __global const unsigned int * set_J, \n");
 
   84   source.append(
"      __global const unsigned int * i_ind, \n");
 
   85   source.append(
"      __global const unsigned int * j_ind, \n");
 
   86   source.append(
"        __global const unsigned int * block_ind, \n");
 
   87   source.append(
"        __global const unsigned int * matrix_dimensions, \n");
 
   88   source.append(
"      __global "); source.append(numeric_string); source.append(
" * com_A_I_J, \n");
 
   89   source.append(
"      __global unsigned int * g_is_update, \n");
 
   90   source.append(
"                   unsigned int  block_elems_num) \n");
 
   91   source.append(
"{ \n");
 
   92   source.append(
"    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
 
   93   source.append(
"        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
   94   source.append(
"            block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
 
   95   source.append(
"        } \n");
 
   96   source.append(
"    } \n");
 
   97   source.append(
"  } \n");
 
  100 template<
typename StringT>
 
  103   source.append(
"  void assemble_bv(__global "); source.append(numeric_string); source.append(
" * g_bv_r, __global "); source.append(numeric_string); source.append(
" * g_bv, unsigned int col_n){ \n");
 
  104   source.append(
"    for (unsigned int i = 0; i < col_n; ++i){ \n");
 
  105   source.append(
"      g_bv_r[i] = g_bv[ i]; \n");
 
  106   source.append(
"    } \n");
 
  107   source.append(
"  } \n");
 
  109   source.append(
"  void assemble_bv_block(__global "); source.append(numeric_string); source.append(
" * g_bv_r, __global "); source.append(numeric_string); source.append(
" * g_bv, unsigned int col_n, \n");
 
  110   source.append(
"               __global "); source.append(numeric_string); source.append(
" * g_bv_u, unsigned int col_n_u) \n");
 
  111   source.append(
"  { \n");
 
  112   source.append(
"    assemble_bv(g_bv_r, g_bv, col_n); \n");
 
  113   source.append(
"    assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
 
  114   source.append(
"  } \n");
 
  116   source.append(
"  __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(
" * g_bv, \n");
 
  117   source.append(
"              __global unsigned int * start_bv_ind, \n");
 
  118   source.append(
"              __global unsigned int * matrix_dimensions, \n");
 
  119   source.append(
"              __global "); source.append(numeric_string); source.append(
" * g_bv_u, \n");
 
  120   source.append(
"              __global unsigned int * start_bv_u_ind, \n");
 
  121   source.append(
"              __global unsigned int * matrix_dimensions_u, \n");
 
  122   source.append(
"              __global "); source.append(numeric_string); source.append(
" * g_bv_r, \n");
 
  123   source.append(
"              __global unsigned int * start_bv_r_ind, \n");
 
  124   source.append(
"              __global unsigned int * matrix_dimensions_r, \n");
 
  125   source.append(
"              __global unsigned int * g_is_update, \n");
 
  126   source.append(
"              //__local  "); source.append(numeric_string); source.append(
" * local_gb, \n");
 
  127   source.append(
"              unsigned int  block_elems_num) \n");
 
  128   source.append(
"  { \n");
 
  129   source.append(
"    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
 
  130   source.append(
"      if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
  131   source.append(
"        assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
 
  132   source.append(
"      } \n");
 
  133   source.append(
"    } \n");
 
  134   source.append(
"  } \n");
 
  137 template<
typename StringT>
 
  140   source.append(
"void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(
" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(
" * v, unsigned int ind, "); source.append(numeric_string); source.append(
" *res){ \n");
 
  141   source.append(
"  *res = 0.0; \n");
 
  142   source.append(
"  for (unsigned int j = ind; j < row_n; ++j){ \n");
 
  143   source.append(
"    if (j == ind){ \n");
 
  144   source.append(
"      *res += v[ j]; \n");
 
  145   source.append(
"    }else{ \n");
 
  146   source.append(
"      *res += A[ j + ind*row_n]*v[ j]; \n");
 
  147   source.append(
"    } \n");
 
  148   source.append(
"  } \n");
 
  149   source.append(
"} \n");
 
  151   source.append(
"void backwardSolve(__global "); source.append(numeric_string); source.append(
" * R,  unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * y, __global "); source.append(numeric_string); source.append(
" * x){ \n");
 
  152   source.append(
"  for (int i = col_n-1; i >= 0; i--) { \n");
 
  153   source.append(
"    x[ i] = y[ i]; \n");
 
  154   source.append(
"    for (int j = i+1; j < col_n; ++j) { \n");
 
  155   source.append(
"      x[ i] -= R[ i + j*row_n]*x[ j]; \n");
 
  156   source.append(
"    } \n");
 
  157   source.append(
"    x[i] /= R[ i + i*row_n]; \n");
 
  158   source.append(
"  } \n");
 
  159   source.append(
"} \n");
 
  162   source.append(
"void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(
" * b_v,  __global "); source.append(numeric_string); source.append(
" * y){ \n");
 
  163   source.append(
"            "); source.append(numeric_string); source.append(
" inn_prod = 0; \n");
 
  164   source.append(
"            for (unsigned int i = 0; i < col_n; ++i){ \n");
 
  165   source.append(
"                custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
 
  166   source.append(
"                for (unsigned int j = i; j < row_n; ++j){ \n");
 
  167   source.append(
"                    if (i == j){ \n");
 
  168   source.append(
"                        y[ j] -= b_v[ i]*inn_prod; \n");
 
  169   source.append(
"                    } \n");
 
  170   source.append(
"                    else{ \n");
 
  171   source.append(
"                        y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
 
  172   source.append(
"                    } \n");
 
  173   source.append(
"                } \n");
 
  174   source.append(
"            } \n");
 
  175   source.append(
"        } \n");
 
  177   source.append(
"void ls(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * b_v, __global "); source.append(numeric_string); source.append(
" * m_v, __global "); source.append(numeric_string); source.append(
" * y_v){ \n");
 
  178   source.append(
"  apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
 
  179   source.append(
"  //m_new - is m_v now \n");
 
  180   source.append(
"  backwardSolve(R, row_n, col_n, y_v, m_v); \n");
 
  181   source.append(
"} \n");
 
  183   source.append(
"__kernel void block_least_squares( \n");
 
  184   source.append(
"      __global "); source.append(numeric_string); source.append(
" * global_R, \n");
 
  185   source.append(
"      __global unsigned int * block_ind, \n");
 
  186   source.append(
"      __global "); source.append(numeric_string); source.append(
" * b_v, \n");
 
  187   source.append(
"      __global unsigned int * start_bv_inds, \n");
 
  188   source.append(
"      __global "); source.append(numeric_string); source.append(
" * m_v, \n");
 
  189   source.append(
"      __global "); source.append(numeric_string); source.append(
" * y_v, \n");
 
  190   source.append(
"      __global unsigned int * start_y_inds, \n");
 
  191   source.append(
"      __global unsigned int * matrix_dimensions, \n");
 
  192   source.append(
"      __global unsigned int * g_is_update, \n");
 
  193   source.append(
"      unsigned int  block_elems_num) \n");
 
  194   source.append(
"{ \n");
 
  195   source.append(
"    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
 
  196   source.append(
"        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
  197   source.append(
"            ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
 
  198   source.append(
"        } \n");
 
  199   source.append(
"    } \n");
 
  200   source.append(
"} \n");
 
  203 template<
typename StringT>
 
  206   source.append(
"void custom_dot_prod(__global "); source.append(numeric_string); source.append(
" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(
" * v, unsigned int ind, "); source.append(numeric_string); source.append(
" *res){ \n");
 
  207   source.append(
"  *res = 0.0; \n");
 
  208   source.append(
"  for (unsigned int j = ind; j < row_n; ++j){ \n");
 
  209   source.append(
"    if (j == ind){ \n");
 
  210   source.append(
"      *res += v[j]; \n");
 
  211   source.append(
"    }else{ \n");
 
  212   source.append(
"      *res += A[j + ind*row_n]*v[j]; \n");
 
  213   source.append(
"    } \n");
 
  214   source.append(
"  } \n");
 
  215   source.append(
"} \n");
 
  217   source.append(
"void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * b_v, __local "); source.append(numeric_string); source.append(
" * y){ \n");
 
  218   source.append(
"  "); source.append(numeric_string); source.append(
" inn_prod = 0; \n");
 
  219   source.append(
"  for (unsigned int i = 0; i < col_n; ++i){ \n");
 
  220   source.append(
"    custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
 
  221   source.append(
"    for (unsigned int j = i; j < row_n; ++j){ \n");
 
  222   source.append(
"      if (i == j){ \n");
 
  223   source.append(
"        y[j] -= b_v[ i]*inn_prod; \n");
 
  224   source.append(
"      } \n");
 
  225   source.append(
"      else{ \n");
 
  226   source.append(
"        y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
 
  227   source.append(
"      } \n");
 
  228   source.append(
"    } \n");
 
  229   source.append(
"  } \n");
 
  230   source.append(
"} \n");
 
  232   source.append(
"void q_mult(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * b_v, __local "); source.append(numeric_string); source.append(
" * R_u, unsigned int col_n_u){ \n");
 
  233   source.append(
"        for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
 
  234   source.append(
"          apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
 
  235   source.append(
"        } \n");
 
  236   source.append(
"} \n");
 
  238   source.append(
"void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
 
  239   source.append(
"  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
 
  240   source.append(
"    for (unsigned int j = 0; j < row_n; ++j){ \n");
 
  241   source.append(
"      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
 
  242   source.append(
"    } \n");
 
  243   source.append(
"  } \n");
 
  244   source.append(
"} \n");
 
  246   source.append(
"void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
 
  247   source.append(
"  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
 
  248   source.append(
"    for (unsigned int j = 0; j < row_n; ++j){ \n");
 
  249   source.append(
"      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
 
  250   source.append(
"    } \n");
 
  251   source.append(
"  } \n");
 
  252   source.append(
"} \n");
 
  254   source.append(
"__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(
" * global_R, \n");
 
  255   source.append(
"  __global unsigned int * block_ind, \n");
 
  256   source.append(
"  __global "); source.append(numeric_string); source.append(
" * global_R_u, \n");
 
  257   source.append(
"  __global unsigned int *block_ind_u, \n");
 
  258   source.append(
"  __global "); source.append(numeric_string); source.append(
" * b_v, \n");
 
  259   source.append(
"  __global unsigned int * start_bv_inds, \n");
 
  260   source.append(
"  __global unsigned int * matrix_dimensions, \n");
 
  261   source.append(
"  __global unsigned int * matrix_dimensions_u, \n");
 
  262   source.append(
"  __global unsigned int * g_is_update, \n");
 
  263   source.append(
"  __local  "); source.append(numeric_string); source.append(
" * local_R_u, \n");
 
  264   source.append(
"    unsigned int  block_elems_num){ \n");
 
  265   source.append(
"    for (unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
 
  266   source.append(
"          if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
 
  268   source.append(
"        matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
 
  269   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  270   source.append(
"              q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
 
  271   source.append(
"             matrix_dimensions_u[2*i + 1]); \n");
 
  272   source.append(
"        barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  273   source.append(
"              matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
 
  274   source.append(
"          } \n");
 
  275   source.append(
"      } \n");
 
  276   source.append(
"} \n");
 
  279 template<
typename StringT>
 
  282   source.append(
"void dot_prod(__local const "); source.append(numeric_string); source.append(
"* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append(
"* res){ \n");
 
  283   source.append(
"    *res = 0; \n");
 
  284   source.append(
"    for (unsigned int i = beg_ind; i < n; ++i){ \n");
 
  285   source.append(
"        *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
 
  286   source.append(
"    } \n");
 
  287   source.append(
"} \n");
 
  289   source.append(
"void vector_div(__global "); source.append(numeric_string); source.append(
"* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(
" b, unsigned int n){ \n");
 
  290   source.append(
"    for (unsigned int i = beg_ind; i < n; ++i){ \n");
 
  291   source.append(
"        v[i] /= b; \n");
 
  292   source.append(
"    } \n");
 
  293   source.append(
"} \n");
 
  295   source.append(
"void copy_vector(__local const "); source.append(numeric_string); source.append(
"* A, __global "); source.append(numeric_string); source.append(
"* v, const unsigned int beg_ind, const unsigned int n){ \n");
 
  296   source.append(
"    for (unsigned int i = beg_ind; i < n; ++i){ \n");
 
  297   source.append(
"        v[i] = A[(beg_ind-1)*n + i]; \n");
 
  298   source.append(
"    } \n");
 
  299   source.append(
"} \n");
 
  302   source.append(
"void householder_vector(__local const "); source.append(numeric_string); source.append(
"* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append(
"* v, __global "); source.append(numeric_string); source.append(
"* b){ \n");
 
  303   source.append(
"    "); source.append(numeric_string); source.append(
" sg; \n");
 
  304   source.append(
"    dot_prod(A, n, j+1, &sg); \n");
 
  305   source.append(
"    copy_vector(A, v, j+1, n); \n");
 
  306   source.append(
"    "); source.append(numeric_string); source.append(
" mu; \n");
 
  307   source.append(
"    v[j] = 1.0; \n");
 
  309   source.append(
"    if (sg == 0){ \n");
 
  310   source.append(
"        *b = 0; \n");
 
  311   source.append(
"    } \n");
 
  312   source.append(
"    else{ \n");
 
  313   source.append(
"        mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
 
  314   source.append(
"        if (A[ j*n + j] <= 0){ \n");
 
  315   source.append(
"            v[j] = A[ j*n + j] - mu; \n");
 
  316   source.append(
"        }else{ \n");
 
  317   source.append(
"            v[j] = -sg/(A[ j*n + j] + mu); \n");
 
  318   source.append(
"        } \n");
 
  319   source.append(
"    *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
 
  321   source.append(
"        vector_div(v, j, v[j], n); \n");
 
  323   source.append(
"    } \n");
 
  324   source.append(
"} \n");
 
  326   source.append(
"void custom_inner_prod(__local const "); source.append(numeric_string); source.append(
"* A, __global "); source.append(numeric_string); source.append(
"* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append(
"* res){ \n");
 
  327   source.append(
"    for (unsigned int i = start_ind; i < row_num; ++i){ \n");
 
  328   source.append(
"        *res += A[col_ind*row_num + i]*v[i]; \n");
 
  329   source.append(
"    } \n");
 
  330   source.append(
"} \n");
 
  332   source.append(
"void apply_householder_reflection(__local "); source.append(numeric_string); source.append(
"* A,  unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append(
"* v, "); source.append(numeric_string); source.append(
" b){ \n");
 
  333   source.append(
"    "); source.append(numeric_string); source.append(
" in_prod_res; \n");
 
  334   source.append(
"    for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
 
  335   source.append(
"        in_prod_res = 0.0; \n");
 
  336   source.append(
"        custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
 
  337   source.append(
"        for (unsigned int j = iter_cnt; j < row_n; ++j){ \n");
 
  338   source.append(
"            A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
 
  339   source.append(
"        } \n");
 
  340   source.append(
"    } \n");
 
  341   source.append(
"} \n");
 
  343   source.append(
"void store_householder_vector(__local "); source.append(numeric_string); source.append(
"* A,  unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append(
"* v){ \n");
 
  344   source.append(
"    for (unsigned int i = ind; i < n; ++i){ \n");
 
  345   source.append(
"        A[ (ind-1)*n + i] = v[i]; \n");
 
  346   source.append(
"    } \n");
 
  347   source.append(
"} \n");
 
  349   source.append(
"void single_qr( __local "); source.append(numeric_string); source.append(
"* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append(
"* b_v, __global "); source.append(numeric_string); source.append(
"* v, unsigned int matrix_ind){ \n");
 
  352   source.append(
"  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
 
  353   source.append(
"  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
 
  355   source.append(
"  if ((col_n == row_n)&&(row_n == 1)){ \n");
 
  356   source.append(
"    b_v[0] = 0.0; \n");
 
  357   source.append(
"      return; \n");
 
  358   source.append(
"  } \n");
 
  359   source.append(
"  for (unsigned int i = 0; i < col_n; ++i){ \n");
 
  360   source.append(
"    if (get_local_id(0) == 0){ \n");
 
  361   source.append(
"      householder_vector(R, i, row_n, v, b_v + i); \n");
 
  362   source.append(
"    } \n");
 
  363   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  364   source.append(
"    apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
 
  365   source.append(
"    barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  366   source.append(
"    if (get_local_id(0) == 0){ \n");
 
  367   source.append(
"      if (i < matrix_dimensions[2*matrix_ind]){ \n");
 
  368   source.append(
"        store_householder_vector(R, i+1, row_n, v); \n");
 
  369   source.append(
"      } \n");
 
  370   source.append(
"    } \n");
 
  371   source.append(
"  } \n");
 
  372   source.append(
"} \n");
 
  374   source.append(
"void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
 
  375   source.append(
"  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
 
  376   source.append(
"    for (unsigned int j = 0; j < row_n; ++j){ \n");
 
  377   source.append(
"      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
 
  378   source.append(
"    } \n");
 
  379   source.append(
"  } \n");
 
  380   source.append(
"} \n");
 
  381   source.append(
"void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
 
  382   source.append(
"  for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
 
  383   source.append(
"    for (unsigned int j = 0; j < row_n; ++j){ \n");
 
  384   source.append(
"      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
 
  385   source.append(
"    } \n");
 
  386   source.append(
"  } \n");
 
  387   source.append(
"} \n");
 
  390   source.append(
"__kernel void block_qr( \n");
 
  391   source.append(
"      __global "); source.append(numeric_string); source.append(
"* R, \n");
 
  392   source.append(
"      __global unsigned int* matrix_dimensions, \n");
 
  393   source.append(
"      __global "); source.append(numeric_string); source.append(
"* b_v, \n");
 
  394   source.append(
"      __global "); source.append(numeric_string); source.append(
"* v, \n");
 
  395   source.append(
"      __global unsigned int* start_matrix_inds, \n");
 
  396   source.append(
"      __global unsigned int* start_bv_inds, \n");
 
  397   source.append(
"      __global unsigned int* start_v_inds, \n");
 
  398   source.append(
"      __global unsigned int * g_is_update, \n");
 
  399   source.append(
"      __local "); source.append(numeric_string); source.append(
"* local_buff_R, \n");
 
  400   source.append(
"      unsigned int block_elems_num){ \n");
 
  401   source.append(
"    for (unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
 
  402   source.append(
"        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
  403   source.append(
"      matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
 
  404   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  405   source.append(
"            single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
 
  406   source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  407   source.append(
"            matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
 
  408   source.append(
"        } \n");
 
  409   source.append(
"    } \n");
 
  410   source.append(
"} \n");
 
  413 template<
typename StringT>
 
  416   source.append(
"void assemble_upper_part(__global "); source.append(numeric_string); source.append(
" * R_q, \n");
 
  417   source.append(
"            unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, \n");
 
  418   source.append(
"            unsigned int row_n_u, unsigned int col_n_u, \n");
 
  419   source.append(
"            unsigned int col_n, unsigned int diff){ \n");
 
  420   source.append(
"            for (unsigned int i = 0; i < col_n_q; ++i){ \n");
 
  421   source.append(
"                for (unsigned int j = 0; j < diff; ++j){ \n");
 
  422   source.append(
"          R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n");
 
  423   source.append(
"                } \n");
 
  424   source.append(
"            } \n");
 
  425   source.append(
"        } \n");
 
  427   source.append(
"void assemble_lower_part(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
 
  428   source.append(
"             unsigned int row_n_u_u, unsigned int col_n_u_u, \n");
 
  429   source.append(
"             unsigned int diff){ \n");
 
  430   source.append(
"  for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
 
  431   source.append(
"    for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
 
  432   source.append(
"      R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n");
 
  433   source.append(
"    } \n");
 
  434   source.append(
"  } \n");
 
  435   source.append(
"} \n");
 
  437   source.append(
"void assemble_qr_block(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, \n");
 
  438   source.append(
"            unsigned int col_n_u, __global "); source.append(numeric_string); source.append(
" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n");
 
  439   source.append(
"            unsigned int diff = row_n_u - col_n; \n");
 
  440   source.append(
"            assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
 
  441   source.append(
"            if (diff > 0){ \n");
 
  442   source.append(
"              assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n");
 
  443   source.append(
"            } \n");
 
  444   source.append(
"} \n");
 
  446   source.append(
"__kernel void block_qr_assembly( \n");
 
  447   source.append(
"      __global unsigned int * matrix_dimensions, \n");
 
  448   source.append(
"      __global "); source.append(numeric_string); source.append(
" * R_u, \n");
 
  449   source.append(
"      __global unsigned int * block_ind_u, \n");
 
  450   source.append(
"      __global unsigned int * matrix_dimensions_u, \n");
 
  451   source.append(
"      __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
 
  452   source.append(
"      __global unsigned int * block_ind_u_u, \n");
 
  453   source.append(
"      __global unsigned int * matrix_dimensions_u_u, \n");
 
  454   source.append(
"      __global "); source.append(numeric_string); source.append(
" * R_q, \n");
 
  455   source.append(
"      __global unsigned int * block_ind_q, \n");
 
  456   source.append(
"      __global unsigned int * matrix_dimensions_q, \n");
 
  457   source.append(
"      __global unsigned int * g_is_update, \n");
 
  458   source.append(
"          //__local  "); source.append(numeric_string); source.append(
" * local_R_q, \n");
 
  459   source.append(
"      unsigned int  block_elems_num) \n");
 
  460   source.append(
"{ \n");
 
  461   source.append(
"    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
 
  462   source.append(
"        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
  463   source.append(
"           assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
 
  464   source.append(
"             matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
 
  465   source.append(
"       } \n");
 
  466   source.append(
"   } \n");
 
  467   source.append(
"} \n");
 
  470 template<
typename StringT>
 
  473   source.append(
"void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, \n");
 
  474   source.append(
"             unsigned int row_n_u, unsigned int col_n_u, \n");
 
  475   source.append(
"             unsigned int col_n, unsigned int diff){ \n");
 
  476   source.append(
"            for (unsigned int i = 0; i < col_n_q; ++i){ \n");
 
  477   source.append(
"                for (unsigned int j = 0; j < diff; ++j){ \n");
 
  478   source.append(
"          R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n");
 
  479   source.append(
"                } \n");
 
  480   source.append(
"            } \n");
 
  481   source.append(
"        } \n");
 
  484   source.append(
"void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(
" * R_q,  unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, \n");
 
  485   source.append(
"            unsigned int col_n_u, unsigned int col_n){ \n");
 
  486   source.append(
"            unsigned int diff = row_n_u - col_n; \n");
 
  487   source.append(
"            assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
 
  488   source.append(
"} \n");
 
  490   source.append(
"__kernel void block_qr_assembly_1( \n");
 
  491   source.append(
"  __global unsigned int * matrix_dimensions, \n");
 
  492   source.append(
"  __global "); source.append(numeric_string); source.append(
" * R_u, \n");
 
  493   source.append(
"  __global unsigned int * block_ind_u, \n");
 
  494   source.append(
"  __global unsigned int * matrix_dimensions_u, \n");
 
  495   source.append(
"  __global "); source.append(numeric_string); source.append(
" * R_q, \n");
 
  496   source.append(
"  __global unsigned int * block_ind_q, \n");
 
  497   source.append(
"  __global unsigned int * matrix_dimensions_q, \n");
 
  498   source.append(
"  __global unsigned int * g_is_update, \n");
 
  499   source.append(
"  unsigned int  block_elems_num) \n");
 
  500   source.append(
"{ \n");
 
  501   source.append(
"    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
 
  502   source.append(
"        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
  503   source.append(
"            assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
 
  504   source.append(
"              matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
 
  505   source.append(
"        } \n");
 
  506   source.append(
"    } \n");
 
  507   source.append(
"} \n");
 
  510 template<
typename StringT>
 
  513   source.append(
"void assemble_r(__global "); source.append(numeric_string); source.append(
" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R, \n");
 
  514   source.append(
"        unsigned int row_n, unsigned int col_n) \n");
 
  515   source.append(
"{ \n");
 
  516   source.append(
"  for (unsigned int i = 0; i < col_n; ++i){ \n");
 
  517   source.append(
"     for (unsigned int j = 0; j < row_n; ++j){ \n");
 
  518   source.append(
"    gR[i*row_n_r + j] = R[i*row_n + j ]; \n");
 
  519   source.append(
"     } \n");
 
  520   source.append(
"  } \n");
 
  521   source.append(
"} \n");
 
  523   source.append(
"void assemble_r_u(__global "); source.append(numeric_string); source.append(
" * gR, \n");
 
  524   source.append(
"          unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n");
 
  525   source.append(
"          unsigned int col_n) \n");
 
  526   source.append(
"{ \n");
 
  527   source.append(
"  for (unsigned int i = 0; i < col_n_u; ++i){ \n");
 
  528   source.append(
"    for (unsigned int j = 0; j < col_n; ++j){ \n");
 
  529   source.append(
"      gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n");
 
  530   source.append(
"    } \n");
 
  531   source.append(
"  } \n");
 
  532   source.append(
"} \n");
 
  535   source.append(
"void assemble_r_u_u(__global "); source.append(numeric_string); source.append(
" * gR,  unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R_u_u, unsigned int row_n_u_u, \n");
 
  536   source.append(
"          unsigned int col_n_u_u, unsigned int col_n) \n");
 
  537   source.append(
"{ \n");
 
  538   source.append(
"  for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
 
  539   source.append(
"    for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
 
  540   source.append(
"      gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n");
 
  541   source.append(
"    } \n");
 
  542   source.append(
"  } \n");
 
  543   source.append(
"} \n");
 
  545   source.append(
"void assemble_r_block(__global "); source.append(numeric_string); source.append(
" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, \n");
 
  546   source.append(
"        unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
 
  547   source.append(
"        unsigned int row_n_u_u, unsigned int col_n_u_u){ \n");
 
  548   source.append(
"        assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n");
 
  549   source.append(
"        assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n");
 
  550   source.append(
"        assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n");
 
  551   source.append(
"} \n");
 
  554   source.append(
"__kernel void block_r_assembly( \n");
 
  555   source.append(
"  __global "); source.append(numeric_string); source.append(
" * R, \n");
 
  556   source.append(
"  __global unsigned int * block_ind, \n");
 
  557   source.append(
"  __global unsigned int * matrix_dimensions, \n");
 
  558   source.append(
"  __global "); source.append(numeric_string); source.append(
" * R_u, \n");
 
  559   source.append(
"  __global unsigned int * block_ind_u, \n");
 
  560   source.append(
"  __global unsigned int * matrix_dimensions_u, \n");
 
  561   source.append(
"  __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
 
  562   source.append(
"  __global unsigned int * block_ind_u_u, \n");
 
  563   source.append(
"  __global unsigned int * matrix_dimensions_u_u, \n");
 
  564   source.append(
"  __global "); source.append(numeric_string); source.append(
" * g_R, \n");
 
  565   source.append(
"  __global unsigned int * block_ind_r, \n");
 
  566   source.append(
"  __global unsigned int * matrix_dimensions_r, \n");
 
  567   source.append(
"  __global unsigned int * g_is_update, \n");
 
  568   source.append(
"  unsigned int  block_elems_num) \n");
 
  569   source.append(
"{ \n");
 
  570   source.append(
"    for (unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
 
  571   source.append(
"        if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
 
  573   source.append(
"            assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n");
 
  574   source.append(
"              matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n");
 
  575   source.append(
"              R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n");
 
  577   source.append(
"        } \n");
 
  578   source.append(
"    } \n");
 
  579   source.append(
"} \n");
 
  586 template<
typename NumericT>
 
  596     static std::map<cl_context, bool> init_done;
 
  603       source.reserve(1024);
 
  605       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  617       #ifdef VIENNACL_BUILD_INFO 
  618       std::cout << 
"Creating program " << prog_name << std::endl;
 
  620       ctx.add_program(source, prog_name);
 
  621       init_done[ctx.handle().get()] = 
true;
 
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
void generate_spai_block_r_assembly(StringT &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_spai_block_qr_assembly(StringT &source, std::string const &numeric_string)
Provides OpenCL-related utilities. 
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
void generate_spai_block_q_mult(StringT &source, std::string const &numeric_string)
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const 
void generate_spai_assemble_blocks(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_spai_block_qr(StringT &source, std::string const &numeric_string)
void generate_spai_block_least_squares(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL. 
void generate_spai_block_qr_assembly_1(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_spai_block_bv_assembly(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.