1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP 
   36 template<
typename StringT>
 
   39   source.append(
"__kernel void level_scheduling_substitute( \n");
 
   40   source.append(
"  __global const unsigned int * row_index_array, \n");
 
   41   source.append(
"  __global const unsigned int * row_indices, \n");
 
   42   source.append(
"  __global const unsigned int * column_indices, \n");
 
   43   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * elements, \n");
 
   44   source.append(
"  __global "); source.append(numeric_string); source.append(
" * vec, \n");
 
   45   source.append(
"  unsigned int size) \n");
 
   46   source.append(
"{ \n");
 
   47   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
   48   source.append(
"                    row  < size; \n");
 
   49   source.append(
"                    row += get_global_size(0)) \n");
 
   50   source.append(
"  { \n");
 
   51   source.append(
"    unsigned int eq_row = row_index_array[row]; \n");
 
   52   source.append(
"    "); source.append(numeric_string); source.append(
" vec_entry = vec[eq_row]; \n");
 
   53   source.append(
"    unsigned int row_end = row_indices[row+1]; \n");
 
   55   source.append(
"    for (unsigned int j = row_indices[row]; j < row_end; ++j) \n");
 
   56   source.append(
"      vec_entry -= vec[column_indices[j]] * elements[j]; \n");
 
   58   source.append(
"    vec[eq_row] = vec_entry; \n");
 
   59   source.append(
"  } \n");
 
   60   source.append(
"} \n");
 
   66 template<
typename StringT>
 
   69   source.append(
"__kernel void extract_L_1( \n");
 
   70   source.append(
"  __global unsigned int const *A_row_indices, \n");
 
   71   source.append(
"  __global unsigned int const *A_col_indices, \n");
 
   72   source.append(
"  unsigned int A_size1, \n");
 
   73   source.append(
"  __global unsigned int *L_row_indices) { \n");
 
   75   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
   76   source.append(
"                    row  < A_size1; \n");
 
   77   source.append(
"                    row += get_global_size(0)) \n");
 
   78   source.append(
"  { \n");
 
   79   source.append(
"    unsigned int row_begin = A_row_indices[row]; \n");
 
   80   source.append(
"    unsigned int row_end   = A_row_indices[row+1]; \n");
 
   82   source.append(
"    unsigned int num_entries_L = 0; \n");
 
   83   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
 
   84   source.append(
"      unsigned int col = A_col_indices[j]; \n");
 
   85   source.append(
"      if (col <= row) ++num_entries_L; \n");
 
   86   source.append(
"    } \n");
 
   88   source.append(
"    L_row_indices[row] = num_entries_L;   \n");
 
   89   source.append(
"  } \n");
 
   90   source.append(
"} \n");
 
   93 template<
typename StringT>
 
   96   source.append(
"__kernel void extract_L_2( \n");
 
   97   source.append(
"  __global unsigned int const *A_row_indices, \n");
 
   98   source.append(
"  __global unsigned int const *A_col_indices, \n");
 
   99   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *A_elements, \n");
 
  100   source.append(
"  unsigned int A_size1, \n");
 
  101   source.append(
"  __global unsigned int const *L_row_indices, \n");
 
  102   source.append(
"  __global unsigned int       *L_col_indices, \n");
 
  103   source.append(
"  __global "); source.append(numeric_string); source.append(
" *L_elements) { \n");
 
  105   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  106   source.append(
"                    row  < A_size1; \n");
 
  107   source.append(
"                    row += get_global_size(0)) \n");
 
  108   source.append(
"  { \n");
 
  109   source.append(
"    unsigned int row_begin = A_row_indices[row]; \n");
 
  110   source.append(
"    unsigned int row_end   = A_row_indices[row+1]; \n");
 
  112   source.append(
"    unsigned int index_L = L_row_indices[row]; \n");
 
  113   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
 
  114   source.append(
"      unsigned int col = A_col_indices[j]; \n");
 
  115   source.append(
"      "); source.append(numeric_string); source.append(
" value = A_elements[j]; \n");
 
  117   source.append(
"      if (col <= row) { \n");
 
  118   source.append(
"        L_col_indices[index_L] = col; \n");
 
  119   source.append(
"        L_elements[index_L]    = value; \n");
 
  120   source.append(
"        ++index_L; \n");
 
  121   source.append(
"      } \n");
 
  122   source.append(
"    } \n");
 
  124   source.append(
"  } \n");
 
  125   source.append(
"} \n");
 
  129 template<
typename StringT>
 
  132   source.append(
"__kernel void icc_chow_patel_sweep_kernel( \n");
 
  133   source.append(
"  __global unsigned int const *L_row_indices, \n");
 
  134   source.append(
"  __global unsigned int const *L_col_indices, \n");
 
  135   source.append(
"  __global "); source.append(numeric_string); source.append(
"       *L_elements, \n");
 
  136   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *L_backup, \n");
 
  137   source.append(
"  unsigned int L_size1, \n");
 
  139   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *aij_L) { \n");
 
  141   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  142   source.append(
"                    row  < L_size1; \n");
 
  143   source.append(
"                    row += get_global_size(0)) \n");
 
  144   source.append(
"  { \n");
 
  149   source.append(
"    unsigned int row_Li_start = L_row_indices[row]; \n");
 
  150   source.append(
"    unsigned int row_Li_end   = L_row_indices[row + 1]; \n");
 
  152   source.append(
"    for (unsigned int i = row_Li_start; i < row_Li_end; ++i) { \n");
 
  153   source.append(
"      unsigned int col = L_col_indices[i]; \n");
 
  155   source.append(
"      unsigned int row_Lj_start = L_row_indices[col]; \n");
 
  156   source.append(
"      unsigned int row_Lj_end   = L_row_indices[col + 1]; \n");
 
  158   source.append(
"      unsigned int index_Lj = row_Lj_start; \n");
 
  159   source.append(
"      unsigned int col_Lj = L_col_indices[index_Lj]; \n");
 
  161   source.append(
"      "); source.append(numeric_string); source.append(
" s = aij_L[i]; \n");
 
  162   source.append(
"      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) { \n");
 
  163   source.append(
"        unsigned int col_Li = L_col_indices[index_Li]; \n");
 
  165   source.append(
"        while (col_Lj < col_Li) { \n");
 
  166   source.append(
"          ++index_Lj; \n");
 
  167   source.append(
"          col_Lj = L_col_indices[index_Lj]; \n");
 
  168   source.append(
"        } \n");
 
  170   source.append(
"        if (col_Lj == col_Li) \n");
 
  171   source.append(
"          s -= L_backup[index_Li] * L_backup[index_Lj]; \n");
 
  172   source.append(
"      } \n");
 
  175   source.append(
"      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); \n");
 
  176   source.append(
"    } \n");
 
  178   source.append(
"  } \n");
 
  179   source.append(
"} \n");
 
  185 template<
typename StringT>
 
  188   source.append(
"__kernel void extract_LU_1( \n");
 
  189   source.append(
"  __global unsigned int const *A_row_indices, \n");
 
  190   source.append(
"  __global unsigned int const *A_col_indices, \n");
 
  191   source.append(
"  unsigned int A_size1, \n");
 
  192   source.append(
"  __global unsigned int *L_row_indices, \n");
 
  193   source.append(
"  __global unsigned int *U_row_indices) { \n");
 
  195   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  196   source.append(
"                    row  < A_size1; \n");
 
  197   source.append(
"                    row += get_global_size(0)) \n");
 
  198   source.append(
"  { \n");
 
  199   source.append(
"    unsigned int row_begin = A_row_indices[row]; \n");
 
  200   source.append(
"    unsigned int row_end   = A_row_indices[row+1]; \n");
 
  202   source.append(
"    unsigned int num_entries_L = 0; \n");
 
  203   source.append(
"    unsigned int num_entries_U = 0; \n");
 
  204   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
 
  205   source.append(
"      unsigned int col = A_col_indices[j]; \n");
 
  206   source.append(
"      if (col <= row) ++num_entries_L; \n");
 
  207   source.append(
"      if (col >= row) ++num_entries_U; \n");
 
  208   source.append(
"    } \n");
 
  210   source.append(
"    L_row_indices[row] = num_entries_L;   \n");
 
  211   source.append(
"    U_row_indices[row] = num_entries_U;   \n");
 
  212   source.append(
"  } \n");
 
  213   source.append(
"} \n");
 
  216 template<
typename StringT>
 
  219   source.append(
"__kernel void extract_LU_2( \n");
 
  220   source.append(
"  __global unsigned int const *A_row_indices, \n");
 
  221   source.append(
"  __global unsigned int const *A_col_indices, \n");
 
  222   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *A_elements, \n");
 
  223   source.append(
"  unsigned int A_size1, \n");
 
  224   source.append(
"  __global unsigned int const *L_row_indices, \n");
 
  225   source.append(
"  __global unsigned int       *L_col_indices, \n");
 
  226   source.append(
"  __global "); source.append(numeric_string); source.append(
" *L_elements, \n");
 
  227   source.append(
"  __global unsigned int const *U_row_indices, \n");
 
  228   source.append(
"  __global unsigned int       *U_col_indices, \n");
 
  229   source.append(
"  __global "); source.append(numeric_string); source.append(
" *U_elements) { \n");
 
  231   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  232   source.append(
"                    row  < A_size1; \n");
 
  233   source.append(
"                    row += get_global_size(0)) \n");
 
  234   source.append(
"  { \n");
 
  235   source.append(
"    unsigned int row_begin = A_row_indices[row]; \n");
 
  236   source.append(
"    unsigned int row_end   = A_row_indices[row+1]; \n");
 
  238   source.append(
"    unsigned int index_L = L_row_indices[row]; \n");
 
  239   source.append(
"    unsigned int index_U = U_row_indices[row]; \n");
 
  240   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
 
  241   source.append(
"      unsigned int col = A_col_indices[j]; \n");
 
  242   source.append(
"      "); source.append(numeric_string); source.append(
" value = A_elements[j]; \n");
 
  244   source.append(
"      if (col <= row) { \n");
 
  245   source.append(
"        L_col_indices[index_L] = col; \n");
 
  246   source.append(
"        L_elements[index_L]    = value; \n");
 
  247   source.append(
"        ++index_L; \n");
 
  248   source.append(
"      } \n");
 
  249   source.append(
"      if (col >= row) { \n");
 
  250   source.append(
"        U_col_indices[index_U] = col; \n");
 
  251   source.append(
"        U_elements[index_U]    = value; \n");
 
  252   source.append(
"        ++index_U; \n");
 
  253   source.append(
"      } \n");
 
  254   source.append(
"    } \n");
 
  256   source.append(
"  } \n");
 
  257   source.append(
"} \n");
 
  260 template<
typename StringT>
 
  263   source.append(
"__kernel void ilu_scale_kernel_1( \n");
 
  264   source.append(
"  __global unsigned int const *A_row_indices, \n");
 
  265   source.append(
"  __global unsigned int const *A_col_indices, \n");
 
  266   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *A_elements, \n");
 
  267   source.append(
"  unsigned int A_size1, \n");
 
  268   source.append(
"  __global "); source.append(numeric_string); source.append(
"       *D_elements) { \n");
 
  270   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  271   source.append(
"                    row  < A_size1; \n");
 
  272   source.append(
"                    row += get_global_size(0)) \n");
 
  273   source.append(
"  { \n");
 
  274   source.append(
"    unsigned int row_begin = A_row_indices[row]; \n");
 
  275   source.append(
"    unsigned int row_end   = A_row_indices[row+1]; \n");
 
  277   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
 
  278   source.append(
"      unsigned int col = A_col_indices[j]; \n");
 
  280   source.append(
"      if (col == row) { \n");
 
  281   source.append(
"        D_elements[row] = 1 / sqrt(fabs(A_elements[j])); \n");
 
  282   source.append(
"        break; \n");
 
  283   source.append(
"      } \n");
 
  284   source.append(
"    } \n");
 
  286   source.append(
"  } \n");
 
  287   source.append(
"} \n");
 
  290 template<
typename StringT>
 
  293   source.append(
"__kernel void ilu_scale_kernel_2( \n");
 
  294   source.append(
"  __global unsigned int const *R_row_indices, \n");
 
  295   source.append(
"  __global unsigned int const *R_col_indices, \n");
 
  296   source.append(
"  __global "); source.append(numeric_string); source.append(
"       *R_elements, \n");
 
  297   source.append(
"  unsigned int R_size1, \n");
 
  298   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *D_elements) { \n");
 
  300   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  301   source.append(
"                    row  < R_size1; \n");
 
  302   source.append(
"                    row += get_global_size(0)) \n");
 
  303   source.append(
"  { \n");
 
  304   source.append(
"    unsigned int row_begin = R_row_indices[row]; \n");
 
  305   source.append(
"    unsigned int row_end   = R_row_indices[row+1]; \n");
 
  307   source.append(
"    "); source.append(numeric_string); source.append(
" D_row = D_elements[row]; \n");
 
  308   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) \n");
 
  309   source.append(
"      R_elements[j] *= D_row * D_elements[R_col_indices[j]]; \n");
 
  311   source.append(
"  } \n");
 
  312   source.append(
"} \n");
 
  315 template<
typename StringT>
 
  318   source.append(
"__kernel void ilu_chow_patel_sweep_kernel( \n");
 
  319   source.append(
"  __global unsigned int const *L_row_indices, \n");
 
  320   source.append(
"  __global unsigned int const *L_col_indices, \n");
 
  321   source.append(
"  __global "); source.append(numeric_string); source.append(
"       *L_elements, \n");
 
  322   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *L_backup, \n");
 
  323   source.append(
"  unsigned int L_size1, \n");
 
  325   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *aij_L, \n");
 
  327   source.append(
"  __global unsigned int const *U_trans_row_indices, \n");
 
  328   source.append(
"  __global unsigned int const *U_trans_col_indices, \n");
 
  329   source.append(
"  __global "); source.append(numeric_string); source.append(
"       *U_trans_elements, \n");
 
  330   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *U_trans_backup, \n");
 
  332   source.append(
"  __global "); source.append(numeric_string); source.append(
" const *aij_U_trans) { \n");
 
  334   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  335   source.append(
"                    row  < L_size1; \n");
 
  336   source.append(
"                    row += get_global_size(0)) \n");
 
  337   source.append(
"  { \n");
 
  342   source.append(
"    unsigned int row_L_start = L_row_indices[row]; \n");
 
  343   source.append(
"    unsigned int row_L_end   = L_row_indices[row + 1]; \n");
 
  345   source.append(
"    for (unsigned int j = row_L_start; j < row_L_end; ++j) { \n");
 
  346   source.append(
"      unsigned int col = L_col_indices[j]; \n");
 
  348   source.append(
"      if (col == row) continue; \n");
 
  350   source.append(
"      unsigned int row_U_start = U_trans_row_indices[col]; \n");
 
  351   source.append(
"      unsigned int row_U_end   = U_trans_row_indices[col + 1]; \n");
 
  353   source.append(
"      unsigned int index_U = row_U_start; \n");
 
  354   source.append(
"      unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1; \n");
 
  356   source.append(
"      "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  357   source.append(
"      for (unsigned int k = row_L_start; k < j; ++k) { \n");
 
  358   source.append(
"        unsigned int col_L = L_col_indices[k]; \n");
 
  360   source.append(
"        while (col_U < col_L) { \n");
 
  361   source.append(
"          ++index_U; \n");
 
  362   source.append(
"          col_U = U_trans_col_indices[index_U]; \n");
 
  363   source.append(
"        } \n");
 
  365   source.append(
"        if (col_U == col_L) \n");
 
  366   source.append(
"          sum += L_backup[k] * U_trans_backup[index_U]; \n");
 
  367   source.append(
"      } \n");
 
  370   source.append(
"      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; \n");
 
  371   source.append(
"    } \n");
 
  376   source.append(
"    unsigned int row_U_start = U_trans_row_indices[row]; \n");
 
  377   source.append(
"    unsigned int row_U_end   = U_trans_row_indices[row + 1]; \n");
 
  379   source.append(
"    for (unsigned int j = row_U_start; j < row_U_end; ++j) { \n");
 
  380   source.append(
"      unsigned int col = U_trans_col_indices[j]; \n");
 
  382   source.append(
"      row_L_start = L_row_indices[col]; \n");
 
  383   source.append(
"      row_L_end   = L_row_indices[col + 1]; \n");
 
  386   source.append(
"      unsigned int index_L = row_L_start; \n");
 
  387   source.append(
"      unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1; \n");
 
  388   source.append(
"      "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  389   source.append(
"      for (unsigned int k = row_U_start; k < j; ++k) { \n");
 
  390   source.append(
"        unsigned int col_U = U_trans_col_indices[k]; \n");
 
  393   source.append(
"        while (col_L < col_U) { \n");
 
  394   source.append(
"          ++index_L; \n");
 
  395   source.append(
"          col_L = L_col_indices[index_L]; \n");
 
  396   source.append(
"        } \n");
 
  398   source.append(
"        if (col_U == col_L) \n");
 
  399   source.append(
"          sum += L_backup[index_L] * U_trans_backup[k]; \n");
 
  400   source.append(
"      } \n");
 
  403   source.append(
"      U_trans_elements[j] = aij_U_trans[j] - sum; \n");
 
  404   source.append(
"    } \n");
 
  406   source.append(
"  } \n");
 
  407   source.append(
"} \n");
 
  411 template<
typename StringT>
 
  414   source.append(
"__kernel void ilu_form_neumann_matrix_kernel( \n");
 
  415   source.append(
"  __global unsigned int const *R_row_indices, \n");
 
  416   source.append(
"  __global unsigned int const *R_col_indices, \n");
 
  417   source.append(
"  __global "); source.append(numeric_string); source.append(
" *R_elements, \n");
 
  418   source.append(
"  unsigned int R_size1, \n");
 
  419   source.append(
"  __global "); source.append(numeric_string); source.append(
" *D_elements) { \n");
 
  421   source.append(
"  for (unsigned int row  = get_global_id(0); \n");
 
  422   source.append(
"                    row  < R_size1; \n");
 
  423   source.append(
"                    row += get_global_size(0)) \n");
 
  424   source.append(
"  { \n");
 
  425   source.append(
"    unsigned int row_begin = R_row_indices[row]; \n");
 
  426   source.append(
"    unsigned int row_end   = R_row_indices[row+1]; \n");
 
  429   source.append(
"    "); source.append(numeric_string); source.append(
" diag = D_elements[row]; \n");
 
  430   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
 
  431   source.append(
"      unsigned int col = R_col_indices[j]; \n");
 
  432   source.append(
"      if (col == row) { \n");
 
  433   source.append(
"        diag = R_elements[j]; \n");
 
  434   source.append(
"        R_elements[j] = 0; \n");
 
  435   source.append(
"        break; \n");
 
  436   source.append(
"      } \n");
 
  437   source.append(
"    } \n");
 
  438   source.append(
"    D_elements[row] = diag; \n");
 
  441   source.append(
"    for (unsigned int j=row_begin; j<row_end; ++j) \n");
 
  442   source.append(
"      R_elements[j] /= -diag; \n");
 
  444   source.append(
"  } \n");
 
  445   source.append(
"} \n");
 
  452 template<
class NumericT>
 
  462     static std::map<cl_context, bool> init_done;
 
  469       source.reserve(1024);
 
  471       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  474       if (numeric_string == 
"float" || numeric_string == 
"double")
 
  491       #ifdef VIENNACL_BUILD_INFO 
  492       std::cout << 
"Creating program " << prog_name << std::endl;
 
  494       ctx.add_program(source, prog_name);
 
  495       init_done[ctx.handle().get()] = 
true;
 
void generate_ilu_level_scheduling_substitute(StringT &source, std::string const &numeric_string)
void generate_ilu_extract_LU_2(StringT &source, std::string const &numeric_string)
void generate_icc_extract_L_2(StringT &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities. 
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
void generate_ilu_extract_LU_1(StringT &source)
static void apply(viennacl::ocl::context const &)
void generate_icc_extract_L_1(StringT &source)
const OCL_TYPE & get() const 
Main kernel class for generating OpenCL kernels for incomplete LU factorization preconditioners. 
void generate_ilu_chow_patel_sweep_kernel(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_ilu_scale_kernel_1(StringT &source, std::string const &numeric_string)
void generate_ilu_scale_kernel_2(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL. 
void generate_icc_chow_patel_sweep_kernel(StringT &source, std::string const &numeric_string)
void generate_ilu_form_neumann_matrix_kernel(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation. 
static std::string program_name()