1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP 
   41 template<
typename StringT>
 
   45     source.append(
"__kernel void vec_mul_alpha_beta( \n");
 
   47     source.append(
"__kernel void vec_mul( \n");
 
   48   source.append(
"  const __global int* ell_coords, \n");
 
   49   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
 
   50   source.append(
"  const __global uint* csr_rows, \n");
 
   51   source.append(
"  const __global uint* csr_cols, \n");
 
   52   source.append(
"  const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
 
   53   source.append(
"  const __global "); source.append(numeric_string); source.append(
" * x, \n");
 
   54   source.append(
"  uint4 layout_x, \n");
 
   55   if (with_alpha_beta) { source.append(
"  "); source.append(numeric_string); source.append(
" alpha, \n"); }
 
   56   source.append(
"  __global "); source.append(numeric_string); source.append(
" * result, \n");
 
   57   source.append(
"  uint4 layout_result, \n");
 
   58   if (with_alpha_beta) { source.append(
"  "); source.append(numeric_string); source.append(
" beta, \n"); }
 
   59   source.append(
"  unsigned int row_num, \n");
 
   60   source.append(
"  unsigned int internal_row_num, \n");
 
   61   source.append(
"  unsigned int items_per_row, \n");
 
   62   source.append(
"  unsigned int aligned_items_per_row) \n");
 
   63   source.append(
"{ \n");
 
   64   source.append(
"  uint glb_id = get_global_id(0); \n");
 
   65   source.append(
"  uint glb_sz = get_global_size(0); \n");
 
   67   source.append(
"  for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
 
   68   source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
   70   source.append(
"    uint offset = row_id; \n");
 
   71   source.append(
"    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
 
   72   source.append(
"      "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
 
   74   source.append(
"      if (val != ("); source.append(numeric_string); source.append(
")0) { \n");
 
   75   source.append(
"        int col = ell_coords[offset]; \n");
 
   76   source.append(
"        sum += (x[col * layout_x.y + layout_x.x] * val); \n");
 
   77   source.append(
"      } \n");
 
   79   source.append(
"    } \n");
 
   81   source.append(
"    uint col_begin = csr_rows[row_id]; \n");
 
   82   source.append(
"    uint col_end   = csr_rows[row_id + 1]; \n");
 
   84   source.append(
"    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
 
   85   source.append(
"      sum += (x[csr_cols[item_id] * layout_x.y + layout_x.x] * csr_elements[item_id]); \n");
 
   86   source.append(
"    } \n");
 
   89     source.append(
"    result[row_id * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row_id * layout_result.y + layout_result.x] : 0); \n");
 
   91     source.append(
"    result[row_id * layout_result.y + layout_result.x] = sum; \n");
 
   92   source.append(
"  } \n");
 
   93   source.append(
"} \n");
 
   98   template<
typename StringT>
 
  100                                             bool B_transposed, 
bool B_row_major, 
bool C_row_major)
 
  102     source.append(
"__kernel void ");
 
  104     source.append(
"( \n");
 
  105     source.append(
"  const __global int* ell_coords, \n");
 
  106     source.append(
"  const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
 
  107     source.append(
"  const __global uint* csr_rows, \n");
 
  108     source.append(
"  const __global uint* csr_cols, \n");
 
  109     source.append(
"  const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
 
  110     source.append(
"  unsigned int row_num, \n");
 
  111     source.append(
"  unsigned int internal_row_num, \n");
 
  112     source.append(
"  unsigned int items_per_row, \n");
 
  113     source.append(
"  unsigned int aligned_items_per_row, \n");
 
  114     source.append(
"    __global const "); source.append(numeric_string); source.append(
"* d_mat, \n");
 
  115     source.append(
"    unsigned int d_mat_row_start, \n");
 
  116     source.append(
"    unsigned int d_mat_col_start, \n");
 
  117     source.append(
"    unsigned int d_mat_row_inc, \n");
 
  118     source.append(
"    unsigned int d_mat_col_inc, \n");
 
  119     source.append(
"    unsigned int d_mat_row_size, \n");
 
  120     source.append(
"    unsigned int d_mat_col_size, \n");
 
  121     source.append(
"    unsigned int d_mat_internal_rows, \n");
 
  122     source.append(
"    unsigned int d_mat_internal_cols, \n");
 
  123     source.append(
"    __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  124     source.append(
"    unsigned int result_row_start, \n");
 
  125     source.append(
"    unsigned int result_col_start, \n");
 
  126     source.append(
"    unsigned int result_row_inc, \n");
 
  127     source.append(
"    unsigned int result_col_inc, \n");
 
  128     source.append(
"    unsigned int result_row_size, \n");
 
  129     source.append(
"    unsigned int result_col_size, \n");
 
  130     source.append(
"    unsigned int result_internal_rows, \n");
 
  131     source.append(
"    unsigned int result_internal_cols) { \n");
 
  133     source.append(
"  uint glb_id = get_global_id(0); \n");
 
  134     source.append(
"  uint glb_sz = get_global_size(0); \n");
 
  136     source.append(
"  for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
 
  137     source.append(
"   for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
 
  138     source.append(
"    "); source.append(numeric_string); source.append(
" sum = 0; \n");
 
  140     source.append(
"    uint offset = row_id; \n");
 
  141     source.append(
"    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
 
  142     source.append(
"      "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
 
  144     source.append(
"      if (val != ("); source.append(numeric_string); source.append(
")0) { \n");
 
  145     source.append(
"        int col = ell_coords[offset]; \n");
 
  146     if (B_transposed && B_row_major)
 
  147       source.append(
"      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        col * d_mat_col_inc                        ] * val; \n");
 
  148     else if (B_transposed && !B_row_major)
 
  149       source.append(
"      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc)                       + (d_mat_col_start +        col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
 
  150     else if (!B_transposed && B_row_major)
 
  151       source.append(
"      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + result_col * d_mat_col_inc                        ] * val; \n");
 
  153       source.append(
"      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc)                       + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
 
  154     source.append(
"      } \n");
 
  156     source.append(
"    } \n");
 
  158     source.append(
"    uint col_begin = csr_rows[row_id]; \n");
 
  159     source.append(
"    uint col_end   = csr_rows[row_id + 1]; \n");
 
  161     source.append(
"    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
 
  162     if (B_transposed && B_row_major)
 
  163       source.append(
"      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + csr_cols[item_id] * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
 
  164     else if (B_transposed && !B_row_major)
 
  165       source.append(
"      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc)                       + (d_mat_col_start + csr_cols[item_id] * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
 
  166     else if (!B_transposed && B_row_major)
 
  167       source.append(
"      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        result_col * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
 
  169       source.append(
"      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc)                       + (d_mat_col_start +        result_col * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
 
  170     source.append(
"    } \n");
 
  173       source.append(
"      result[ (result_row_start + row_id * result_row_inc) * result_internal_cols + result_col_start + result_col * result_col_inc ] = sum; \n");
 
  175       source.append(
"      result[ (result_row_start + row_id * result_row_inc)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = sum; \n");
 
  176     source.append(
"   } \n");
 
  177     source.append(
"  } \n");
 
  178     source.append(
"} \n");
 
  182 template<
typename StringT>
 
  200 template<
typename NumericT>
 
  210     static std::map<cl_context, bool> init_done;
 
  217       source.reserve(1024);
 
  219       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  226       #ifdef VIENNACL_BUILD_INFO 
  227       std::cout << 
"Creating program " << prog_name << std::endl;
 
  229       ctx.add_program(source, prog_name);
 
  230       init_done[ctx.handle().get()] = 
true;
 
void generate_hyb_vec_mul(StringT &source, std::string const &numeric_string, bool with_alpha_beta)
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities. 
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
Common implementations shared by OpenCL-based operations. 
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const 
static std::string program_name()
void generate_hyb_matrix_dense_matrix_mul(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Representation of an OpenCL kernel in ViennaCL. 
static void init(viennacl::ocl::context &ctx)
Helper class for converting a type to its string representation. 
void generate_hyb_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
Main kernel class for generating OpenCL kernels for hyb_matrix.