1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP 
   65 template <
typename StringType>
 
   70     source.append(
"  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
 
   71     source.append(
"  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
 
   72     source.append(
"  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
 
   73     source.append(
"    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
 
   77     source.append(
"  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
 
   78     source.append(
"  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
 
   79     source.append(
"  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
 
   80     source.append(
"    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
 
   86       source.append(
"      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
 
   88       source.append(
"      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] ");
 
   91       source.append(
" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
 
   93       source.append(
" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
 
   96       source.append(
"* alpha ");
 
   98       source.append(
"/ alpha ");
 
  102         source.append(
"+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
 
  104         source.append(
"+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
 
  106         source.append(
"* beta");
 
  108         source.append(
"/ beta");
 
  114       source.append(
"    A[row * A_internal_size2 + col] ");
 
  116       source.append(
"    A[row + col * A_internal_size1] ");
 
  119       source.append(
" B[row * B_internal_size2 + col] ");
 
  121       source.append(
" B[row + col * B_internal_size1] ");
 
  124       source.append(
"* alpha ");
 
  126       source.append(
"/ alpha ");
 
  130         source.append(
"+ C[row * C_internal_size2 + col] ");
 
  132         source.append(
"+ C[row + col * C_internal_size2] ");
 
  134         source.append(
"* beta");
 
  136         source.append(
"/ beta");
 
  139   source.append(
"; \n");
 
  142 template <
typename StringType>
 
  145   source.append(
"__kernel void am");
 
  152     source.append(
"_cpu");
 
  154     source.append(
"_gpu");
 
  157     source.append(
"_cpu");
 
  159     source.append(
"_gpu");
 
  160   source.append(
"( \n");
 
  161   source.append(
"  __global "); source.append(numeric_string); source.append(
" * A, \n");
 
  162   source.append(
"  unsigned int A_start1, unsigned int A_start2, \n");
 
  163   source.append(
"  unsigned int A_inc1,   unsigned int A_inc2, \n");
 
  164   source.append(
"  unsigned int A_size1,  unsigned int A_size2, \n");
 
  165   source.append(
"  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  168     source.append(
"  "); source.append(numeric_string); source.append(
" fac2, \n");
 
  172     source.append(
"  __global "); source.append(numeric_string); source.append(
" * fac2, \n");
 
  174   source.append(
"  unsigned int options2, \n");  
 
  175   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * B, \n");
 
  176   source.append(
"  unsigned int B_start1, unsigned int B_start2, \n");
 
  177   source.append(
"  unsigned int B_inc1,   unsigned int B_inc2, \n");
 
  178   source.append(
"  unsigned int B_internal_size1,  unsigned int B_internal_size2");
 
  182     source.append(
", \n\n");
 
  185       source.append(
"  "); source.append(numeric_string); source.append(
" fac3, \n");
 
  189       source.append(
"  __global "); source.append(numeric_string); source.append(
" * fac3, \n");
 
  191     source.append(
"  unsigned int options3, \n");  
 
  192     source.append(
"  __global const "); source.append(numeric_string); source.append(
" * C, \n");
 
  193     source.append(
"  unsigned int C_start1, unsigned int C_start2, \n");
 
  194     source.append(
"  unsigned int C_inc1,   unsigned int C_inc2, \n");
 
  195     source.append(
"  unsigned int C_internal_size1,  unsigned int C_internal_size2 \n");
 
  197   source.append(
") { \n");
 
  201     source.append(
"  "); source.append(numeric_string); source.append(
" alpha = fac2; \n");
 
  205     source.append(
"  "); source.append(numeric_string); source.append(
" alpha = fac2[0]; \n");
 
  207   source.append(
"  if (options2 & (1 << 0)) \n");
 
  208   source.append(
"    alpha = -alpha; \n");
 
  209   source.append(
" \n");
 
  213     source.append(
"  "); source.append(numeric_string); source.append(
" beta = fac3; \n");
 
  217     source.append(
"  "); source.append(numeric_string); source.append(
" beta = fac3[0]; \n");
 
  221     source.append(
"  if (options3 & (1 << 0)) \n");
 
  222     source.append(
"    beta = -beta; \n");
 
  223     source.append(
" \n");
 
  225   source.append(
"  if (options2 & (1 << 1)) { \n");
 
  228     source.append(
"    if (options3 & (1 << 1)) {\n");
 
  230     source.append(
"    } else {\n");
 
  232     source.append(
"    } \n");
 
  236   source.append(
"  } else { \n");
 
  239     source.append(
"    if (options3 & (1 << 1)) {\n");
 
  241     source.append(
"    } else {\n");
 
  243     source.append(
"    } \n");
 
  247   source.append(
"  } \n");
 
  248   source.append(
"} \n");
 
  251 template <
typename StringType>
 
  278 template <
typename StringType>
 
  281   source.append(
"__kernel void assign_cpu( \n");
 
  282   source.append(
"  __global "); source.append(numeric_string); source.append(
" * A, \n");
 
  283   source.append(
"  unsigned int A_start1, unsigned int A_start2, \n");
 
  284   source.append(
"  unsigned int A_inc1,   unsigned int A_inc2, \n");
 
  285   source.append(
"  unsigned int A_size1,  unsigned int A_size2, \n");
 
  286   source.append(
"  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  287   source.append(
"  "); source.append(numeric_string); source.append(
" alpha) \n");
 
  288   source.append(
"{ \n");
 
  291     source.append(
"  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
 
  292     source.append(
"  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
 
  293     source.append(
"  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
 
  294     source.append(
"    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
 
  295     source.append(
"      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
 
  299     source.append(
"  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
 
  300     source.append(
"  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
 
  301     source.append(
"  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
 
  302     source.append(
"    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
 
  303     source.append(
"      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
 
  305   source.append(
"} \n");
 
  308 template <
typename StringType>
 
  311   source.append(
"__kernel void diagonal_assign_cpu( \n");
 
  312   source.append(
"  __global "); source.append(numeric_string); source.append(
" * A, \n");
 
  313   source.append(
"  unsigned int A_start1, unsigned int A_start2, \n");
 
  314   source.append(
"  unsigned int A_inc1,   unsigned int A_inc2, \n");
 
  315   source.append(
"  unsigned int A_size1,  unsigned int A_size2, \n");
 
  316   source.append(
"  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  317   source.append(
"  "); source.append(numeric_string); source.append(
" alpha) \n");
 
  318   source.append(
"{ \n");
 
  319   source.append(
"  for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
 
  321     source.append(
"    A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
 
  323     source.append(
"    A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
 
  324   source.append(
"} \n");
 
  327 template <
typename StringType>
 
  330   source.append(
"__kernel void element_op( \n");
 
  331   source.append(
"  __global "); source.append(numeric_string); source.append(
" * A, \n");
 
  332   source.append(
"  unsigned int A_start1, unsigned int A_start2, \n");
 
  333   source.append(
"  unsigned int A_inc1,   unsigned int A_inc2, \n");
 
  334   source.append(
"  unsigned int A_size1,  unsigned int A_size2, \n");
 
  335   source.append(
"  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  336   source.append(
"  __global "); source.append(numeric_string); source.append(
" * B, \n");
 
  337   source.append(
"  unsigned int B_start1, unsigned int B_start2, \n");
 
  338   source.append(
"  unsigned int B_inc1,   unsigned int B_inc2, \n");
 
  339   source.append(
"  unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
 
  340   source.append(
"  __global "); source.append(numeric_string); source.append(
" * C, \n");
 
  341   source.append(
"  unsigned int C_start1, unsigned int C_start2, \n");
 
  342   source.append(
"  unsigned int C_inc1,   unsigned int C_inc2, \n");
 
  343   source.append(
"  unsigned int C_internal_size1,  unsigned int C_internal_size2, \n");
 
  344   source.append(
"  unsigned int op_type) \n"); 
 
  345   source.append(
"{ \n");
 
  348     source.append(
"  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
 
  349     source.append(
"  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
 
  350     source.append(
"  if (op_type == 2) {");
 
  351     if (numeric_string == 
"float" || numeric_string == 
"double")
 
  353       source.append(
"    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
 
  354       source.append(
"      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
 
  355       source.append(
"        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
 
  356       source.append(
"        pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
 
  357       source.append(
"            C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
 
  359     source.append(
"  } else if (op_type == 1) {");
 
  360     source.append(
"    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
 
  361     source.append(
"      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
 
  362     source.append(
"        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
 
  363     source.append(
"        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
 
  364     source.append(
"        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
 
  365     source.append(
"  } else if (op_type == 0) {");
 
  366     source.append(
"    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
 
  367     source.append(
"      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
 
  368     source.append(
"        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
 
  369     source.append(
"        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
 
  370     source.append(
"        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
 
  375     source.append(
"  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
 
  376     source.append(
"  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
 
  377     source.append(
"  if (op_type == 2) {");
 
  378     if (numeric_string == 
"float" || numeric_string == 
"double")
 
  380       source.append(
"    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
 
  381       source.append(
"      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
 
  382       source.append(
"        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
 
  383       source.append(
"          pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1], \n");
 
  384       source.append(
"              C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]); \n");
 
  386     source.append(
"  } else if (op_type == 1) {");
 
  387     source.append(
"    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
 
  388     source.append(
"      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
 
  389     source.append(
"        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
 
  390     source.append(
"          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] / \n");
 
  391     source.append(
"          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
 
  392     source.append(
"  } else if (op_type == 0) {");
 
  393     source.append(
"    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
 
  394     source.append(
"      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
 
  395     source.append(
"        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = \n");
 
  396     source.append(
"          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] * \n");
 
  397     source.append(
"          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
 
  400   source.append(
"} \n");
 
  404 template<
typename StringT>
 
  408   source.append(
"__kernel void fft_direct(__global "); source.append(numeric_string); source.append(
"2 *input, \n");
 
  409   source.append(
"                         __global "); source.append(numeric_string); source.append(
"2 *output, \n");
 
  410   source.append(
"                         unsigned int size, \n");
 
  411   source.append(
"                         unsigned int stride, \n");
 
  412   source.append(
"                         unsigned int batch_num, \n");
 
  413   source.append(
"                         "); source.append(numeric_string); source.append(
" sign) { \n");
 
  414   source.append(
"    const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
 
  415   source.append(
" \n");
 
  416   source.append(
"    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
 
  417   source.append(
"        for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
 
  418   source.append(
"            "); source.append(numeric_string); source.append(
"2 f = 0.0f; \n");
 
  419   source.append(
" \n");
 
  420   source.append(
"            for (unsigned int n = 0; n < size; n++) { \n");
 
  421   source.append(
"                "); source.append(numeric_string); source.append(
"2 in = ");
 
  423     source.append(
"input[batch_id * stride + n]; \n"); 
 
  425     source.append(
"input[n * stride + batch_id]; \n"); 
 
  426   source.append(
" \n");
 
  427   source.append(
"                "); source.append(numeric_string); source.append(
" sn, cs; \n");
 
  428   source.append(
"                "); source.append(numeric_string); source.append(
" arg = sign * 2 * NUM_PI * k / size * n; \n");
 
  429   source.append(
"                sn = sincos(arg, &cs); \n");
 
  430   source.append(
" \n");
 
  431   source.append(
"                "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
 
  432   source.append(
"                f = f + ("); source.append(numeric_string); source.append(
"2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
 
  433   source.append(
"            } \n");
 
  434   source.append(
" \n");
 
  436     source.append(
"            output[batch_id * stride + k] = f; \n"); 
 
  438     source.append(
"            output[k * stride + batch_id] = f; \n"); 
 
  439   source.append(
"        } \n");
 
  440   source.append(
"    } \n");
 
  441   source.append(
"} \n");
 
  443   source.append(
" \n"); 
 
  445   source.append(
"__kernel void fft_radix2(__global "); source.append(numeric_string); source.append(
"2* input, \n");
 
  446   source.append(
"                         unsigned int s, \n");
 
  447   source.append(
"                         unsigned int bit_size, \n");
 
  448   source.append(
"                         unsigned int size, \n");
 
  449   source.append(
"                         unsigned int stride, \n");
 
  450   source.append(
"                         unsigned int batch_num, \n");
 
  451   source.append(
"                         "); source.append(numeric_string); source.append(
" sign) { \n");
 
  452   source.append(
" \n");
 
  453   source.append(
"    unsigned int ss = 1 << s; \n");
 
  454   source.append(
"    unsigned int half_size = size >> 1; \n");
 
  455   source.append(
" \n");
 
  456   source.append(
"    "); source.append(numeric_string); source.append(
" cs, sn; \n");
 
  457   source.append(
"    const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
 
  458   source.append(
" \n");
 
  459   source.append(
"    unsigned int glb_id = get_global_id(0); \n");
 
  460   source.append(
"    unsigned int glb_sz = get_global_size(0); \n");
 
  462   source.append(
"    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
 
  463   source.append(
"        for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
 
  464   source.append(
"            unsigned int group = (tid & (ss - 1)); \n");
 
  465   source.append(
"            unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
 
  469     source.append(
"            unsigned int offset = batch_id * stride + pos; \n");
 
  470     source.append(
"            "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n"); 
 
  471     source.append(
"            "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss]; \n");
 
  475     source.append(
"            unsigned int offset = pos * stride + batch_id; \n");
 
  476     source.append(
"            "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n"); 
 
  477     source.append(
"            "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss * stride]; \n");
 
  480   source.append(
"            "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
 
  482   source.append(
"            sn = sincos(arg, &cs); \n");
 
  484   source.append(
"            "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
 
  486   source.append(
"            "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
 
  489     source.append(
"            input[offset + ss] = in1 - tmp; \n");
 
  491     source.append(
"            input[offset + ss * stride] = in1 - tmp; \n");
 
  492   source.append(
"            input[offset] = in1 + tmp; \n");
 
  493   source.append(
"        } \n");
 
  494   source.append(
"    } \n");
 
  495   source.append(
"} \n");
 
  497   source.append(
" \n"); 
 
  499   source.append(
" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
 
  500   source.append(
"     v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
 
  501   source.append(
"     v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
 
  502   source.append(
"     v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
 
  503   source.append(
"     v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
 
  504   source.append(
"     v = (v >> 16) | (v << 16); \n");
 
  505   source.append(
"  \n");
 
  506   source.append(
"     v = v >> (32 - bit_size); \n");
 
  507   source.append(
"  \n");
 
  508   source.append(
"     return v; \n");
 
  509   source.append(
" } \n");
 
  511   source.append(
" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append(
"2* input, \n");
 
  512   source.append(
"                                 __local "); source.append(numeric_string); source.append(
"2* lcl_input, \n");
 
  513   source.append(
"                                 unsigned int bit_size, \n");
 
  514   source.append(
"                                 unsigned int size, \n");
 
  515   source.append(
"                                 unsigned int stride, \n");
 
  516   source.append(
"                                 unsigned int batch_num, \n");
 
  517   source.append(
"                                 "); source.append(numeric_string); source.append(
" sign) { \n");
 
  519   source.append(
"     unsigned int grp_id = get_group_id(0); \n");
 
  520   source.append(
"     unsigned int grp_num = get_num_groups(0); \n");
 
  522   source.append(
"     unsigned int lcl_sz = get_local_size(0); \n");
 
  523   source.append(
"     unsigned int lcl_id = get_local_id(0); \n");
 
  524   source.append(
"     const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
 
  526   source.append(
"     for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
 
  529   source.append(
"         for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
 
  530   source.append(
"             unsigned int v = get_reorder_num(p, bit_size); \n");
 
  532     source.append(
"             lcl_input[v] = input[batch_id * stride + p]; \n"); 
 
  534     source.append(
"             lcl_input[v] = input[p * stride + batch_id]; \n"); 
 
  535   source.append(
"         } \n");
 
  537   source.append(
"         barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  540   source.append(
"         for (unsigned int s = 0; s < bit_size; s++) { \n");
 
  541   source.append(
"             unsigned int ss = 1 << s; \n");
 
  543   source.append(
"             "); source.append(numeric_string); source.append(
" cs, sn; \n");
 
  545   source.append(
"             for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
 
  546   source.append(
"                 unsigned int group = (tid & (ss - 1)); \n");
 
  547   source.append(
"                 unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
 
  549   source.append(
"                 "); source.append(numeric_string); source.append(
"2 in1 = lcl_input[pos]; \n");
 
  550   source.append(
"                 "); source.append(numeric_string); source.append(
"2 in2 = lcl_input[pos + ss]; \n");
 
  552   source.append(
"                 "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
 
  554   source.append(
"                 sn = sincos(arg, &cs); \n");
 
  555   source.append(
"                 "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
 
  557   source.append(
"                 "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
 
  559   source.append(
"                 lcl_input[pos + ss] = in1 - tmp; \n");
 
  560   source.append(
"                 lcl_input[pos] = in1 + tmp; \n");
 
  561   source.append(
"             } \n");
 
  563   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  564   source.append(
"         } \n");
 
  567   source.append(
"         for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
 
  569     source.append(
"             input[batch_id * stride + p] = lcl_input[p]; \n");
 
  571     source.append(
"             input[p * stride + batch_id] = lcl_input[p]; \n");
 
  572   source.append(
"         } \n");
 
  573   source.append(
"     } \n");
 
  574   source.append(
" } \n");
 
  576   source.append(
" \n"); 
 
  582   source.append(
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
 
  583   source.append(
"    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
 
  584   source.append(
"    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
 
  585   source.append(
"    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
 
  586   source.append(
"    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
 
  587   source.append(
"    v = (v >> 16) | (v << 16); \n");
 
  589   source.append(
"    v = v >> (32 - bit_size); \n");
 
  591   source.append(
"    return v; \n");
 
  592   source.append(
"} \n");
 
  594   source.append(
"__kernel void fft_reorder(__global "); source.append(numeric_string); source.append(
"2* input, \n");
 
  595   source.append(
"                          unsigned int bit_size, \n");
 
  596   source.append(
"                          unsigned int size, \n");
 
  597   source.append(
"                          unsigned int stride, \n");
 
  598   source.append(
"                          int batch_num) { \n");
 
  600   source.append(
"    unsigned int glb_id = get_global_id(0); \n");
 
  601   source.append(
"    unsigned int glb_sz = get_global_size(0); \n");
 
  603   source.append(
"    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
 
  604   source.append(
"        for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
 
  605   source.append(
"            unsigned int v = get_reorder_num_2(i, bit_size); \n");
 
  607   source.append(
"            if (i < v) {\n");
 
  610     source.append(
"                "); source.append(numeric_string); source.append(
"2 tmp = input[batch_id * stride + i]; \n"); 
 
  611     source.append(
"                input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); 
 
  612     source.append(
"                input[batch_id * stride + v] = tmp; \n"); 
 
  616     source.append(
"                "); source.append(numeric_string); source.append(
"2 tmp = input[i * stride + batch_id]; \n"); 
 
  617     source.append(
"                input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); 
 
  618     source.append(
"                input[v * stride + batch_id] = tmp; \n"); 
 
  620   source.append(
"            } \n");
 
  621   source.append(
"        } \n");
 
  622   source.append(
"    } \n");
 
  623   source.append(
"} \n");
 
  626 template<
typename StringT>
 
  629   source.append(
"__kernel void lu_factorize( \n");
 
  630   source.append(
"          __global "); source.append(numeric_string); source.append(
" * matrix, \n");
 
  631   source.append(
"          unsigned int matrix_rows, \n");
 
  632   source.append(
"          unsigned int matrix_cols, \n");
 
  633   source.append(
"          unsigned int matrix_internal_rows, \n");
 
  634   source.append(
"          unsigned int matrix_internal_cols) \n");
 
  635   source.append(
"{ \n");
 
  636   source.append(
"  "); source.append(numeric_string); source.append(
" temp; \n");
 
  640     source.append(
"  unsigned rowi; \n");
 
  641     source.append(
"  unsigned rowk; \n");
 
  642     source.append(
"  for (unsigned int i=1; i<matrix_rows; ++i) \n");
 
  643     source.append(
"  { \n");
 
  644     source.append(
"    rowi = i * matrix_internal_cols; \n");
 
  645     source.append(
"    for (unsigned int k=0; k<i; ++k) \n");
 
  646     source.append(
"    { \n");
 
  647     source.append(
"      rowk = k * matrix_internal_cols; \n");
 
  648     source.append(
"      if (get_global_id(0) == 0) \n");
 
  649     source.append(
"        matrix[rowi + k] /= matrix[rowk + k]; \n");
 
  651     source.append(
"      barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  652     source.append(
"      temp = matrix[rowi + k]; \n");
 
  655     source.append(
"      for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
 
  656     source.append(
"        matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
 
  660     source.append(
"      for (unsigned int i=1; i<matrix_rows; ++i) \n");
 
  661     source.append(
"      { \n");
 
  662     source.append(
"        for (unsigned int k=0; k<i; ++k) \n");
 
  663     source.append(
"        { \n");
 
  665     source.append(
"          if (get_global_id(0) == 0) \n");
 
  666     source.append(
"            matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
 
  668     source.append(
"          barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  669     source.append(
"          temp = matrix[i + k*matrix_internal_rows]; \n");
 
  672     source.append(
"          for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
 
  673     source.append(
"            matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
 
  681 template<
typename StringT>
 
  684   source.append(
"__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append(
"cpu") : source.append(
"gpu"); source.append(
"( \n");
 
  685   source.append(
"  __global "); source.append(numeric_string); source.append(
" * A, \n");
 
  686   source.append(
"  unsigned int A_start1, unsigned int A_start2, \n");
 
  687   source.append(
"  unsigned int A_inc1,   unsigned int A_inc2, \n");
 
  688   source.append(
"  unsigned int A_size1,  unsigned int A_size2, \n");
 
  689   source.append(
"  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  692     source.append(
"  "); source.append(numeric_string); source.append(
" val, \n");
 
  694     source.append(
"  __global const "); source.append(numeric_string); source.append(
" *val, \n");
 
  696   source.append(
"  unsigned int options2, \n");
 
  698   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * vec1, \n");
 
  699   source.append(
"  unsigned int start1, \n");
 
  700   source.append(
"  unsigned int inc1, \n");
 
  701   source.append(
"  unsigned int size1, \n");
 
  703   source.append(
"  __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
 
  704   source.append(
"  unsigned int start2, \n");
 
  705   source.append(
"  unsigned int inc2, \n");
 
  706   source.append(
"  unsigned int size2) \n");
 
  707   source.append(
"{ \n");
 
  710     source.append(
"  "); source.append(numeric_string); source.append(
" alpha = val; \n");
 
  712     source.append(
"  "); source.append(numeric_string); source.append(
" alpha = val[0]; \n");
 
  714   source.append(
"  if (options2 & (1 << 0)) \n");
 
  715   source.append(
"    alpha = -alpha; \n");
 
  717   source.append(
"  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
 
  718   source.append(
"  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
 
  720   source.append(
"  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
 
  721   source.append(
"  { \n");
 
  722   source.append(
"    "); source.append(numeric_string); source.append(
" tmp = vec1[row * inc1 + start1];");
 
  723   source.append(
"    tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
 
  724   source.append(
"    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
 
  726     source.append(
"      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
 
  728     source.append(
"      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
 
  729   source.append(
"  } \n");
 
  730   source.append(
"} \n");
 
  733 template <
typename StringType>
 
  736   source.append(
"__kernel void trans_vec_mul( \n");
 
  737   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * A, \n");
 
  738   source.append(
"          unsigned int A_row_start, unsigned int A_col_start, \n");
 
  739   source.append(
"          unsigned int A_row_inc, unsigned int A_col_inc, \n");
 
  740   source.append(
"          unsigned int A_row_size, unsigned int A_col_size, \n");
 
  741   source.append(
"          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
 
  742   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * v, \n");
 
  743   source.append(
"          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
 
  744   source.append(
"          __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  745   source.append(
"          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
 
  746   source.append(
"          __local "); source.append(numeric_string); source.append(
" * work) \n");
 
  747   source.append(
"{ \n");
 
  750     source.append(
"  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
 
  751     source.append(
"  { \n");
 
  752     source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  753     source.append(
"    for (unsigned int col = 0; col < A_row_size; ++col) \n");
 
  754     source.append(
"      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
 
  755     source.append(
"    result[row * result_inc + result_start] = dot_prod; \n");
 
  759     source.append(
"  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
 
  760     source.append(
"  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
 
  761     source.append(
"  unsigned int lid = get_local_id(0); \n");
 
  763     source.append(
"  for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
 
  764     source.append(
"  { \n");
 
  765     source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  766     source.append(
"    for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
 
  767     source.append(
"      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
 
  768     source.append(
"    work[lid] = dot_prod; \n");
 
  770     source.append(
"    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
 
  771     source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  772     source.append(
"      if(lid < stride) \n");
 
  773     source.append(
"        work[lid] += work[lid+stride]; \n");
 
  774     source.append(
"    } \n");
 
  776     source.append(
"    if(lid == 0) \n");
 
  777     source.append(
"      result[row * result_inc + result_start] = work[0]; \n");
 
  779   source.append(
"  } \n");
 
  780   source.append(
"} \n");
 
  783 template<
typename StringT>
 
  786   source.append(
"__kernel void triangular_substitute_inplace( \n");
 
  787   source.append(
"          __global "); source.append(numeric_string); source.append(
" * A, \n");
 
  788   source.append(
"          unsigned int A_start1, unsigned int A_start2, \n");
 
  789   source.append(
"          unsigned int A_inc1,   unsigned int A_inc2, \n");
 
  790   source.append(
"          unsigned int A_size1,  unsigned int A_size2, \n");
 
  791   source.append(
"          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  792   source.append(
"          __global "); source.append(numeric_string); source.append(
" * v, \n");
 
  793   source.append(
"          unsigned int v_start, \n");
 
  794   source.append(
"          unsigned int v_inc, \n");
 
  795   source.append(
"          unsigned int v_size, \n");
 
  796   source.append(
"          unsigned int options) \n");
 
  797   source.append(
"{ \n");
 
  798   source.append(
"  "); source.append(numeric_string); source.append(
" temp; \n");
 
  799   source.append(
"  unsigned int unit_diagonal_flag  = (options & (1 << 0)); \n");
 
  800   source.append(
"  unsigned int transposed_access_A = (options & (1 << 1)); \n");
 
  801   source.append(
"  unsigned int is_lower_solve      = (options & (1 << 2)); \n");
 
  802   source.append(
"  unsigned int row; \n");
 
  803   source.append(
"  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  \n");   
 
  804   source.append(
"  { \n");
 
  805   source.append(
"    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
 
  806   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  807   source.append(
"    if (!unit_diagonal_flag) \n");
 
  808   source.append(
"    { \n");
 
  809   source.append(
"      if (get_global_id(0) == 0) \n");
 
  811     source.append(
"        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
 
  813     source.append(
"        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
 
  814   source.append(
"   } \n");
 
  816   source.append(
"    barrier(CLK_GLOBAL_MEM_FENCE); \n");
 
  818   source.append(
"    temp = v[row * v_inc + v_start]; \n");
 
  820   source.append(
"    for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
 
  821   source.append(
"             elim < (is_lower_solve ? A_size1 : row); \n");
 
  822   source.append(
"             elim += get_global_size(0)) \n");
 
  825     source.append(
"      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
 
  826     source.append(
"                                                                : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))]; \n");
 
  830     source.append(
"      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
 
  831     source.append(
"                                                                : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)]; \n");
 
  833   source.append(
"  } \n");
 
  834   source.append(
"} \n");
 
  837 template <
typename StringT>
 
  840   source.append(
"__kernel void trans_kernel(\n");
 
  841   source.append(
"           __global const ");source.append(numeric_string);source.append(
" * A, \n");
 
  842   source.append(
"           unsigned int A_start1,          unsigned int A_start2, \n");
 
  843   source.append(
"           unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
 
  844   source.append(
"           unsigned int A_size1,           unsigned int A_size2, \n");
 
  845   source.append(
"           unsigned int A_stride1,         unsigned int A_stride2, \n");
 
  846   source.append(
"           __global ");source.append(numeric_string);source.append(
" * B, \n");
 
  847   source.append(
"           unsigned int B_start1,          unsigned int B_start2, \n");
 
  848   source.append(
"           unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
 
  849   source.append(
"           unsigned int B_stride1,         unsigned int B_stride2) \n");
 
  850   source.append(
"{ \n");
 
  851   source.append(
"  for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
 
  852   source.append(
"  {  \n");
 
  853   source.append(
"    for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
 
  854   source.append(
"    {  \n");
 
  856     source.append(
"      B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];  \n");
 
  858     source.append(
"      B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];  \n");
 
  859   source.append(
"    } \n");
 
  860   source.append(
"  } \n");
 
  861   source.append(
"}  \n");
 
  864 template <
typename StringType>
 
  867   source.append(
"__kernel void vec_mul( \n");
 
  868   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * A, \n");
 
  869   source.append(
"          unsigned int A_row_start, unsigned int A_col_start, \n");
 
  870   source.append(
"          unsigned int A_row_inc, unsigned int A_col_inc, \n");
 
  871   source.append(
"          unsigned int A_row_size, unsigned int A_col_size, \n");
 
  872   source.append(
"          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
 
  873   source.append(
"          __global const "); source.append(numeric_string); source.append(
" * v, \n");
 
  874   source.append(
"          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
 
  875   source.append(
"          __global "); source.append(numeric_string); source.append(
" * result, \n");
 
  876   source.append(
"          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
 
  877   source.append(
"          __local "); source.append(numeric_string); source.append(
" * work) \n");
 
  878   source.append(
"{ \n");
 
  881     source.append(
"  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
 
  882     source.append(
"  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
 
  883     source.append(
"  unsigned int lid = get_local_id(0); \n");
 
  885     source.append(
"  for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
 
  886     source.append(
"  { \n");
 
  887     source.append(
"    "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  888     source.append(
"    for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
 
  889     source.append(
"      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
 
  890     source.append(
"    work[lid] = dot_prod; \n");
 
  892     source.append(
"    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
 
  893     source.append(
"      barrier(CLK_LOCAL_MEM_FENCE); \n");
 
  894     source.append(
"      if(lid < stride) \n");
 
  895     source.append(
"        work[lid] += work[lid+stride]; \n");
 
  896     source.append(
"    } \n");
 
  898     source.append(
"    if(lid == 0) \n");
 
  899     source.append(
"      result[row * result_inc + result_start] = work[0]; \n");
 
  904     source.append(
"    for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
 
  905     source.append(
"    { \n");
 
  906     source.append(
"      "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
 
  907     source.append(
"      for (unsigned int col = 0; col < A_col_size; ++col) \n");
 
  908     source.append(
"        dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
 
  909     source.append(
"      result[row * result_inc + result_start] = dot_prod; \n");
 
  911   source.append(
"  } \n");
 
  912   source.append(
"} \n");
 
  925 template <
typename NumericT, 
typename F>
 
  939     static std::map<cl_context, bool> init_done;
 
  943       source.reserve(8192);
 
  945       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
  958       #ifdef VIENNACL_BUILD_INFO 
  959       std::cout << 
"Creating program " << prog_name << std::endl;
 
  961       ctx.add_program(source, prog_name);
 
  962       init_done[ctx.handle().get()] = 
true;
 
  968 template<
typename NumericT>
 
  976     std::pair<bool, cl_context> key(is_row_major, h);
 
  977     if (handlers_map.find(key) == handlers_map.end())
 
  981       namespace ds = viennacl::device_specific;
 
  984       handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
 
  987       ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 
'N', 
'N');
 
  988       ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 
'T', 
'N');
 
  989       ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 
'N', 
'T');
 
  990       ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 
'T', 
'T');
 
 1005       handler.add(
"prod_NN", ds::matrix_product_template(matrix_product_params_NN, 
'N', 
'N'), 
scheduler::preset::mat_mat_prod(alpha, &A, 
false, &B, 
false, beta, &C));
 
 1006       handler.add(
"prod_TN", ds::matrix_product_template(matrix_product_params_TN, 
'T', 
'N'), 
scheduler::preset::mat_mat_prod(alpha, &A, 
true, &B, 
false, beta, &C));
 
 1007       handler.add(
"prod_NT", ds::matrix_product_template(matrix_product_params_NT, 
'N', 
'T'), 
scheduler::preset::mat_mat_prod(alpha, &A, 
false, &B, 
true, beta, &C));
 
 1008       handler.add(
"prod_TT", ds::matrix_product_template(matrix_product_params_TT, 
'T', 
'T'), 
scheduler::preset::mat_mat_prod(alpha, &A, 
true, &B, 
true, beta, &C));
 
 1017 template<
typename NumericT, 
typename LayoutT>
 
 1027     static std::map<cl_context, bool> init_done;
 
 1035       source.reserve(8192);
 
 1037       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
 1043       if (numeric_string == 
"float" || numeric_string == 
"double")
 
 1046         generate_lu(source, numeric_string, is_row_major);
 
 1052       #ifdef VIENNACL_BUILD_INFO 
 1053       std::cout << 
"Creating program " << prog_name << std::endl;
 
 1055       ctx.add_program(source, prog_name);
 
 1056       init_done[ctx.handle().get()] = 
true;
 
 1064 template<
typename StringT>
 
 1067  source.append(
" __kernel void convert_row_" + dest_type + 
"_" + src_type + 
"( \n");
 
 1068  source.append(
"  __global " + dest_type + 
" * dest, \n");
 
 1069  source.append(
"  unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
 
 1070  source.append(
"  unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
 
 1071  source.append(
"  __global const " + src_type + 
" * src, \n");
 
 1072  source.append(
"  unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
 
 1073  source.append(
"  unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
 
 1074  source.append(
"  { \n");
 
 1075  source.append(
"   for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
 
 1076  source.append(
"     for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
 
 1077  source.append(
"       dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
 
 1078  source.append(
"  } \n");
 
 1081 template<
typename StringT>
 
 1084   source.append(
" __kernel void convert_col_" + dest_type + 
"_" + src_type + 
"( \n");
 
 1085   source.append(
"  __global " + dest_type + 
" * dest, \n");
 
 1086   source.append(
"  unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
 
 1087   source.append(
"  unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
 
 1088   source.append(
"  __global const " + src_type + 
" * src, \n");
 
 1089   source.append(
"  unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
 
 1090   source.append(
"  unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
 
 1091   source.append(
"  { \n");
 
 1092   source.append(
"   for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
 
 1093   source.append(
"     for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
 
 1094   source.append(
"       dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
 
 1095   source.append(
"  } \n");
 
 1098 template<
typename StringT>
 
 1112     return "matrix_convert";
 
 1117     static std::map<cl_context, bool> init_done;
 
 1121       source.reserve(4096);
 
 1177       #ifdef VIENNACL_BUILD_INFO 
 1178       std::cout << 
"Creating program " << prog_name << std::endl;
 
viennacl::ocl::device const & current_device() const 
Returns the current device. 
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_ambm_impl(StringType &source, std::string const &numeric_string, ambm_config const &cfg)
Helper class for checking whether a matrix has a row-major layout. 
void append_double_precision_pragma< double >(viennacl::ocl::context const &ctx, std::string &source)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
Provides OpenCL-related utilities. 
A class representing a compute device (e.g. a GPU) 
void generate_element_op(StringType &source, std::string const &numeric_string, bool is_row_major)
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
static void init(viennacl::ocl::context &ctx)
void generate_vec_mul(StringT &source, std::string const &numeric_string)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
static void apply(viennacl::ocl::context const &)
bool with_stride_and_range
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context. 
const OCL_TYPE & get() const 
void generate_ambm(StringType &source, std::string const &numeric_string, bool is_row_major)
static std::string program_name()
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
bool double_support() const 
ViennaCL convenience function: Returns true if the device supports double precision. 
void generate_trans_vec_mul(StringType &source, std::string const &numeric_string, bool is_row_major)
static std::string program_name()
Configuration struct for generating OpenCL kernels for linear combinations of matrices. 
void generate_ambm_impl2(StringType &source, ambm_config const &cfg, bool mult_alpha, bool mult_beta)
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
Main kernel class for vector conversion routines (e.g. convert vector to vector). 
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Representation of an OpenCL kernel in ViennaCL. 
std::string type_to_string(viennacl::row_major)
void generate_matrix_convert_col(StringT &source, std::string const &dest_type, std::string const &src_type)
A tag for column-major storage of a dense matrix. 
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations. 
void generate_diagonal_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
void generate_matrix_convert(StringT &source, std::string const &dest_type, std::string const &src_type)
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version. 
static void init(viennacl::ocl::context &ctx)
Helper class for converting a type to its string representation. 
A tag for row-major storage of a dense matrix. 
static std::string program_name()
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc. 
void generate_matrix_convert_row(StringT &source, std::string const &dest_type, std::string const &src_type)