#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_kernel(DestNumericT      * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
                               SrcNumericT const * src,  unsigned int start_src,  unsigned int inc_src)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size_dest;
                    i += gridDim.x * blockDim.x)
    dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
}
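// Illustrative sketch only (not part of the original header): a minimal host-side wrapper that
// launches convert_kernel on raw device pointers. The 128 x 128 launch configuration mirrors the
// other kernel launches in this file; the wrapper name and raw-pointer interface are assumptions.
template<typename DestNumericT, typename SrcNumericT>
void convert_raw_sketch(DestNumericT * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
                        SrcNumericT const * src, unsigned int start_src, unsigned int inc_src)
{
  convert_kernel<<<128, 128>>>(dest, start_dest, inc_dest, size_dest,
                               src,  start_src,  inc_src);
}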
 
template<typename DestNumericT, typename SrcNumericT>
 
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          const NumericT * fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
 
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          NumericT fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
 
template<typename NumericT, typename ScalarType1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;
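// A minimal sketch (assumption, for illustration only) of how the option bits produced by
// detail::make_options() line up with the tests performed in the kernels above: bit 0 requests a
// sign flip of the scalar, bit 1 requests its reciprocal, and the higher bits carry the length of
// the scalar argument. The actual helper lives elsewhere in ViennaCL; this is not its definition.
inline unsigned int make_options_sketch(vcl_size_t length, bool reciprocal, bool flip_sign)
{
  return static_cast<unsigned int>(length << 2)   // payload: length of the scalar argument
         | (reciprocal ? (1u << 1) : 0u)          // checked via (options & (1 << 1)) in the kernels
         | (flip_sign  ? (1u << 0) : 0u);         // checked via (options & (1 << 0)) in the kernels
}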
 
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;
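// Usage sketch (public ViennaCL API, shown for context; not code from this file): avbv() above
// implements v1 = alpha * v2 + beta * v3 for all combinations of sign flips and reciprocals of
// the two scalars. At the library level it is reached through vector expressions such as:
//
//   viennacl::vector<float> v1(n), v2(n), v3(n);
//   v1 = 2.0f * v2 - v3 / 3.0f;   // dispatches to avbv()/avbv_kernel on the CUDA backend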
 
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
 
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;
 
template<typename NumericT>
__global__ void vector_assign_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int internal_size1,
                                     NumericT alpha)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < internal_size1;
                    i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
}
 
template<typename NumericT, typename ScalarT1>
void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
{
  typedef NumericT value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;
 
template<typename NumericT>
__global__ void vector_swap_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT tmp;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size1;
                    i += gridDim.x * blockDim.x)
  {
    tmp = vec2[i*inc2+start2];
    vec2[i*inc2+start2] = vec1[i*inc1+start1];
    vec1[i*inc1+start1] = tmp;
  }
}
 
template<typename NumericT>
 
template<typename NumericT>
__global__ void element_op_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                  NumericT const * vec3, unsigned int start3, unsigned int inc3,
                                  unsigned int op_type)
{
  if (op_type == 2)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
  }
  else if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
 
template<typename NumericT>
__global__ void element_op_int_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                      NumericT const * vec3, unsigned int start3, unsigned int inc3,
                                      unsigned int op_type)
{
  if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
 
template<typename NumericT, typename OpT>

  unsigned int op_type = 2;

template<typename OpT>

  unsigned int op_type = 2;

template<typename OpT>

  unsigned int op_type = 2;
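// The host-side element_op() overloads above dispatch the binary element-wise operation through a
// small integer code. The mapping is visible in the kernels further up: op_type == 2 selects pow(),
// op_type == 1 element-wise division, op_type == 0 element-wise multiplication. A hypothetical
// dispatch helper illustrating that encoding (tag types are placeholders, not ViennaCL types):
struct tag_prod {};
struct tag_div  {};
struct tag_pow  {};
template<typename OpT> struct element_op_code           { static const unsigned int value = 2; }; // pow
template<>             struct element_op_code<tag_div>  { static const unsigned int value = 1; }; // element_div
template<>             struct element_op_code<tag_prod> { static const unsigned int value = 0; }; // element_prod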
 
template<typename NumericT>
__global__ void vec_element_acos_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_asin_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_atan_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_ceil_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_cos_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_cosh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_exp_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_fabs_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_abs_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_floor_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_log_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_log10_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_sin_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_sinh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_sqrt_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_tan_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
}

template<typename NumericT>
 
template<typename NumericT>
__global__ void vec_element_tanh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
}

template<typename NumericT>
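// Illustrative sketch (not the original host wrapper): the unary element-wise kernels above are
// all launched the same way. The helper below shows the pattern for vec_element_cos_kernel on raw
// device pointers; the 128 x 128 launch configuration mirrors the other launches in this file and
// the helper name is an assumption.
template<typename NumericT>
void element_cos_raw_sketch(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  vec_element_cos_kernel<<<128, 128>>>(vec1, start1, inc1, size1, vec2, start2, inc2);
}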
 
template<typename NumericT>
__global__ void inner_prod_kernel(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  const NumericT * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                  NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];
  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;

  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
                               - (  blockIdx.x * size1) / (gridDim.x);

  NumericT tmp = 0;
  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
  tmp_buffer[threadIdx.x] = tmp;

  // parallel reduction within the block:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
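// Illustrative sketch (assumption, not the original code): inner_prod_kernel only produces one
// partial result per CUDA block in group_buffer, so a second, small reduction is still required.
// On the host it could look like the helper below; the device-side variant in this file uses the
// vector_sum_kernel_* kernels instead.
template<typename NumericT>
NumericT finish_inner_prod_on_host_sketch(NumericT const * partial_sums, unsigned int num_groups)
{
  NumericT result = 0;
  for (unsigned int i = 0; i < num_groups; ++i)
    result += partial_sums[i];
  return result;
}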
 
template<typename NumericT>
__global__ void vector_sum_kernel_floats(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
          unsigned int option, // 0: use max, 1: just sum, 2: sum and return sqrt of sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}
 
template<typename NumericT>
__global__ void vector_sum_kernel_integers(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
          unsigned int option, // 0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
 
template<typename NumericT>
__global__ void vector_sum_kernel_unsigned_integers(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
          unsigned int option, // 0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
 
  struct vector_sum_kernel_launcher_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                             static_cast<unsigned int>(viennacl::traits::start(temp)),
                                             static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                             static_cast<unsigned int>(viennacl::traits::size(temp)),
                                             static_cast<unsigned int>(option),
                                             viennacl::cuda_arg(result));
    }
  };

  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
                                                      static_cast<unsigned int>(option),
                                                      viennacl::cuda_arg(result));
    }
  };

  struct vector_sum_kernel_launcher_floats
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
                                           static_cast<unsigned int>(viennacl::traits::start(temp)),
                                           static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                           static_cast<unsigned int>(viennacl::traits::size(temp)),
                                           static_cast<unsigned int>(option),
                                           viennacl::cuda_arg(result));
    }
  };
 
  template<typename NumericT>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned int>   : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};

  template<>
  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
 
template<typename NumericT, typename ScalarT>
void inner_prod_impl(vector_base<NumericT> const & vec1,
                     vector_base<NumericT> const & vec2,
                     ScalarT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  // first stage: inner_prod_kernel writes one partial sum per block into 'temp';
  // second stage: sum the partial results on the device:
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}
 
template<typename NumericT>
void inner_prod_cpu(vector_base<NumericT> const & vec1,
                    vector_base<NumericT> const & vec2,
                    NumericT & result)
{
  typedef NumericT value_type;

  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  // first stage: inner_prod_kernel writes one partial sum per block into 'temp';
  // the partial results are then copied to the host and summed there:
  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}
 
#define VIENNACL_MDOT_WORKGROUP_SIZE  128
#define VIENNACL_MDOT_WORKGROUP_NUM   128
template<typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond the size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  tmp_buffer[threadIdx.x]              = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}
 
template<typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond the size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}
 
template<typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond the size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}
 
template<typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond the size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}
 
template<typename NumericT>
__global__ void vector_multi_sum_kernel(NumericT const * vec1,
                                        NumericT * result,
                                        unsigned int start_result,
                                        unsigned int inc_result)
{
  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_NUM];

  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * blockDim.x];

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
}
 
template<typename NumericT>

  while (vec_tuple.const_size() > current_index)
  {
    switch (vec_tuple.const_size() - current_index)
    {
        vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),

        vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),

        vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),

        vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),
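// Illustrative sketch (assumption, not the original dispatch code): computing two inner products
// <x,y0> and <x,y1> with the kernels above takes two launches. The first produces one partial sum
// per block and per right-hand-side vector; the second sums each group of partial results.
// Pointer/offset parameters follow the kernel signatures above; the helper name is hypothetical.
template<typename NumericT>
void two_inner_prods_raw_sketch(NumericT const * x,  unsigned int sizex,
                                NumericT const * y0, NumericT const * y1,
                                NumericT * partial,   // length: 2 * VIENNACL_MDOT_WORKGROUP_NUM
                                NumericT * results)   // length: 2
{
  inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
      x,  0u, 1u, sizex,
      y0, 0u, 1u,
      y1, 0u, 1u,
      partial);
  vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(partial, results, 0u, 1u);
}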
 
#undef VIENNACL_MDOT_WORKGROUP_NUM
#undef VIENNACL_MDOT_WORKGROUP_SIZE
template<typename NumericT>
__global__ void norm_kernel_floats(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   unsigned int norm_selector, NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += fabs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 2) //norm_2
  {
    NumericT vec_entry = 0;
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
    {
      vec_entry = vec[i*inc1 + start1];
      tmp += vec_entry * vec_entry;
    }
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
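// Summary of the norm_selector codes used by the norm_kernel_* family, as read off the branches
// above: 0 -> supremum norm (max |x_i|), 1 -> l^1 norm (sum |x_i|), 2 -> sum of squares (the
// caller applies the final square root), 3 -> minimum entry, 4 -> maximum entry.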
 
template<typename NumericT>
__global__ void norm_kernel_integers(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                     unsigned int norm_selector, NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
 
template<typename NumericT>
__global__ void norm_kernel_unsigned_integers(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                              unsigned int norm_selector, NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += vec[i*inc1 + start1];
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
 
  struct norm_kernel_launcher_integers
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      norm_kernel_integers<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                         static_cast<unsigned int>(option),
                                         viennacl::cuda_arg(temp));
    }
  };

  struct norm_kernel_launcher_unsigned_integers
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      norm_kernel_unsigned_integers<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                                  static_cast<unsigned int>(option),
                                                  viennacl::cuda_arg(temp));
    }
  };

  struct norm_kernel_launcher_floats
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      norm_kernel_floats<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       static_cast<unsigned int>(option),
                                       viennacl::cuda_arg(temp));
    }
  };
 
  template<typename NumericT>
  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};

  template<>
  struct norm_kernel_launcher<unsigned char>  : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned int>   : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned long>  : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<float>  : public norm_kernel_launcher_floats {};

  template<>
  struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
 
template<typename NumericT>
void norm_1_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  viennacl::vector<NumericT> temp(128, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}
 
template<typename NumericT>
void norm_1_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}
 
template<typename NumericT>
void norm_2_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  viennacl::vector<NumericT> temp(128, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 2, result);
}
 
template<typename NumericT>
void norm_2_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
  result = std::sqrt(result);
}
 
template<typename NumericT>
void norm_inf_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  viennacl::vector<NumericT> temp(128, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 0, result);
}
 
template<typename NumericT>
void norm_inf_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = (*it > result) ? *it : result;
}
 
template<typename NumericT>
__global__ void vector_maxmin_kernel(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
          unsigned int option, // 0: compute maximum, 1: compute minimum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_minmax = vec1[start1];
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0) //min
      thread_minmax = (vec1[i*inc1+start1] < thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
    else //max
      thread_minmax = (vec1[i*inc1+start1] > thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
  }

  tmp_buffer[threadIdx.x] = thread_minmax;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0) //min
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
      else //max
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
 
template<typename NumericT>
void max_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  viennacl::vector<NumericT> temp(128, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);

  vector_maxmin_kernel<<<1, 128>>>(viennacl::cuda_arg(temp),
                                   static_cast<unsigned int>(viennacl::traits::start(temp)),
                                   static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                   static_cast<unsigned int>(viennacl::traits::size(temp)),
                                   static_cast<unsigned int>(0),
                                   viennacl::cuda_arg(result));
}
 
template<typename NumericT>
void max_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = temp_cpu[0];
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = (*it > result) ? *it : result;
}
 
template<typename NumericT>
void min_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  viennacl::vector<NumericT> temp(128, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);

  vector_maxmin_kernel<<<1, 128>>>(viennacl::cuda_arg(temp),
                                   static_cast<unsigned int>(viennacl::traits::start(temp)),
                                   static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                   static_cast<unsigned int>(viennacl::traits::size(temp)),
                                   static_cast<unsigned int>(1),
                                   viennacl::cuda_arg(result));
}
 
template<typename NumericT>
void min_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = temp_cpu[0];
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = (*it < result) ? *it : result;
}
 
template<typename NumericT>

template<typename NumericT>
 
template<typename NumericT>
__device__ inline NumericT       cuda_abs(NumericT val) { return (val < 0) ? -val : val; }

__device__ inline unsigned long  cuda_abs(unsigned long  val) { return val; }
__device__ inline unsigned int   cuda_abs(unsigned int   val) { return val; }
__device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
__device__ inline unsigned char  cuda_abs(unsigned char  val) { return val; }
 
template<typename NumericT>
__global__ void index_norm_inf_kernel(const NumericT * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      unsigned int * result)
{
  __shared__ NumericT float_buffer[128];
  __shared__ unsigned int index_buffer[128];

  float_buffer[threadIdx.x] = 0;
  index_buffer[threadIdx.x] = 0;

  //step 1: fill buffer with the thread-local maximum and its index:
  NumericT cur_max = 0;
  NumericT tmp;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    tmp = vec[i*inc1+start1];
    tmp = cuda_abs(tmp);
    if (cur_max < tmp)
    {
      float_buffer[threadIdx.x] = tmp;
      index_buffer[threadIdx.x] = i;
      cur_max = tmp;
    }
  }

  //step 2: parallel reduction over the block:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
      {
        index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
        float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
      }
    }
  }

  if (threadIdx.x == 0)
    *result = index_buffer[0];
}
 
template<typename NumericT>
vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
{
  viennacl::backend::mem_handle h;
  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));

  index_norm_inf_kernel<<<1, 128>>>(viennacl::cuda_arg(vec1),
                                    static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                    static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                    static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                    viennacl::cuda_arg<unsigned int>(h));

  unsigned int ret = 0;
  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
  return static_cast<vcl_size_t>(ret);
}
 
template<typename NumericT>
__global__ void plane_rotation_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                      NumericT alpha, NumericT beta)
{
  NumericT tmp1 = 0;
  NumericT tmp2 = 0;

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
  {
    tmp1 = vec1[i*inc1+start1];
    tmp2 = vec2[i*inc2+start2];

    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
  }
}
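// Usage sketch (public ViennaCL API, for context; not code from this file): applying a Givens/plane
// rotation, i.e. (x, y) <- (alpha*x + beta*y, alpha*y - beta*x), dispatches to plane_rotation_kernel
// on the CUDA backend:
//
//   viennacl::linalg::plane_rotation(x, y, alpha, beta);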
 
template<typename NumericT>
void plane_rotation(vector_base<NumericT> & vec1,
                    vector_base<NumericT> & vec2,
                    NumericT alpha, NumericT beta)
{
  typedef NumericT value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    temporary_beta = beta;
 
template<typename NumericT>
__global__ void scan_kernel_1(NumericT const *X, unsigned int startX, unsigned int incX, unsigned int sizeX,
                              NumericT *Y, unsigned int startY, unsigned int incY,
                              unsigned int scan_offset, // 0 for inclusive scan, 1 for exclusive scan
                              NumericT *carries)
{
  __shared__ NumericT shared_buffer[256];
  NumericT my_value;

  unsigned int work_per_thread = (sizeX - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int block_start = work_per_thread * blockDim.x *  blockIdx.x;
  unsigned int block_stop  = work_per_thread * blockDim.x * (blockIdx.x + 1);
  unsigned int block_offset = 0;

  // run scan on each section assigned to this block:
  for (unsigned int i = block_start + threadIdx.x; i < block_stop; i += blockDim.x)
  {
    // load data:
    my_value = (i < sizeX) ? X[i * incX + startX] : 0;

    // inclusive scan in shared buffer:
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      __syncthreads();
      shared_buffer[threadIdx.x] = my_value;
      __syncthreads();
      if (threadIdx.x >= stride)
        my_value += shared_buffer[threadIdx.x - stride];
    }
    __syncthreads();
    shared_buffer[threadIdx.x] = my_value;
    __syncthreads();

    // an exclusive scan shifts each entry by one:
    if (scan_offset > 0)
      my_value = (threadIdx.x > 0) ? shared_buffer[threadIdx.x - 1] : 0;

    // write to output array:
    if (i < sizeX)
      Y[i * incY + startY] = block_offset + my_value;

    block_offset += shared_buffer[blockDim.x-1];
  }

  // write carry for the second phase:
  if (threadIdx.x == 0)
    carries[blockIdx.x] = block_offset;
}
 
template<typename NumericT>
__global__ void scan_kernel_2(NumericT *carries)
{
  __shared__ NumericT shared_buffer[256];

  // load data:
  NumericT my_carry = carries[threadIdx.x];

  // exclusive scan in shared buffer:
  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
  {
    __syncthreads();
    shared_buffer[threadIdx.x] = my_carry;
    __syncthreads();
    if (threadIdx.x >= stride)
      my_carry += shared_buffer[threadIdx.x - stride];
  }
  __syncthreads();
  shared_buffer[threadIdx.x] = my_carry;
  __syncthreads();

  // write to output array:
  carries[threadIdx.x] = (threadIdx.x > 0) ? shared_buffer[threadIdx.x - 1] : 0;
}
 
template<typename NumericT>
__global__ void scan_kernel_3(NumericT *Y,
                              unsigned int startY,
                              unsigned int incY,
                              unsigned int sizeY,
                              NumericT const *carries)
{
  unsigned int work_per_thread = (sizeY - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int block_start = work_per_thread * blockDim.x *  blockIdx.x;
  unsigned int block_stop  = work_per_thread * blockDim.x * (blockIdx.x + 1);

  __shared__ NumericT shared_offset;

  if (threadIdx.x == 0)
    shared_offset = carries[blockIdx.x];

  __syncthreads();

  // add the offset of this block to each element in the block:
  for (unsigned int i = block_start + threadIdx.x; i < block_stop; i += blockDim.x)
    if (i < sizeY)
      Y[i * incY + startY] += shared_offset;
}
 
  template<typename NumericT>
  void scan_impl(vector_base<NumericT> const & input,
                 vector_base<NumericT>       & output,
                 bool is_inclusive)
  {
                                                    static_cast<unsigned int>(is_inclusive ? 0 : 1),
                                                    viennacl::cuda_arg<NumericT>(cuda_carries)

    scan_kernel_2<<<1, block_num>>>(viennacl::cuda_arg<NumericT>(cuda_carries));

                                                    viennacl::cuda_arg<NumericT>(cuda_carries)
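// Illustrative sketch of the three-phase scan implemented above (assumptions: raw pointers, unit
// stride, and a carries buffer of length block_num; the original scan_impl() operates on
// vector_base<> handles instead):
//   phase 1: scan_kernel_1 scans each block's section and records the section total in 'carries',
//   phase 2: scan_kernel_2 turns 'carries' into exclusive prefix offsets,
//   phase 3: scan_kernel_3 adds each block's offset to every entry of its section.
template<typename NumericT>
void scan_raw_sketch(NumericT const * X, NumericT * Y, unsigned int n, NumericT * carries, bool inclusive)
{
  unsigned int block_num = 128, block_size = 256;
  scan_kernel_1<<<block_num, block_size>>>(X, 0u, 1u, n, Y, 0u, 1u,
                                           static_cast<unsigned int>(inclusive ? 0 : 1), carries);
  scan_kernel_2<<<1, block_num>>>(carries);
  scan_kernel_3<<<block_num, block_size>>>(Y, 0u, 1u, n, carries);
}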
 
template<typename NumericT>
void inclusive_scan(vector_base<NumericT> const & input, vector_base<NumericT> & output)
{
  detail::scan_impl(input, output, true);
}

template<typename NumericT>
void exclusive_scan(vector_base<NumericT> const & input, vector_base<NumericT> & output)
{
  detail::scan_impl(input, output, false);
}
 
vcl_size_t const_size() const 
__global__ void convert_kernel(DestNumericT *dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, SrcNumericT const *src, unsigned int start_src, unsigned int inc_src)
__global__ void vec_element_abs_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
void vector_assign(vector_base< NumericT > &vec1, ScalarT1 const &alpha, bool up_to_internal_size=false)
Assign a constant value to a vector (-range/-slice) 
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
__global__ void vector_sum_kernel_unsigned_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
void norm_2_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the l^2-norm of a vector - implementation. 
void convert(matrix_base< DestNumericT > &mat1, matrix_base< SrcNumericT > const &mat2)
__global__ void norm_kernel_floats(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, NumericT *group_results)
__global__ void vec_element_asin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Generic size and resize functionality for different vector and matrix types. 
__global__ void plane_rotation_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2, NumericT alpha, NumericT beta)
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, NumericT *group_results)
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) 
void av(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
__global__ void vector_multi_sum_kernel(NumericT const *vec1, NumericT *result, unsigned int start_result, unsigned int inc_result)
__global__ void vector_maxmin_kernel(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_fabs_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
void max_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the maximum of a vector, both reduction stages run on the GPU. 
This file provides the forward declarations for the main types used within ViennaCL. 
result_of::size_type< T >::type start1(T const &obj)
__global__ void norm_kernel_unsigned_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
Determines row and column increments for matrices and matrix proxies. 
void memory_read(mem_handle const &src_buffer, vcl_size_t src_offset, vcl_size_t bytes_to_read, void *ptr, bool async=false)
Reads data from a buffer back to main RAM. 
void exclusive_scan(vector_base< NumericT > const &input, vector_base< NumericT > &output)
This function implements an exclusive scan using CUDA. 
void norm_1_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the l^1-norm of a vector. 
T max(const T &lhs, const T &rhs)
Maximum. 
#define VIENNACL_MDOT_WORKGROUP_SIZE
An expression template class that represents a binary operation that yields a vector. 
__global__ void scan_kernel_2(NumericT *carries)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void vec_element_sqrt_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void avbv_v_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
void max_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the maximum of a vector, first reduction stage on the GPU, second stage on the CPU...
__global__ void vec_element_tanh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
#define VIENNACL_MDOT_WORKGROUP_NUM
void vector_swap(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
Swaps the contents of two vectors, data is copied. 
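Note that vector_swap() exchanges the contents by copying; it does not just swap the underlying memory handles. A minimal sketch through the generic dispatcher (namespace and header assumed from the usual ViennaCL layout):

#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"

void swap_contents(viennacl::vector<float> & a, viennacl::vector<float> & b)
{
  viennacl::linalg::vector_swap(a, b);   // element-wise exchange, backed by vector_swap_kernel
}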
__global__ void index_norm_inf_kernel(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int *result)
void scan_impl(vector_base< NumericT > const &input, vector_base< NumericT > &output, bool is_inclusive)
Worker routine for scan routines. 
__global__ void inner_prod_kernel(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2, NumericT *group_buffer)
__global__ void vec_element_tan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
result_of::size_type< T >::type start2(T const &obj)
void inclusive_scan(vector_base< NumericT > const &input, vector_base< NumericT > &output)
This function implements an inclusive scan using CUDA. 
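The two scan entry points differ only in whether element i of the output includes input element i. A small sketch of the expected prefix-sum semantics, assuming the viennacl::linalg namespace, the header path below, and the scalar_vector initializer listed further down this page:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"

void scan_demo()
{
  viennacl::vector<int> x = viennacl::scalar_vector<int>(4, 1);   // x = (1, 1, 1, 1)
  viennacl::vector<int> y(4);

  viennacl::linalg::inclusive_scan(x, y);   // y = (1, 2, 3, 4)
  viennacl::linalg::exclusive_scan(x, y);   // y = (0, 1, 2, 3)
}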
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like) 
Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from viennacl::tie(). 
void norm_inf_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the supremum-norm of a vector. 
void inner_prod_cpu(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, NumericT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2) instead. 
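A user-level call that ends up in inner_prod_impl / inner_prod_cpu; header and function taken from the standard ViennaCL BLAS level 1 examples:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"

double dot(viennacl::vector<double> const & x, viennacl::vector<double> const & y)
{
  return viennacl::linalg::inner_prod(x, y);   // reduction on the GPU, result returned to the host
}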
result_of::size_type< T >::type start(T const &obj)
__global__ void vec_element_atan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
void sum_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the sum of a vector, first reduction stage on the GPU, second stage on the CPU. 
__global__ void scan_kernel_3(NumericT *Y, unsigned int startY, unsigned int incY, unsigned int sizeY, NumericT const *carries)
Common base class for dense vectors, vector ranges, and vector slices. 
__global__ void vector_sum_kernel_floats(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void scan_kernel_1(NumericT const *X, unsigned int startX, unsigned int incX, unsigned int sizeX, NumericT *Y, unsigned int startY, unsigned int incY, unsigned int scan_offset, NumericT *carries)
void sum_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the sum of a vector, both reduction stages run on the GPU. 
void avbv_v(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
__global__ void vec_element_log_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_sum_kernel_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_cosh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
__global__ void av_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_acos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_ceil_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
void min_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the minimum of a vector, first reduction stage on the GPU, second stage on the CPU. 
__global__ void norm_kernel_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
void norm_1_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^1-norm of a vector. 
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vcl_size_t index_norm_inf(vector_base< NumericT > const &vec1)
Computes the index of the first entry that is equal to the supremum-norm in modulus. 
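The norm backends follow the same pattern; at the user level the convenience functions below are the usual entry points (headers as in the standard ViennaCL examples; the exact header providing index_norm_inf is assumed):

#include <cstddef>
#include "viennacl/vector.hpp"
#include "viennacl/linalg/norm_1.hpp"
#include "viennacl/linalg/norm_2.hpp"
#include "viennacl/linalg/norm_inf.hpp"

void norm_demo(viennacl::vector<double> const & v)
{
  double n1 = viennacl::linalg::norm_1(v);                 // sum of absolute values
  double n2 = viennacl::linalg::norm_2(v);                 // Euclidean norm
  double ni = viennacl::linalg::norm_inf(v);               // largest absolute value
  std::size_t pos = viennacl::linalg::index_norm_inf(v);   // index of that largest entry
  (void)n1; (void)n2; (void)ni; (void)pos;
}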
__global__ void element_op_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
__global__ void vec_element_exp_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc. 
void plane_rotation(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2, NumericT alpha, NumericT beta)
Computes a plane rotation of two vectors. 
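If memory serves, the update applied element-wise is the usual Givens rotation (x, y) <- (alpha*x + beta*y, -beta*x + alpha*y), matching BLAS *rot. A sketch with alpha = cos(phi) and beta = sin(phi), assuming the generic dispatcher in viennacl::linalg:

#include <cmath>
#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"

void rotate(viennacl::vector<double> & x, viennacl::vector<double> & y, double phi)
{
  double alpha = std::cos(phi);
  double beta  = std::sin(phi);
  viennacl::linalg::plane_rotation(x, y, alpha, beta);   // dispatches to plane_rotation_kernel
}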
void min_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the minimum of a vector, both reduction stages run on the GPU. 
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object. 
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
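The vec_element_*_kernel entries on this page back the element-wise convenience functions for vectors; element_op() above is the corresponding dispatcher for matrices. A sketch assuming the usual expression interface:

#include "viennacl/vector.hpp"

void elementwise_demo(viennacl::vector<double> const & a,
                      viennacl::vector<double> const & b,
                      viennacl::vector<double> & c)
{
  c = viennacl::linalg::element_prod(a, b);   // c[i] = a[i] * b[i]
  c = viennacl::linalg::element_div(a, b);    // c[i] = a[i] / b[i]
  c = viennacl::linalg::element_exp(a);       // c[i] = exp(a[i])
  c = viennacl::linalg::element_fabs(a);      // c[i] = |a[i]|
}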
Common routines for CUDA execution. 
void inner_prod_impl(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, ScalarT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2) instead. 
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for vector objects. 
__global__ void vector_swap_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2)
void norm_inf_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the supremum-norm of a vector. 
__global__ void avbv_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
size_type size() const 
Returns the length of the vector (cf. std::vector) 
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, NumericT *group_results)
__global__ void vec_element_sin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Main abstraction class for multiple memory domains. Represents a buffer in main RAM, an OpenCL context, or on a CUDA device. 
VectorType const & const_at(vcl_size_t i) const 
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
__global__ void vec_element_cos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_assign_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int internal_size1, NumericT alpha)
void memory_create(mem_handle &handle, vcl_size_t size_in_bytes, viennacl::context const &ctx, const void *host_ptr=NULL)
Creates an array of the specified size. If a host pointer is provided, the buffer is initialized with the data it points to. 
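A raw-buffer round trip using the two backend routines documented on this page, with the signatures as listed above (the viennacl::backend namespace and the header paths are assumed):

#include <vector>
#include "viennacl/context.hpp"
#include "viennacl/backend/memory.hpp"

void roundtrip(viennacl::context const & ctx)
{
  std::vector<float> host_in(16, 3.14f), host_out(16);

  viennacl::backend::mem_handle buffer;
  viennacl::backend::memory_create(buffer, sizeof(float) * host_in.size(), ctx, &host_in[0]);   // allocate and initialize
  viennacl::backend::memory_read(buffer, 0, sizeof(float) * host_out.size(), &host_out[0]);     // copy back to main RAM
}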
__global__ void vec_element_log10_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. 
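cuda_arg() is what makes it possible to hand ViennaCL objects to hand-written kernels. A sketch only; the vector overload and the exact namespace of cuda_arg differ between ViennaCL versions, so treat the calls below as assumptions:

#include "viennacl/vector.hpp"

template<typename NumericT>
__global__ void my_axpy_kernel(NumericT * x, NumericT const * y, NumericT a, unsigned int n)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
    x[i] += a * y[i];
}

void run_custom_kernel(viennacl::vector<float> & x, viennacl::vector<float> const & y)
{
  my_axpy_kernel<<<128, 128>>>(viennacl::cuda_arg(x),
                               viennacl::cuda_arg(y),
                               2.0f,
                               static_cast<unsigned int>(x.size()));
}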
T min(const T &lhs, const T &rhs)
Minimum. 
size_type internal_size() const 
Returns the internal length of the vector, which is given by size() plus the extra memory due to padding. 
iterator end()
Returns an iterator pointing to the end of the vector (STL like) 
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multiplication). 
__global__ void vec_element_floor_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Implementation of the ViennaCL scalar class. 
void norm_2_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^2-norm of a vector - implementation. 
void avbv(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
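avbv() is the fused backend for two-operand updates, and avbv_v() is its accumulating counterpart; the expression-template layer maps a whole statement onto a single kernel launch. A sketch assuming the standard vector expression interface:

#include "viennacl/vector.hpp"

void fused_update(viennacl::vector<double> & v1,
                  viennacl::vector<double> const & v2,
                  viennacl::vector<double> const & v3)
{
  double alpha = 2.0, beta = -0.5;

  v1  = alpha * v2 + beta * v3;   // handled by avbv (overwrite v1)
  v1 += alpha * v2 + beta * v3;   // handled by avbv_v (accumulate into v1)
}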
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, const NumericT *y4, unsigned int start4, unsigned int stride4, const NumericT *y5, unsigned int start5, unsigned int stride5, const NumericT *y6, unsigned int start6, unsigned int stride6, const NumericT *y7, unsigned int start7, unsigned int stride7, NumericT *group_results)
__global__ void vec_element_sinh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
__device__ NumericT cuda_abs(NumericT val)
Simple enable-if variant that uses the SFINAE pattern. 
NumericT min(std::vector< NumericT > const &v1)
__global__ void element_op_int_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)
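The fast_copy() overload above copies from a device vector range to a contiguous host range without per-element conversion; the reverse direction is available as well. A sketch assuming the usual iterator-based interface:

#include <vector>
#include "viennacl/vector.hpp"

void download(viennacl::vector<float> const & gpu_vec, std::vector<float> & cpu_vec)
{
  cpu_vec.resize(gpu_vec.size());
  viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
}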