1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_ 
   // Element-wise copy between two strided sub-matrices, templated on
   // DestNumericT and SrcNumericT. B is declared `const SrcNumericT *`; the
   // declaration of A is not visible (presumably `DestNumericT *` - TODO confirm),
   // so any type conversion would happen implicitly in the assignment.
   //
   // For every (row, col) with row < A_size1 and col < A_size2:
   //   A[(row*A_inc1 + A_start1)*A_internal_size2 + col*A_inc2 + A_start2]
   //     = B[(row*B_inc1 + B_start1)*B_internal_size2 + col*B_inc2 + B_start2];
   // i.e. the row index is scaled by *_internal_size2 in both operands.
   //
   // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x
   // and steps by gridDim.x; col starts at the same value % blockDim.x and
   // steps by blockDim.x.
   // A_internal_size1 and B_internal_size1 are not used by the visible statements.
   //
   // NOTE(review): the numbers embedded in this listing skip (33 -> 36, 39 -> 41,
   // 44 -> 46), so the function name/qualifier line, the A pointer parameter and
   // the braces are not visible here - confirm against the original header.
   33 template<
typename DestNumericT, 
typename SrcNumericT>
 
   36           unsigned int A_start1, 
unsigned int A_start2,
 
   37           unsigned int A_inc1,   
unsigned int A_inc2,
 
   38           unsigned int A_size1,  
unsigned int A_size2,
 
   39           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
   41           const SrcNumericT * B,
 
   42           unsigned int B_start1, 
unsigned int B_start2,
 
   43           unsigned int B_inc1,   
unsigned int B_inc2,
 
   44           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
   46   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
   47   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
   49   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
   50     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
   51       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
 
   // Transposed copy between two strided sub-matrices, templated on NumericT:
   // the element of A addressed by (row, col) is written to the element of B
   // addressed by (col, row), for row < A_size1 and col < A_size2.
   //
   // Work distribution: row starts at blockIdx.x and steps by gridDim.x;
   // col starts at threadIdx.x and steps by blockDim.x.
   //
   // Two addressing variants are visible:
   //   - first statement:  first index is scaled by *_internal_size2
   //       B[(B_start1 + B_stride1*col)*B_internal_size2 + (B_start2 + B_stride2*row)]
   //   - second statement: second index is scaled by *_internal_size1
   //       B[(B_start1 + B_stride1*col) + (B_start2 + B_stride2*row)*B_internal_size1]
   //
   // NOTE(review): the condition that selects between the two statements is not
   // visible (embedded numbers skip 71 -> 74 -> 76); presumably an if/else on a
   // layout flag - confirm against the original header. The function name, the
   // A/B pointer parameters, the end of the parameter list and the braces are
   // also missing from this view.
   55 template<
typename NumericT>
 
   58           unsigned int A_start1,          
unsigned int A_start2,
 
   59           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
   60           unsigned int A_size1,           
unsigned int A_size2,
 
   61           unsigned int A_stride1,         
unsigned int A_stride2,
 
   64           unsigned int B_start1,          
unsigned int B_start2,
 
   65           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
   66           unsigned int B_stride1,         
unsigned int B_stride2,
 
   69   for(
unsigned int row = blockIdx.x; 
row<A_size1; 
row+=gridDim.x)
 
   71     for(
unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
 
   74         B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * 
row)] = A[(A_start1 + A_stride1 * 
row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
 
   76         B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * 
row) + (A_start2 + A_stride2 * col) * A_internal_size1];
 
   // Scaled copy between strided sub-matrices, templated on NumericT: each
   // element of A is set to the matching element of B divided by alpha (first
   // loop nest) or multiplied by alpha (second loop nest).
   //
   // Addressing (row index scaled by *_internal_size2):
   //   A[(row*A_inc1 + A_start1)*A_internal_size2 + col*A_inc2 + A_start2]
   //   B[(row*B_inc1 + B_start1)*B_internal_size2 + col*B_inc2 + B_start2]
   // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x
   // and steps by gridDim.x; col starts at the same value % blockDim.x and
   // steps by blockDim.x; loop bounds are A_size1 and A_size2.
   //
   // options2 flags: bit 0 is tested first, but the statement it guards is not
   // visible here (presumably negates alpha - TODO confirm); bit 1 guards the
   // dividing loop nest, the multiplying one presumably being its else-branch.
   //
   // NOTE(review): the numbers embedded in this listing skip (86 -> 89, 92 -> 95,
   // 95 -> 97), so the function name, the A/B pointer and scalar parameters, the
   // declaration of alpha, the braces and the `else` are missing from this view.
   86 template<
typename NumericT>
 
   89           unsigned int A_start1, 
unsigned int A_start2,
 
   90           unsigned int A_inc1,   
unsigned int A_inc2,
 
   91           unsigned int A_size1,  
unsigned int A_size2,
 
   92           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
   95           unsigned int options2,
 
   97           unsigned int B_start1, 
unsigned int B_start2,
 
   98           unsigned int B_inc1,   
unsigned int B_inc2,
 
   99           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  102   if (options2 & (1 << 0))
 
  105   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  106   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  108   if (options2 & (1 << 1))
 
  110     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  111       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  112         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
 
  116     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  117       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  118         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
 
  // Scaled copy between strided sub-matrices, templated on NumericT: A is set
  // element-wise to B / alpha (first loop nest) or B * alpha (second loop nest).
  // The visible text matches the preceding fragment; presumably this is an
  // overload that receives its scalar differently - TODO confirm, the scalar
  // parameter is not visible.
  //
  // Addressing (row index scaled by *_internal_size2):
  //   A[(row*A_inc1 + A_start1)*A_internal_size2 + col*A_inc2 + A_start2]
  //   B[(row*B_inc1 + B_start1)*B_internal_size2 + col*B_inc2 + B_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x
  // and steps by gridDim.x; col starts at the same value % blockDim.x and
  // steps by blockDim.x; loop bounds are A_size1 and A_size2.
  //
  // options2 flags: bit 0 is tested first, the guarded statement is not visible
  // (presumably negates alpha - TODO confirm); bit 1 guards the dividing loop
  // nest, the multiplying one presumably being its else-branch.
  //
  // NOTE(review): the embedded listing numbers skip (123 -> 126, 129 -> 132,
  // 132 -> 134); function name, pointer/scalar parameters, the declaration of
  // alpha, braces and `else` are missing from this view.
  123 template<
typename NumericT>
 
  126           unsigned int A_start1, 
unsigned int A_start2,
 
  127           unsigned int A_inc1,   
unsigned int A_inc2,
 
  128           unsigned int A_size1,  
unsigned int A_size2,
 
  129           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  132           unsigned int options2,
 
  134           unsigned int B_start1, 
unsigned int B_start2,
 
  135           unsigned int B_inc1,   
unsigned int B_inc2,
 
  136           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  139   if (options2 & (1 << 0))
 
  142   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  143   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  145   if (options2 & (1 << 1))
 
  147     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  148       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  149         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
 
  153     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  154       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  155         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
 
  // Element-wise A = (B op alpha) + (C op beta) on strided sub-matrices,
  // templated on NumericT, where each `op` is either division or multiplication.
  //
  // Four loop nests are visible, in this order:
  //   1. B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. B / alpha + C * beta
  //   3. B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. B * alpha + C * beta
  // Nests 2 and 4, and the pair 3/4 as a whole, are presumably else-branches;
  // the `else` lines and braces are not visible - TODO confirm.
  //
  // Bit 0 of options2 and of options3 is tested before the loops, but the
  // guarded statements are not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C} (row index scaled by X_internal_size2):
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x
  // and steps by gridDim.x; col starts at the same value % blockDim.x and
  // steps by blockDim.x; loop bounds are A_size1 and A_size2.
  //
  // NOTE(review): the embedded listing numbers skip (165 -> 168, 171 -> 174,
  // 178 -> 181, ...); function name, pointer and scalar parameters and the
  // declarations of alpha and beta are missing from this view.
  165 template<
typename NumericT>
 
  168           unsigned int A_start1, 
unsigned int A_start2,
 
  169           unsigned int A_inc1,   
unsigned int A_inc2,
 
  170           unsigned int A_size1,  
unsigned int A_size2,
 
  171           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  174           unsigned int options2,
 
  176           unsigned int B_start1, 
unsigned int B_start2,
 
  177           unsigned int B_inc1,   
unsigned int B_inc2,
 
  178           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  181           unsigned int options3,
 
  183           unsigned int C_start1, 
unsigned int C_start2,
 
  184           unsigned int C_inc1,   
unsigned int C_inc2,
 
  185           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  188   if (options2 & (1 << 0))
 
  192   if (options3 & (1 << 0))
 
  195   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  196   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  198   if (options2 & (1 << 1))
 
  200     if (options3 & (1 << 1))
 
  202       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  203         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  204           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  205         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  206         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  210       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  211         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  212           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  213         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  214         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  219     if (options3 & (1 << 1))
 
  221       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  222         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  223           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  224         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  225         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  229       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  230         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  231           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  232         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  233         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise A = (B op alpha) + (C op beta) on strided sub-matrices,
  // templated on NumericT; each `op` is division or multiplication.
  // The visible text matches the neighbouring fragments of the same shape;
  // presumably the overloads differ only in how the two scalars are passed -
  // TODO confirm, the scalar parameters are not visible.
  //
  // Visible loop nests, in order:
  //   1. B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. B / alpha + C * beta
  //   3. B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. B * alpha + C * beta
  // The `else` lines and braces tying these together are not visible.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C}:
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x.
  //
  // NOTE(review): embedded listing numbers skip (240 -> 243, 246 -> 249, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  240 template<
typename NumericT>
 
  243           unsigned int A_start1, 
unsigned int A_start2,
 
  244           unsigned int A_inc1,   
unsigned int A_inc2,
 
  245           unsigned int A_size1,  
unsigned int A_size2,
 
  246           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  249           unsigned int options2,
 
  251           unsigned int B_start1, 
unsigned int B_start2,
 
  252           unsigned int B_inc1,   
unsigned int B_inc2,
 
  253           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  256           unsigned int options3,
 
  258           unsigned int C_start1, 
unsigned int C_start2,
 
  259           unsigned int C_inc1,   
unsigned int C_inc2,
 
  260           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  263   if (options2 & (1 << 0))
 
  267   if (options3 & (1 << 0))
 
  270   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  271   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  273   if (options2 & (1 << 1))
 
  275     if (options3 & (1 << 1))
 
  277       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  278         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  279           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  280         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  281         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  285       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  286         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  287           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  288         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  289         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  294     if (options3 & (1 << 1))
 
  296       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  297         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  298           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  299         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  300         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  304       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  305         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  306           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  307         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  308         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise A = (B op alpha) + (C op beta) on strided sub-matrices,
  // templated on NumericT; each `op` is division or multiplication.
  // The visible text matches the neighbouring fragments of the same shape;
  // presumably the overloads differ only in how the two scalars are passed -
  // TODO confirm, the scalar parameters are not visible.
  //
  // Visible loop nests, in order:
  //   1. B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. B / alpha + C * beta
  //   3. B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. B * alpha + C * beta
  // The `else` lines and braces tying these together are not visible.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C}:
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x.
  //
  // NOTE(review): embedded listing numbers skip (314 -> 317, 320 -> 323, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  314 template<
typename NumericT>
 
  317           unsigned int A_start1, 
unsigned int A_start2,
 
  318           unsigned int A_inc1,   
unsigned int A_inc2,
 
  319           unsigned int A_size1,  
unsigned int A_size2,
 
  320           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  323           unsigned int options2,
 
  325           unsigned int B_start1, 
unsigned int B_start2,
 
  326           unsigned int B_inc1,   
unsigned int B_inc2,
 
  327           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  330           unsigned int options3,
 
  332           unsigned int C_start1, 
unsigned int C_start2,
 
  333           unsigned int C_inc1,   
unsigned int C_inc2,
 
  334           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  337   if (options2 & (1 << 0))
 
  341   if (options3 & (1 << 0))
 
  344   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  345   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  347   if (options2 & (1 << 1))
 
  349     if (options3 & (1 << 1))
 
  351       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  352         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  353           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  354         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  355         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  359       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  360         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  361           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  362         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  363         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  368     if (options3 & (1 << 1))
 
  370       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  371         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  372           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  373         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  374         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  378       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  379         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  380           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  381         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  382         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise A = (B op alpha) + (C op beta) on strided sub-matrices,
  // templated on NumericT; each `op` is division or multiplication.
  // The visible text matches the neighbouring fragments of the same shape;
  // presumably the overloads differ only in how the two scalars are passed -
  // TODO confirm, the scalar parameters are not visible.
  //
  // Visible loop nests, in order:
  //   1. B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. B / alpha + C * beta
  //   3. B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. B * alpha + C * beta
  // The `else` lines and braces tying these together are not visible.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C}:
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x.
  //
  // NOTE(review): embedded listing numbers skip (389 -> 392, 395 -> 398, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  389 template<
typename NumericT>
 
  392           unsigned int A_start1, 
unsigned int A_start2,
 
  393           unsigned int A_inc1,   
unsigned int A_inc2,
 
  394           unsigned int A_size1,  
unsigned int A_size2,
 
  395           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  398           unsigned int options2,
 
  400           unsigned int B_start1, 
unsigned int B_start2,
 
  401           unsigned int B_inc1,   
unsigned int B_inc2,
 
  402           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  405           unsigned int options3,
 
  407           unsigned int C_start1, 
unsigned int C_start2,
 
  408           unsigned int C_inc1,   
unsigned int C_inc2,
 
  409           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  412   if (options2 & (1 << 0))
 
  416   if (options3 & (1 << 0))
 
  419   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  420   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  422   if (options2 & (1 << 1))
 
  424     if (options3 & (1 << 1))
 
  426       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  427         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  428           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  429         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  430         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  434       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  435         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  436           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  437         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  438         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  443     if (options3 & (1 << 1))
 
  445       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  446         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  447           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  448         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  449         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  453       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  454         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  455           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  456         = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  457         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise accumulation A += (B op alpha) + (C op beta) on strided
  // sub-matrices, templated on NumericT; each `op` is division or
  // multiplication. Unlike a plain assignment, A is read and updated (`+=`).
  //
  // Visible loop nests, in order:
  //   1. A += B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. A += B / alpha + C * beta
  //   3. A += B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. A += B * alpha + C * beta
  // Nests 2 and 4, and the pair 3/4 as a whole, are presumably else-branches;
  // the `else` lines and braces are not visible - TODO confirm.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C} (row index scaled by X_internal_size2):
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x;
  // loop bounds are A_size1 and A_size2.
  //
  // NOTE(review): embedded listing numbers skip (468 -> 471, 474 -> 477, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  468 template<
typename NumericT>
 
  471           unsigned int A_start1, 
unsigned int A_start2,
 
  472           unsigned int A_inc1,   
unsigned int A_inc2,
 
  473           unsigned int A_size1,  
unsigned int A_size2,
 
  474           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  477           unsigned int options2,
 
  479           unsigned int B_start1, 
unsigned int B_start2,
 
  480           unsigned int B_inc1,   
unsigned int B_inc2,
 
  481           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  484           unsigned int options3,
 
  486           unsigned int C_start1, 
unsigned int C_start2,
 
  487           unsigned int C_inc1,   
unsigned int C_inc2,
 
  488           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  491   if (options2 & (1 << 0))
 
  495   if (options3 & (1 << 0))
 
  498   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  499   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  501   if (options2 & (1 << 1))
 
  503     if (options3 & (1 << 1))
 
  505       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  506         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  507           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  508        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  509         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  513       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  514         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  515           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  516        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  517         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  522     if (options3 & (1 << 1))
 
  524       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  525         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  526           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  527        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  528         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  532       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  533         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  534           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  535        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  536         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise accumulation A += (B op alpha) + (C op beta) on strided
  // sub-matrices, templated on NumericT; each `op` is division or
  // multiplication. The visible text matches the neighbouring `+=` fragments;
  // presumably the overloads differ only in how the two scalars are passed -
  // TODO confirm, the scalar parameters are not visible.
  //
  // Visible loop nests, in order:
  //   1. A += B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. A += B / alpha + C * beta
  //   3. A += B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. A += B * alpha + C * beta
  // The `else` lines and braces tying these together are not visible.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C}:
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x.
  //
  // NOTE(review): embedded listing numbers skip (543 -> 546, 549 -> 552, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  543 template<
typename NumericT>
 
  546           unsigned int A_start1, 
unsigned int A_start2,
 
  547           unsigned int A_inc1,   
unsigned int A_inc2,
 
  548           unsigned int A_size1,  
unsigned int A_size2,
 
  549           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  552           unsigned int options2,
 
  554           unsigned int B_start1, 
unsigned int B_start2,
 
  555           unsigned int B_inc1,   
unsigned int B_inc2,
 
  556           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  559           unsigned int options3,
 
  561           unsigned int C_start1, 
unsigned int C_start2,
 
  562           unsigned int C_inc1,   
unsigned int C_inc2,
 
  563           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  566   if (options2 & (1 << 0))
 
  570   if (options3 & (1 << 0))
 
  573   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  574   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  576   if (options2 & (1 << 1))
 
  578     if (options3 & (1 << 1))
 
  580       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  581         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  582           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  583        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  584         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  588       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  589         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  590           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  591        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  592         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  597     if (options3 & (1 << 1))
 
  599       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  600         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  601           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  602        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  603         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  607       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  608         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  609           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  610        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  611         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise accumulation A += (B op alpha) + (C op beta) on strided
  // sub-matrices, templated on NumericT; each `op` is division or
  // multiplication. The visible text matches the neighbouring `+=` fragments;
  // presumably the overloads differ only in how the two scalars are passed -
  // TODO confirm, the scalar parameters are not visible.
  //
  // Visible loop nests, in order:
  //   1. A += B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. A += B / alpha + C * beta
  //   3. A += B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. A += B * alpha + C * beta
  // The `else` lines and braces tying these together are not visible.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C}:
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x.
  //
  // NOTE(review): embedded listing numbers skip (617 -> 620, 623 -> 626, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  617 template<
typename NumericT>
 
  620           unsigned int A_start1, 
unsigned int A_start2,
 
  621           unsigned int A_inc1,   
unsigned int A_inc2,
 
  622           unsigned int A_size1,  
unsigned int A_size2,
 
  623           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  626           unsigned int options2,
 
  628           unsigned int B_start1, 
unsigned int B_start2,
 
  629           unsigned int B_inc1,   
unsigned int B_inc2,
 
  630           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  633           unsigned int options3,
 
  635           unsigned int C_start1, 
unsigned int C_start2,
 
  636           unsigned int C_inc1,   
unsigned int C_inc2,
 
  637           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  640   if (options2 & (1 << 0))
 
  644   if (options3 & (1 << 0))
 
  647   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  648   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  650   if (options2 & (1 << 1))
 
  652     if (options3 & (1 << 1))
 
  654       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  655         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  656           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  657        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  658         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  662       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  663         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  664           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  665        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  666         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  671     if (options3 & (1 << 1))
 
  673       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  674         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  675           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  676        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  677         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  681       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  682         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  683           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  684        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  685         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Element-wise accumulation A += (B op alpha) + (C op beta) on strided
  // sub-matrices, templated on NumericT; each `op` is division or
  // multiplication. The visible text matches the neighbouring `+=` fragments;
  // presumably the overloads differ only in how the two scalars are passed -
  // TODO confirm, the scalar parameters are not visible.
  //
  // Visible loop nests, in order:
  //   1. A += B / alpha + C / beta   (under options2 bit 1 and options3 bit 1)
  //   2. A += B / alpha + C * beta
  //   3. A += B * alpha + C / beta   (under the second options3 bit-1 test)
  //   4. A += B * alpha + C * beta
  // The `else` lines and braces tying these together are not visible.
  // Bit 0 of options2 / options3 is tested first; the guarded statements are
  // not visible (presumably sign flips of alpha / beta).
  //
  // Addressing for X in {A, B, C}:
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x,
  // step gridDim.x; col starts at the same value % blockDim.x, step blockDim.x.
  //
  // NOTE(review): embedded listing numbers skip (692 -> 695, 698 -> 701, ...);
  // function name, pointer/scalar parameters and alpha/beta declarations are
  // missing from this view.
  692 template<
typename NumericT>
 
  695           unsigned int A_start1, 
unsigned int A_start2,
 
  696           unsigned int A_inc1,   
unsigned int A_inc2,
 
  697           unsigned int A_size1,  
unsigned int A_size2,
 
  698           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  701           unsigned int options2,
 
  703           unsigned int B_start1, 
unsigned int B_start2,
 
  704           unsigned int B_inc1,   
unsigned int B_inc2,
 
  705           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  708           unsigned int options3,
 
  710           unsigned int C_start1, 
unsigned int C_start2,
 
  711           unsigned int C_inc1,   
unsigned int C_inc2,
 
  712           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  715   if (options2 & (1 << 0))
 
  719   if (options3 & (1 << 0))
 
  722   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  723   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  725   if (options2 & (1 << 1))
 
  727     if (options3 & (1 << 1))
 
  729       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  730         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  731           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  732        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  733         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  737       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  738         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  739           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  740        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
 
  741         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  746     if (options3 & (1 << 1))
 
  748       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  749         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  750           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  751        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  752         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
 
  756       for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  757         for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  758           A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  759        += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
 
  760         + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
 
  // Fills a strided sub-matrix with a single value, templated on NumericT:
  // for every (row, col) with row < A_size1 and col < A_size2,
  //   A[(row*A_inc1 + A_start1)*A_internal_size2 + col*A_inc2 + A_start2] = alpha;
  //
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x
  // and steps by gridDim.x; col starts at the same value % blockDim.x and
  // steps by blockDim.x.
  // A_internal_size1 is not used by the visible statements.
  //
  // NOTE(review): embedded listing numbers skip (769 -> 772, 775 -> 778); the
  // function name, the A pointer parameter, the parameter supplying alpha and
  // the braces are missing from this view - confirm against the original header.
  769 template<
typename NumericT>
 
  772           unsigned int A_start1, 
unsigned int A_start2,
 
  773           unsigned int A_inc1,   
unsigned int A_inc2,
 
  774           unsigned int A_size1,  
unsigned int A_size2,
 
  775           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  778   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  779   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  781   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  782     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  783       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
 
  // Writes a single value to the diagonal of a strided sub-matrix, templated
  // on NumericT: the same index `row` is used for both coordinates,
  //   A[(row*A_inc1 + A_start1)*A_internal_size2 + row*A_inc2 + A_start2] = alpha;
  //
  // Work distribution: each thread starts at its flat index
  // blockIdx.x*blockDim.x + threadIdx.x and advances by blockDim.x*gridDim.x,
  // i.e. by the total number of threads in a 1-D launch.
  //
  // NOTE(review): the loop is bounded by A_size1 only; the column coordinate
  // (also `row`) is never compared against A_size2, so this looks safe only
  // when A_size1 <= A_size2 - confirm what callers guarantee.
  // NOTE(review): embedded listing numbers skip (787 -> 790, 793 -> 796); the
  // function name, the A pointer parameter, the parameter supplying alpha and
  // the braces are missing from this view.
  787 template<
typename NumericT>
 
  790           unsigned int A_start1, 
unsigned int A_start2,
 
  791           unsigned int A_inc1,   
unsigned int A_inc2,
 
  792           unsigned int A_size1,  
unsigned int A_size2,
 
  793           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  796   unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
 
  798   for (
unsigned int row = gid; 
row < A_size1; 
row += blockDim.x * gridDim.x)
 
  799     A[(
row * A_inc1 + A_start1) * A_internal_size2 + 
row * A_inc2 + A_start2] = alpha;
 
  // Element-wise binary operation A = f(B, C) on strided sub-matrices,
  // templated on NumericT, with the operation selected by op_type:
  //   - first loop nest:  A = pow(B, C)   (its guarding condition is not
  //     visible; presumably `op_type == 2` - TODO confirm)
  //   - op_type == 1:     A = B / C
  //   - op_type == 0:     A = B * C
  // No branch is visible for other op_type values.
  //
  // Addressing for X in {A, B, C} (row index scaled by X_internal_size2):
  //   X[(row*X_inc1 + X_start1)*X_internal_size2 + col*X_inc2 + X_start2]
  // Work distribution: row starts at (blockIdx.x*blockDim.x + threadIdx.x) / blockDim.x
  // and steps by gridDim.x; col starts at the same value % blockDim.x and
  // steps by blockDim.x; loop bounds are A_size1 and A_size2.
  //
  // NOTE(review): embedded listing numbers skip (806 -> 809, 812 -> 815,
  // 827 -> 831, ...); the function name, the A/B/C pointer parameters, the
  // first `if` and the braces are missing from this view.
  806 template<
typename NumericT>
 
  809           unsigned int A_start1, 
unsigned int A_start2,
 
  810           unsigned int A_inc1,   
unsigned int A_inc2,
 
  811           unsigned int A_size1,  
unsigned int A_size2,
 
  812           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  815           unsigned int B_start1, 
unsigned int B_start2,
 
  816           unsigned int B_inc1,   
unsigned int B_inc2,
 
  817           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  820           unsigned int C_start1, 
unsigned int C_start2,
 
  821           unsigned int C_inc1,   
unsigned int C_inc2,
 
  822           unsigned int C_internal_size1,  
unsigned int C_internal_size2,
 
  824           unsigned int op_type) 
 
  826   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  827   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  831     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  832       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  833         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  834       = pow(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
 
  835             C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
 
  837   else if (op_type == 1)
 
  839     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  840       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  841         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  842       = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
 
  843       / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
 
  845   else if (op_type == 0)
 
  847     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  848       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  849         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  850       = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
 
  851       * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
 
  855 template<
typename NumericT>
 
  858           unsigned int A_start1, 
unsigned int A_start2,
 
  859           unsigned int A_inc1,   
unsigned int A_inc2,
 
  860           unsigned int A_size1,  
unsigned int A_size2,
 
  861           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  864           unsigned int B_start1, 
unsigned int B_start2,
 
  865           unsigned int B_inc1,   
unsigned int B_inc2,
 
  866           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  869           unsigned int C_start1, 
unsigned int C_start2,
 
  870           unsigned int C_inc1,   
unsigned int C_inc2,
 
  871           unsigned int C_internal_size1,  
unsigned int C_internal_size2,
 
  873           unsigned int op_type) 
 
  875   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  876   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  880     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  881       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  882         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  883       = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
 
  884       / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
 
  886   else if (op_type == 0)
 
  888     for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  889       for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  890         A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
 
  891       = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
 
  892       * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
 
  901 template<
typename NumericT>
 
  904           unsigned int A_start1, 
unsigned int A_start2,
 
  905           unsigned int A_inc1,   
unsigned int A_inc2,
 
  906           unsigned int A_size1,  
unsigned int A_size2,
 
  907           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  910           unsigned int B_start1, 
unsigned int B_start2,
 
  911           unsigned int B_inc1,   
unsigned int B_inc2,
 
  912           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  914   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  915   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  917   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  918     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  919       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
  924 template<
typename NumericT>
 
  927           unsigned int A_start1, 
unsigned int A_start2,
 
  928           unsigned int A_inc1,   
unsigned int A_inc2,
 
  929           unsigned int A_size1,  
unsigned int A_size2,
 
  930           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  933           unsigned int B_start1, 
unsigned int B_start2,
 
  934           unsigned int B_inc1,   
unsigned int B_inc2,
 
  935           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  937   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  938   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  940   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  941     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  942       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
  947 template<
typename NumericT>
 
  950           unsigned int A_start1, 
unsigned int A_start2,
 
  951           unsigned int A_inc1,   
unsigned int A_inc2,
 
  952           unsigned int A_size1,  
unsigned int A_size2,
 
  953           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  956           unsigned int B_start1, 
unsigned int B_start2,
 
  957           unsigned int B_inc1,   
unsigned int B_inc2,
 
  958           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  960   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  961   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  963   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  964     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  965       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
  970 template<
typename NumericT>
 
  973           unsigned int A_start1, 
unsigned int A_start2,
 
  974           unsigned int A_inc1,   
unsigned int A_inc2,
 
  975           unsigned int A_size1,  
unsigned int A_size2,
 
  976           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  979           unsigned int B_start1, 
unsigned int B_start2,
 
  980           unsigned int B_inc1,   
unsigned int B_inc2,
 
  981           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  983   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  984   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  986   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
  987     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
  988       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
  993 template<
typename NumericT>
 
  996           unsigned int A_start1, 
unsigned int A_start2,
 
  997           unsigned int A_inc1,   
unsigned int A_inc2,
 
  998           unsigned int A_size1,  
unsigned int A_size2,
 
  999           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1002           unsigned int B_start1, 
unsigned int B_start2,
 
 1003           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1004           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1006   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1007   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1009   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1010     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1011       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1016 template<
typename NumericT>
 
 1019           unsigned int A_start1, 
unsigned int A_start2,
 
 1020           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1021           unsigned int A_size1,  
unsigned int A_size2,
 
 1022           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1025           unsigned int B_start1, 
unsigned int B_start2,
 
 1026           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1027           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1029   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1030   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1032   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1033     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1034       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1039 template<
typename NumericT>
 
 1042           unsigned int A_start1, 
unsigned int A_start2,
 
 1043           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1044           unsigned int A_size1,  
unsigned int A_size2,
 
 1045           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1048           unsigned int B_start1, 
unsigned int B_start2,
 
 1049           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1050           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1052   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1053   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1055   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1056     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1057       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1062 template<
typename NumericT>
 
 1065           unsigned int A_start1, 
unsigned int A_start2,
 
 1066           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1067           unsigned int A_size1,  
unsigned int A_size2,
 
 1068           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1071           unsigned int B_start1, 
unsigned int B_start2,
 
 1072           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1073           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1075   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1076   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1078   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1079     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1080       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1085 template<
typename NumericT>
 
 1088           unsigned int A_start1, 
unsigned int A_start2,
 
 1089           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1090           unsigned int A_size1,  
unsigned int A_size2,
 
 1091           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1094           unsigned int B_start1, 
unsigned int B_start2,
 
 1095           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1096           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1098   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1099   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1101   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1102     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1103       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1108 template<
typename NumericT>
 
 1111           unsigned int A_start1, 
unsigned int A_start2,
 
 1112           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1113           unsigned int A_size1,  
unsigned int A_size2,
 
 1114           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1117           unsigned int B_start1, 
unsigned int B_start2,
 
 1118           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1119           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1121   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1122   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1124   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1125     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1126       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1131 template<
typename NumericT>
 
 1134           unsigned int A_start1, 
unsigned int A_start2,
 
 1135           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1136           unsigned int A_size1,  
unsigned int A_size2,
 
 1137           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1140           unsigned int B_start1, 
unsigned int B_start2,
 
 1141           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1142           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1144   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1145   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1147   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1148     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1149       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1154 template<
typename NumericT>
 
 1157           unsigned int A_start1, 
unsigned int A_start2,
 
 1158           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1159           unsigned int A_size1,  
unsigned int A_size2,
 
 1160           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1163           unsigned int B_start1, 
unsigned int B_start2,
 
 1164           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1165           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1167   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1168   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1170   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1171     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1172       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1177 template<
typename NumericT>
 
 1180           unsigned int A_start1, 
unsigned int A_start2,
 
 1181           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1182           unsigned int A_size1,  
unsigned int A_size2,
 
 1183           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1186           unsigned int B_start1, 
unsigned int B_start2,
 
 1187           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1188           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1190   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1191   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1193   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1194     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1195       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1200 template<
typename NumericT>
 
 1203           unsigned int A_start1, 
unsigned int A_start2,
 
 1204           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1205           unsigned int A_size1,  
unsigned int A_size2,
 
 1206           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1209           unsigned int B_start1, 
unsigned int B_start2,
 
 1210           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1211           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1213   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1214   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1216   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1217     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1218       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1223 template<
typename NumericT>
 
 1226           unsigned int A_start1, 
unsigned int A_start2,
 
 1227           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1228           unsigned int A_size1,  
unsigned int A_size2,
 
 1229           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1232           unsigned int B_start1, 
unsigned int B_start2,
 
 1233           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1234           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1236   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1237   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1239   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1240     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1241       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1246 template<
typename NumericT>
 
 1249           unsigned int A_start1, 
unsigned int A_start2,
 
 1250           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1251           unsigned int A_size1,  
unsigned int A_size2,
 
 1252           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1255           unsigned int B_start1, 
unsigned int B_start2,
 
 1256           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1257           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1259   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1260   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1262   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1263     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1264       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1269 template<
typename NumericT>
 
 1272           unsigned int A_start1, 
unsigned int A_start2,
 
 1273           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1274           unsigned int A_size1,  
unsigned int A_size2,
 
 1275           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1278           unsigned int B_start1, 
unsigned int B_start2,
 
 1279           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1280           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1282   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1283   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1285   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1286     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1287       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
 
 1296 template<
typename NumericT>
 
 1299           unsigned int A_row_start,
 
 1300           unsigned int A_col_start,
 
 1301           unsigned int A_row_inc,
 
 1302           unsigned int A_col_inc,
 
 1303           unsigned int A_row_size,
 
 1304           unsigned int A_col_size,
 
 1305           unsigned int A_internal_rows,
 
 1306           unsigned int A_internal_cols,
 
 1308           unsigned int v_start,
 
 1310           unsigned int v_size,
 
 1312           unsigned int result_start,
 
 1313           unsigned int result_inc,
 
 1314           unsigned int result_size)
 
 1318   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1319   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1320   unsigned int lid = threadIdx.x;
 
 1322   for (
unsigned int row = row_gid; 
row < A_row_size; 
row += gridDim.x)
 
 1325     for (
unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
 
 1326       dot_prod += A[(
row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
 
 1332         work[lid] += work[lid+
stride];
 
 1336       result[
row * result_inc + result_start] = work[0];
 
 1341 template<
typename NumericT>
 
 1344           unsigned int A_row_start,
 
 1345           unsigned int A_col_start,
 
 1346           unsigned int A_row_inc,
 
 1347           unsigned int A_col_inc,
 
 1348           unsigned int A_row_size,
 
 1349           unsigned int A_col_size,
 
 1350           unsigned int A_internal_rows,
 
 1351           unsigned int A_internal_cols,
 
 1353           unsigned int v_start,
 
 1355           unsigned int v_size,
 
 1357           unsigned int result_start,
 
 1358           unsigned int result_inc,
 
 1359           unsigned int result_size)
 
 1361   for (
unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; 
row < A_col_size; 
row += gridDim.x * blockDim.x)
 
 1364     for (
unsigned int col = 0; col < A_row_size; ++col)
 
 1365       dot_prod += A[(
row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
 
 1366     result[
row * result_inc + result_start] = 
dot_prod;
 
 1383 template<
typename NumericT>
 
 1386           unsigned int A_start1, 
unsigned int A_start2,
 
 1387           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1388           unsigned int A_size1,  
unsigned int A_size2,
 
 1389           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1392           unsigned int options2,
 
 1405   if (options2 & (1 << 0))
 
 1407   if (options2 & (1 << 1))
 
 1410   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1411   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1413   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1416     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1417       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + 
start2];
 
 1423 template<
typename NumericT>
 
 1426           unsigned int A_start1, 
unsigned int A_start2,
 
 1427           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1428           unsigned int A_size1,  
unsigned int A_size2,
 
 1429           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1432           unsigned int options2,
 
 1445   if (options2 & (1 << 0))
 
 1447   if (options2 & (1 << 1))
 
 1450   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1451   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1453   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1456     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1457       A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + 
start2];
 
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) 
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
result_of::size_type< T >::type start2(T const &obj)
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void convert_row_kernel(DestNumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const SrcNumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)