1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_ 
   33 template<
typename DestNumericT, 
typename SrcNumericT>
 
   35                                   unsigned int A_start1, 
unsigned int A_start2,
 
   36                                   unsigned int A_inc1,   
unsigned int A_inc2,
 
   37                                   unsigned int A_size1,  
unsigned int A_size2,
 
   38                                   unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
   40                                   const SrcNumericT * B,
 
   41                                   unsigned int B_start1, 
unsigned int B_start2,
 
   42                                   unsigned int B_inc1,   
unsigned int B_inc2,
 
   43                                   unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
   45   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
   46   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
   48   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
   49     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
   50       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1];
 
   58 template<
typename NumericT>
 
   60                               unsigned int A_start1, 
unsigned int A_start2,
 
   61                               unsigned int A_inc1,   
unsigned int A_inc2,
 
   62                               unsigned int A_size1,  
unsigned int A_size2,
 
   63                               unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
   66                               unsigned int options2,
 
   68                               unsigned int B_start1, 
unsigned int B_start2,
 
   69                               unsigned int B_inc1,   
unsigned int B_inc2,
 
   70                               unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
   73   if (options2 & (1 << 0))
 
   76   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
   77   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
   79   if (options2 & (1 << 1))
 
   81     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
   82       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
   83         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
 
   87     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
   88       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
   89         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
 
   94 template<
typename NumericT>
 
   96                               unsigned int A_start1, 
unsigned int A_start2,
 
   97                               unsigned int A_inc1,   
unsigned int A_inc2,
 
   98                               unsigned int A_size1,  
unsigned int A_size2,
 
   99                               unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  102                               unsigned int options2,
 
  104                               unsigned int B_start1, 
unsigned int B_start2,
 
  105                               unsigned int B_inc1,   
unsigned int B_inc2,
 
  106                               unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  109   if (options2 & (1 << 0))
 
  112   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  113   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  115   if (options2 & (1 << 1))
 
  117     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  118       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  119         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
 
  123     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  124       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  125         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
 
  135 template<
typename NumericT>
 
  137                                 unsigned int A_start1, 
unsigned int A_start2,
 
  138                                 unsigned int A_inc1,   
unsigned int A_inc2,
 
  139                                 unsigned int A_size1,  
unsigned int A_size2,
 
  140                                 unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  143                                 unsigned int options2,
 
  145                                 unsigned int B_start1, 
unsigned int B_start2,
 
  146                                 unsigned int B_inc1,   
unsigned int B_inc2,
 
  147                                 unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  150                                 unsigned int options3,
 
  152                                 unsigned int C_start1, 
unsigned int C_start2,
 
  153                                 unsigned int C_inc1,   
unsigned int C_inc2,
 
  154                                 unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  157   if (options2 & (1 << 0))
 
  161   if (options3 & (1 << 0))
 
  164   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  165   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  167   if (options2 & (1 << 1))
 
  169     if (options3 & (1 << 1))
 
  171       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  172         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  173           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  174         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  175         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  179       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  180         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  181           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  182         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  183         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  188     if (options3 & (1 << 1))
 
  190       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  191         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  192           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  193         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  194         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  198       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  199         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  200           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  201         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  202         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  209 template<
typename NumericT>
 
  211                                 unsigned int A_start1, 
unsigned int A_start2,
 
  212                                 unsigned int A_inc1,   
unsigned int A_inc2,
 
  213                                 unsigned int A_size1,  
unsigned int A_size2,
 
  214                                 unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  217                                 unsigned int options2,
 
  219                                 unsigned int B_start1, 
unsigned int B_start2,
 
  220                                 unsigned int B_inc1,   
unsigned int B_inc2,
 
  221                                 unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  224                                 unsigned int options3,
 
  226                                 unsigned int C_start1, 
unsigned int C_start2,
 
  227                                 unsigned int C_inc1,   
unsigned int C_inc2,
 
  228                                 unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  231   if (options2 & (1 << 0))
 
  235   if (options3 & (1 << 0))
 
  238   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  239   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  241   if (options2 & (1 << 1))
 
  243     if (options3 & (1 << 1))
 
  245       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  246         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  247           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  248         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  249         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  253       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  254         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  255           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  256         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  257         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  262     if (options3 & (1 << 1))
 
  264       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  265         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  266           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  267         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  268         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  272       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  273         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  274           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  275         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  276         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  282 template<
typename NumericT>
 
  284                                 unsigned int A_start1, 
unsigned int A_start2,
 
  285                                 unsigned int A_inc1,   
unsigned int A_inc2,
 
  286                                 unsigned int A_size1,  
unsigned int A_size2,
 
  287                                 unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  290                                 unsigned int options2,
 
  292                                 unsigned int B_start1, 
unsigned int B_start2,
 
  293                                 unsigned int B_inc1,   
unsigned int B_inc2,
 
  294                                 unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  297                                 unsigned int options3,
 
  299                                 unsigned int C_start1, 
unsigned int C_start2,
 
  300                                 unsigned int C_inc1,   
unsigned int C_inc2,
 
  301                                 unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  304   if (options2 & (1 << 0))
 
  308   if (options3 & (1 << 0))
 
  311   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  312   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  314   if (options2 & (1 << 1))
 
  316     if (options3 & (1 << 1))
 
  318       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  319         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  320           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  321         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  322         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  326       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  327         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  328           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  329         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  330         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  335     if (options3 & (1 << 1))
 
  337       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  338         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  339           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  340         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  341         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  345       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  346         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  347           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  348         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  349         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  356 template<
typename NumericT>
 
  359           unsigned int A_start1, 
unsigned int A_start2,
 
  360           unsigned int A_inc1,   
unsigned int A_inc2,
 
  361           unsigned int A_size1,  
unsigned int A_size2,
 
  362           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  365           unsigned int options2,
 
  367           unsigned int B_start1, 
unsigned int B_start2,
 
  368           unsigned int B_inc1,   
unsigned int B_inc2,
 
  369           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  372           unsigned int options3,
 
  374           unsigned int C_start1, 
unsigned int C_start2,
 
  375           unsigned int C_inc1,   
unsigned int C_inc2,
 
  376           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  379   if (options2 & (1 << 0))
 
  383   if (options3 & (1 << 0))
 
  386   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  387   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  389   if (options2 & (1 << 1))
 
  391     if (options3 & (1 << 1))
 
  393       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  394         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  395           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  396         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  397         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  401       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  402         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  403           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  404         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  405         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  410     if (options3 & (1 << 1))
 
  412       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  413         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  414           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  415         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  416         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  420       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  421         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  422           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  423         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  424         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  435 template<
typename NumericT>
 
  438           unsigned int A_start1, 
unsigned int A_start2,
 
  439           unsigned int A_inc1,   
unsigned int A_inc2,
 
  440           unsigned int A_size1,  
unsigned int A_size2,
 
  441           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  444           unsigned int options2,
 
  446           unsigned int B_start1, 
unsigned int B_start2,
 
  447           unsigned int B_inc1,   
unsigned int B_inc2,
 
  448           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  451           unsigned int options3,
 
  453           unsigned int C_start1, 
unsigned int C_start2,
 
  454           unsigned int C_inc1,   
unsigned int C_inc2,
 
  455           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  458   if (options2 & (1 << 0))
 
  462   if (options3 & (1 << 0))
 
  465   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  466   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  468   if (options2 & (1 << 1))
 
  470     if (options3 & (1 << 1))
 
  472       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  473         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  474           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  475        += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  476         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  480       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  481         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  482           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  483        += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  484         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  489     if (options3 & (1 << 1))
 
  491       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  492         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  493           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  494        += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  495         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  499       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  500         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  501           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  502        += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  503         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  510 template<
typename NumericT>
 
  513           unsigned int A_start1, 
unsigned int A_start2,
 
  514           unsigned int A_inc1,   
unsigned int A_inc2,
 
  515           unsigned int A_size1,  
unsigned int A_size2,
 
  516           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  519           unsigned int options2,
 
  521           unsigned int B_start1, 
unsigned int B_start2,
 
  522           unsigned int B_inc1,   
unsigned int B_inc2,
 
  523           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  526           unsigned int options3,
 
  528           unsigned int C_start1, 
unsigned int C_start2,
 
  529           unsigned int C_inc1,   
unsigned int C_inc2,
 
  530           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  533   if (options2 & (1 << 0))
 
  537   if (options3 & (1 << 0))
 
  540   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  541   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  543   if (options2 & (1 << 1))
 
  545     if (options3 & (1 << 1))
 
  547       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  548         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  549           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  550         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  551         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  555       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  556         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  557           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  558         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  559         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  564     if (options3 & (1 << 1))
 
  566       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  567         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  568           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  569         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  570         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  574       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  575         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  576           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  577         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  578         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  584 template<
typename NumericT>
 
  587           unsigned int A_start1, 
unsigned int A_start2,
 
  588           unsigned int A_inc1,   
unsigned int A_inc2,
 
  589           unsigned int A_size1,  
unsigned int A_size2,
 
  590           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  593           unsigned int options2,
 
  595           unsigned int B_start1, 
unsigned int B_start2,
 
  596           unsigned int B_inc1,   
unsigned int B_inc2,
 
  597           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  600           unsigned int options3,
 
  602           unsigned int C_start1, 
unsigned int C_start2,
 
  603           unsigned int C_inc1,   
unsigned int C_inc2,
 
  604           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  607   if (options2 & (1 << 0))
 
  611   if (options3 & (1 << 0))
 
  614   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  615   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  617   if (options2 & (1 << 1))
 
  619     if (options3 & (1 << 1))
 
  621       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  622         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  623           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  624         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  625         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  629       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  630         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  631           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  632         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  633         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  638     if (options3 & (1 << 1))
 
  640       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  641         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  642           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  643         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  644         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  648       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  649         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  650           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  651         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  652         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  659 template<
typename NumericT>
 
  662           unsigned int A_start1, 
unsigned int A_start2,
 
  663           unsigned int A_inc1,   
unsigned int A_inc2,
 
  664           unsigned int A_size1,  
unsigned int A_size2,
 
  665           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  668           unsigned int options2,
 
  670           unsigned int B_start1, 
unsigned int B_start2,
 
  671           unsigned int B_inc1,   
unsigned int B_inc2,
 
  672           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  675           unsigned int options3,
 
  677           unsigned int C_start1, 
unsigned int C_start2,
 
  678           unsigned int C_inc1,   
unsigned int C_inc2,
 
  679           unsigned int C_internal_size1,  
unsigned int C_internal_size2)
 
  682   if (options2 & (1 << 0))
 
  686   if (options3 & (1 << 0))
 
  689   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  690   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  692   if (options2 & (1 << 1))
 
  694     if (options3 & (1 << 1))
 
  696       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  697         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  698           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  699         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  700         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  704       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  705         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  706           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  707         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
 
  708         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  713     if (options3 & (1 << 1))
 
  715       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  716         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  717           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  718         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  719         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
 
  723       for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  724         for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  725           A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  726         = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
 
  727         + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
 
  738 template<
typename NumericT>
 
  741           unsigned int A_start1, 
unsigned int A_start2,
 
  742           unsigned int A_inc1,   
unsigned int A_inc2,
 
  743           unsigned int A_size1,  
unsigned int A_size2,
 
  744           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  747   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  748   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  750   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  751     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  752       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
 
  756 template<
typename NumericT>
 
  759           unsigned int A_start1, 
unsigned int A_start2,
 
  760           unsigned int A_inc1,   
unsigned int A_inc2,
 
  761           unsigned int A_size1,  
unsigned int A_size2,
 
  762           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  765   unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
 
  767   for (
unsigned int row = gid; 
row < A_size1; 
row += blockDim.x * gridDim.x)
 
  768     A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1] = alpha;
 
  775 template<
typename NumericT>
 
  778           unsigned int A_start1, 
unsigned int A_start2,
 
  779           unsigned int A_inc1,   
unsigned int A_inc2,
 
  780           unsigned int A_size1,  
unsigned int A_size2,
 
  781           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  784           unsigned int B_start1, 
unsigned int B_start2,
 
  785           unsigned int B_inc1,   
unsigned int B_inc2,
 
  786           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  789           unsigned int C_start1, 
unsigned int C_start2,
 
  790           unsigned int C_inc1,   
unsigned int C_inc2,
 
  791           unsigned int C_internal_size1,  
unsigned int C_internal_size2,
 
  793           unsigned int op_type) 
 
  795   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  796   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  800     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  801       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  802         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  803       = pow(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
 
  804             C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
 
  806   else if (op_type == 1)
 
  808     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  809       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  810         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  811       = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
 
  812       / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
 
  814   else if (op_type == 0)
 
  816     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  817       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  818         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  819       = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
 
  820       * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
 
  824 template<
typename NumericT>
 
  827           unsigned int A_start1, 
unsigned int A_start2,
 
  828           unsigned int A_inc1,   
unsigned int A_inc2,
 
  829           unsigned int A_size1,  
unsigned int A_size2,
 
  830           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  833           unsigned int B_start1, 
unsigned int B_start2,
 
  834           unsigned int B_inc1,   
unsigned int B_inc2,
 
  835           unsigned int B_internal_size1,  
unsigned int B_internal_size2,
 
  838           unsigned int C_start1, 
unsigned int C_start2,
 
  839           unsigned int C_inc1,   
unsigned int C_inc2,
 
  840           unsigned int C_internal_size1,  
unsigned int C_internal_size2,
 
  842           unsigned int op_type) 
 
  844   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  845   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  849     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  850       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  851         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  852       = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
 
  853       / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
 
  855   else if (op_type == 0)
 
  857     for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  858       for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  859         A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 
  860       = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
 
  861       * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
 
  871 template<
typename NumericT>
 
  874           unsigned int A_start1, 
unsigned int A_start2,
 
  875           unsigned int A_inc1,   
unsigned int A_inc2,
 
  876           unsigned int A_size1,  
unsigned int A_size2,
 
  877           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  880           unsigned int B_start1, 
unsigned int B_start2,
 
  881           unsigned int B_inc1,   
unsigned int B_inc2,
 
  882           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  884   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  885   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  887   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  888     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  889       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
  894 template<
typename NumericT>
 
  897           unsigned int A_start1, 
unsigned int A_start2,
 
  898           unsigned int A_inc1,   
unsigned int A_inc2,
 
  899           unsigned int A_size1,  
unsigned int A_size2,
 
  900           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  903           unsigned int B_start1, 
unsigned int B_start2,
 
  904           unsigned int B_inc1,   
unsigned int B_inc2,
 
  905           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  907   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  908   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  910   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  911     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  912       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
  917 template<
typename NumericT>
 
  920           unsigned int A_start1, 
unsigned int A_start2,
 
  921           unsigned int A_inc1,   
unsigned int A_inc2,
 
  922           unsigned int A_size1,  
unsigned int A_size2,
 
  923           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  926           unsigned int B_start1, 
unsigned int B_start2,
 
  927           unsigned int B_inc1,   
unsigned int B_inc2,
 
  928           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  930   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  931   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  933   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  934     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  935       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
  940 template<
typename NumericT>
 
  943           unsigned int A_start1, 
unsigned int A_start2,
 
  944           unsigned int A_inc1,   
unsigned int A_inc2,
 
  945           unsigned int A_size1,  
unsigned int A_size2,
 
  946           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  949           unsigned int B_start1, 
unsigned int B_start2,
 
  950           unsigned int B_inc1,   
unsigned int B_inc2,
 
  951           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  953   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  954   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  956   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  957     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  958       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
  963 template<
typename NumericT>
 
  966           unsigned int A_start1, 
unsigned int A_start2,
 
  967           unsigned int A_inc1,   
unsigned int A_inc2,
 
  968           unsigned int A_size1,  
unsigned int A_size2,
 
  969           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  972           unsigned int B_start1, 
unsigned int B_start2,
 
  973           unsigned int B_inc1,   
unsigned int B_inc2,
 
  974           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  976   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
  977   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
  979   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
  980     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
  981       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
  986 template<
typename NumericT>
 
  989           unsigned int A_start1, 
unsigned int A_start2,
 
  990           unsigned int A_inc1,   
unsigned int A_inc2,
 
  991           unsigned int A_size1,  
unsigned int A_size2,
 
  992           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  995           unsigned int B_start1, 
unsigned int B_start2,
 
  996           unsigned int B_inc1,   
unsigned int B_inc2,
 
  997           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
  999   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1000   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1002   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1003     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1004       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1009 template<
typename NumericT>
 
 1012           unsigned int A_start1, 
unsigned int A_start2,
 
 1013           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1014           unsigned int A_size1,  
unsigned int A_size2,
 
 1015           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1018           unsigned int B_start1, 
unsigned int B_start2,
 
 1019           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1020           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1022   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1023   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1025   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1026     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1027       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1032 template<
typename NumericT>
 
 1035           unsigned int A_start1, 
unsigned int A_start2,
 
 1036           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1037           unsigned int A_size1,  
unsigned int A_size2,
 
 1038           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1041           unsigned int B_start1, 
unsigned int B_start2,
 
 1042           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1043           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1045   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1046   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1048   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1049     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1050       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1055 template<
typename NumericT>
 
 1058           unsigned int A_start1, 
unsigned int A_start2,
 
 1059           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1060           unsigned int A_size1,  
unsigned int A_size2,
 
 1061           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1064           unsigned int B_start1, 
unsigned int B_start2,
 
 1065           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1066           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1068   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1069   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1071   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1072     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1073       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1078 template<
typename NumericT>
 
 1081           unsigned int A_start1, 
unsigned int A_start2,
 
 1082           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1083           unsigned int A_size1,  
unsigned int A_size2,
 
 1084           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1087           unsigned int B_start1, 
unsigned int B_start2,
 
 1088           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1089           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1091   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1092   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1094   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1095     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1096       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1101 template<
typename NumericT>
 
 1104           unsigned int A_start1, 
unsigned int A_start2,
 
 1105           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1106           unsigned int A_size1,  
unsigned int A_size2,
 
 1107           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1110           unsigned int B_start1, 
unsigned int B_start2,
 
 1111           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1112           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1114   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1115   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1117   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1118     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1119       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1124 template<
typename NumericT>
 
 1127           unsigned int A_start1, 
unsigned int A_start2,
 
 1128           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1129           unsigned int A_size1,  
unsigned int A_size2,
 
 1130           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1133           unsigned int B_start1, 
unsigned int B_start2,
 
 1134           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1135           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1137   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1138   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1140   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1141     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1142       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1147 template<
typename NumericT>
 
 1150           unsigned int A_start1, 
unsigned int A_start2,
 
 1151           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1152           unsigned int A_size1,  
unsigned int A_size2,
 
 1153           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1156           unsigned int B_start1, 
unsigned int B_start2,
 
 1157           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1158           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1160   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1161   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1163   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1164     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1165       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1170 template<
typename NumericT>
 
 1173           unsigned int A_start1, 
unsigned int A_start2,
 
 1174           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1175           unsigned int A_size1,  
unsigned int A_size2,
 
 1176           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1179           unsigned int B_start1, 
unsigned int B_start2,
 
 1180           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1181           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1183   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1184   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1186   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1187     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1188       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1193 template<
typename NumericT>
 
 1196           unsigned int A_start1, 
unsigned int A_start2,
 
 1197           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1198           unsigned int A_size1,  
unsigned int A_size2,
 
 1199           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1202           unsigned int B_start1, 
unsigned int B_start2,
 
 1203           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1204           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1206   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1207   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1209   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1210     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1211       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1216 template<
typename NumericT>
 
 1219           unsigned int A_start1, 
unsigned int A_start2,
 
 1220           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1221           unsigned int A_size1,  
unsigned int A_size2,
 
 1222           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1225           unsigned int B_start1, 
unsigned int B_start2,
 
 1226           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1227           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1229   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1230   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1232   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1233     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1234       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1239 template<
typename NumericT>
 
 1242           unsigned int A_start1, 
unsigned int A_start2,
 
 1243           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1244           unsigned int A_size1,  
unsigned int A_size2,
 
 1245           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1248           unsigned int B_start1, 
unsigned int B_start2,
 
 1249           unsigned int B_inc1,   
unsigned int B_inc2,
 
 1250           unsigned int B_internal_size1,  
unsigned int B_internal_size2)
 
 1252   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1253   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1255   for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
 
 1256     for (
unsigned int row = row_gid; 
row < A_size1; 
row += blockDim.x)
 
 1257       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
 
 1266 template<
typename NumericT>
 
 1269           unsigned int A_row_start,
 
 1270           unsigned int A_col_start,
 
 1271           unsigned int A_row_inc,
 
 1272           unsigned int A_col_inc,
 
 1273           unsigned int A_row_size,
 
 1274           unsigned int A_col_size,
 
 1275           unsigned int A_internal_rows,
 
 1276           unsigned int A_internal_cols,
 
 1278           unsigned int v_start,
 
 1280           unsigned int v_size,
 
 1282           unsigned int result_start,
 
 1283           unsigned int result_inc,
 
 1284           unsigned int result_size)
 
 1287   for (
unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; 
row < A_row_size; 
row += gridDim.x * blockDim.x)
 
 1290     for (
unsigned int col = 0; col < A_col_size; ++col)
 
 1291       dot_prod += A[(
row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
 
 1292     result[
row * result_inc + result_start] = 
dot_prod;
 
 1297 template<
typename NumericT>
 
 1300           unsigned int A_row_start,
 
 1301           unsigned int A_col_start,
 
 1302           unsigned int A_row_inc,
 
 1303           unsigned int A_col_inc,
 
 1304           unsigned int A_row_size,
 
 1305           unsigned int A_col_size,
 
 1306           unsigned int A_internal_rows,
 
 1307           unsigned int A_internal_cols,
 
 1309           unsigned int v_start,
 
 1311           unsigned int v_size,
 
 1313           unsigned int result_start,
 
 1314           unsigned int result_inc,
 
 1315           unsigned int result_size)
 
 1319   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1320   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1321   unsigned int lid = threadIdx.x;
 
 1323   for (
unsigned int row = row_gid; 
row < A_col_size; 
row += gridDim.x)
 
 1326     for (
unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
 
 1327       dot_prod += A[(
row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
 
 1333         work[lid] += work[lid+
stride];
 
 1337       result[
row * result_inc + result_start] = work[0];
 
 1354 template<
typename NumericT>
 
 1357           unsigned int A_start1, 
unsigned int A_start2,
 
 1358           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1359           unsigned int A_size1,  
unsigned int A_size2,
 
 1360           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1363           unsigned int options2,
 
 1376   if (options2 & (1 << 0))
 
 1378   if (options2 & (1 << 1))
 
 1381   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1382   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1384   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1387     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1388       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
 
 1394 template<
typename NumericT>
 
 1397           unsigned int A_start1, 
unsigned int A_start2,
 
 1398           unsigned int A_inc1,   
unsigned int A_inc2,
 
 1399           unsigned int A_size1,  
unsigned int A_size2,
 
 1400           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
 1403           unsigned int options2,
 
 1416   if (options2 & (1 << 0))
 
 1418   if (options2 & (1 << 1))
 
 1421   unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
 
 1422   unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 
 1424   for (
unsigned int row = row_gid; 
row < A_size1; 
row += gridDim.x)
 
 1427     for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
 
 1428       A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
 
 1433 template <
typename T>
 
 1442   unsigned int size = 
min(size1, size2);
 
 1443   if(blockIdx.x * blockDim.x + threadIdx.x == 0)
 
 1446   for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 
 1448            i += gridDim.x * blockDim.x)
 
 1450       D[i] = A[i*stride + i];
 
 1451       S[i+1] = (i + 1 < 
size2) ? A[i*stride + (i + 1)] : 0;
 
 1455 template <
typename T>
 
 1464   unsigned int size = 
min(size1, size2);
 
 1465   if(blockIdx.x * blockDim.x + threadIdx.x == 0)
 
 1468   for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 
 1470            i += gridDim.x * blockDim.x)
 
 1472       D[i] = A[i*stride + i];
 
 1473       S[i+1] = (i + 1 < 
size2) ? A[i + (i + 1) * 
stride] : 0;
 
 1479 template<
typename T>
 
 1483         unsigned int row_start,
 
 1484         unsigned int col_start,
 
 1488     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 
 1489     unsigned int sz = gridDim.x * blockDim.x;
 
 1491     for(
unsigned int i = row_start + x; i < 
size; i += sz)
 
 1493         V[i - row_start] = A[i * stride + col_start];
 
 1497 template<
typename T>
 
 1501         unsigned int row_start,
 
 1502         unsigned int col_start,
 
 1506     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 
 1507     unsigned int sz = gridDim.x * blockDim.x;
 
 1509     for(
unsigned int i = row_start + x; i < 
size; i += sz)
 
 1511         V[i - row_start] = A[i + col_start * 
stride];
 
 1515 template<
typename T>
 
 1519         unsigned int row_start,
 
 1520         unsigned int col_start,
 
 1524     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 
 1525     unsigned int sz = gridDim.x * blockDim.x;
 
 1527     for(
unsigned int i = col_start + x; i < 
size; i += sz)
 
 1529         V[i - col_start] = A[row_start * stride + i];
 
 1534 template<
typename T>
 
 1538         unsigned int row_start,
 
 1539         unsigned int col_start,
 
 1543     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 
 1544     unsigned int sz = gridDim.x * blockDim.x;
 
 1546     for(
unsigned int i = col_start + x; i < 
size; i += sz)
 
 1548         V[i - col_start] = A[row_start + i * 
stride];
 
 1555 template<
typename T>
 
 1559         unsigned int row_start,
 
 1560         unsigned int col_start,
 
 1567     for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
 
 1569         i += gridDim.x * blockDim.x)
 
 1572         for(
unsigned int j = row_start; j < 
size1; j++)
 
 1573             ss = ss +(V[j] * A[j * stride + i]);
 
 1575         for(
unsigned int j = row_start; j < 
size1; j++)
 
 1576             A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
 
 1580 template<
typename T>
 
 1584         unsigned int row_start,
 
 1585         unsigned int col_start,
 
 1592     for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
 
 1594         i += gridDim.x * blockDim.x)
 
 1597         for(
unsigned int j = row_start; j < 
size1; j++)
 
 1598             ss = ss +(V[j] * A[j + i * stride]);
 
 1600         for(
unsigned int j = row_start; j < 
size1; j++)
 
 1601             A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
 
 1607 template<
typename T>
 
 1611         unsigned int row_start,
 
 1612         unsigned int col_start,
 
 1617     __shared__ T sums[128];
 
 1620     for(
unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
 
 1623         for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
 
 1624             ss = ss + (V[j] * A[i * stride + j]);
 
 1625         sums[threadIdx.x] = ss;
 
 1633         for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
 
 1634             A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
 
 1638 template<
typename T>
 
 1642         unsigned int row_start,
 
 1643         unsigned int col_start,
 
 1648     __shared__ T sums[128];
 
 1651     for(
unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
 
 1654         for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
 
 1655             ss = ss + (V[j] * A[i + j * 
stride]);
 
 1656         sums[threadIdx.x] = ss;
 
 1664         for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
 
 1665             A[i + j * 
stride] = A[i + j * 
stride] - (2 * V[j] * sum_Av);
 
 1671 template<
typename T>
 
 1674         unsigned int th_Idx,
 
 1675         unsigned int bl_Dim)
 
 1677     unsigned int step = bl_Dim >> 1;
 
 1682             sums[th_Idx] += sums[th_Idx + 
step];
 
 1689 template <
typename T>
 
 1694         unsigned int strideQ)
 
 1696   __shared__ T sums[128];
 
 1698   for(
unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
 
 1701     for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
 
 1702       ss = ss + (V[j] * QL[i * strideQ + j]);
 
 1703     sums[threadIdx.x] = ss;
 
 1711     for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
 
 1712       QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
 
 1716 template <
typename T>
 
 1721         unsigned int strideQ)
 
 1723   __shared__ T sums[128];
 
 1725   for(
unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
 
 1728     for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
 
 1729       ss = ss + (V[j] * QL[i + j * strideQ]);
 
 1730     sums[threadIdx.x] = ss;
 
 1738     for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
 
 1739       QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
 
 1744 template <
typename T>
 
 1751         unsigned int start_i,
 
 1754     unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
 
 1755     __shared__ T cs_lcl[256];
 
 1756     __shared__ T ss_lcl[256];
 
 1758     T x = (j < 
size) ? matr[(end_i + 1) + j * 
stride] : 0;
 
 1760     unsigned int elems_num = end_i - start_i + 1;
 
 1761     unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
 
 1763     for(
unsigned int block_id = 0; block_id < block_num; block_id++)
 
 1765         unsigned int to = 
min(elems_num - block_id * blockDim.x, blockDim.x);
 
 1767         if(threadIdx.x < to)
 
 1769             cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
 
 1770             ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
 
 1775             for(
unsigned int ind = 0; ind < to; ind++)
 
 1777                 unsigned int i = end_i - (ind + block_id * blockDim.x);
 
 1778                 T z = matr[i + j * 
stride];
 
 1779                 T cs_val = cs_lcl[ind];
 
 1780                 T ss_val = ss_lcl[ind];
 
 1781                 matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
 
 1782                 x = -x * ss_val + z * cs_val;
 
 1788        matr[(start_i) + j * stride] = x;
 
 1791 template <
typename T>
 
 1798         unsigned int start_i,
 
 1801     unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
 
 1802     __shared__ T cs_lcl[256];
 
 1803     __shared__ T ss_lcl[256];
 
 1805     T x = (j < 
size) ? matr[(end_i + 1) *stride + j] : 0;
 
 1807     unsigned int elems_num = end_i - start_i + 1;
 
 1808     unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
 
 1810     for(
unsigned int block_id = 0; block_id < block_num; block_id++)
 
 1812         unsigned int to = 
min(elems_num - block_id * blockDim.x, blockDim.x);
 
 1814         if(threadIdx.x < to)
 
 1816             cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
 
 1817             ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
 
 1822             for(
unsigned int ind = 0; ind < to; ind++)
 
 1824                 unsigned int i = end_i - (ind + block_id * blockDim.x);
 
 1825                 T z = matr[i *stride + j];
 
 1826                 T cs_val = cs_lcl[ind];
 
 1827                 T ss_val = ss_lcl[ind];
 
 1828                 matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
 
 1829                 x = -x * ss_val + z * cs_val;
 
 1835        matr[(start_i) * stride + j] = x;
 
__global__ void matrix_col_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void house_update_A_right_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void convert_col_kernel(DestNumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const SrcNumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) 
__global__ void house_update_A_left_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void matrix_col_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void bidiag_pack_column_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void house_update_A_left_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
__global__ void matrix_col_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
__global__ void matrix_col_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void house_update_QL_row_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void scaled_rank1_update_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void matrix_col_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_m_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
result_of::size_type< T >::type start2(T const &obj)
__global__ void matrix_col_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void matrix_col_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__device__ void col_reduce_lcl_array(T *sums, unsigned int th_Idx, unsigned int bl_Dim)
__global__ void matrix_col_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void element_op_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void house_update_A_right_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void matrix_col_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void element_op_int_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_col_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void trans_vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void house_update_QL_column_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void givens_next_column_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void bidiag_pack_row_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
NumericT min(std::vector< NumericT > const &v1)