1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_ 
   38 template<
typename NumericT>
 
   42           unsigned int A_row_start,
 
   43           unsigned int A_col_start,
 
   44           unsigned int A_row_inc,
 
   45           unsigned int A_col_inc,
 
   46           unsigned int A_row_size,
 
   47           unsigned int A_col_size,
 
   48           unsigned int A_internal_rows,
 
   49           unsigned int A_internal_cols,
 
   51           unsigned int B_row_start,
 
   52           unsigned int B_col_start,
 
   53           unsigned int B_row_inc,
 
   54           unsigned int B_col_inc,
 
   55           unsigned int B_row_size,
 
   56           unsigned int B_col_size,
 
   57           unsigned int B_internal_rows,
 
   58           unsigned int B_internal_cols,
 
   61           unsigned int C_row_start,
 
   62           unsigned int C_col_start,
 
   63           unsigned int C_row_inc,
 
   64           unsigned int C_col_inc,
 
   65           unsigned int C_row_size,
 
   66           unsigned int C_col_size,
 
   67           unsigned int C_internal_rows,
 
   68           unsigned int C_internal_cols)
 
   79   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
   80   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
   81   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
   83   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
   85   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
   86   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
   88   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
   89   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
   94     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
   95     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
   97     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
   98     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
   99       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  100       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  101       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  102       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  103       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  104       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  105       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  106       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  107       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  108       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  109       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  110       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  111       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  112       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  113       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  114       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  119   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
  120     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
  125 template<
typename NumericT>
 
  129           unsigned int A_row_start,
 
  130           unsigned int A_col_start,
 
  131           unsigned int A_row_inc,
 
  132           unsigned int A_col_inc,
 
  133           unsigned int A_row_size,
 
  134           unsigned int A_col_size,
 
  135           unsigned int A_internal_rows,
 
  136           unsigned int A_internal_cols,
 
  138           unsigned int B_row_start,
 
  139           unsigned int B_col_start,
 
  140           unsigned int B_row_inc,
 
  141           unsigned int B_col_inc,
 
  142           unsigned int B_row_size,
 
  143           unsigned int B_col_size,
 
  144           unsigned int B_internal_rows,
 
  145           unsigned int B_internal_cols,
 
  148           unsigned int C_row_start,
 
  149           unsigned int C_col_start,
 
  150           unsigned int C_row_inc,
 
  151           unsigned int C_col_inc,
 
  152           unsigned int C_row_size,
 
  153           unsigned int C_col_size,
 
  154           unsigned int C_internal_rows,
 
  155           unsigned int C_internal_cols)
 
  166   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
  167   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
  168   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
  169   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
  170   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
  172   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  173   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  175   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  176   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  181     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
  182     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
  184     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  185     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  186       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  187       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  188       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  189       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  190       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  191       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  192       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  193       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  194       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  195       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  196       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  197       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  198       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  199       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  200       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  201       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  206   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
  207     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
  212 template<
typename NumericT>
 
  216           unsigned int A_row_start,
 
  217           unsigned int A_col_start,
 
  218           unsigned int A_row_inc,
 
  219           unsigned int A_col_inc,
 
  220           unsigned int A_row_size,
 
  221           unsigned int A_col_size,
 
  222           unsigned int A_internal_rows,
 
  223           unsigned int A_internal_cols,
 
  225           unsigned int B_row_start,
 
  226           unsigned int B_col_start,
 
  227           unsigned int B_row_inc,
 
  228           unsigned int B_col_inc,
 
  229           unsigned int B_row_size,
 
  230           unsigned int B_col_size,
 
  231           unsigned int B_internal_rows,
 
  232           unsigned int B_internal_cols,
 
  235           unsigned int C_row_start,
 
  236           unsigned int C_col_start,
 
  237           unsigned int C_row_inc,
 
  238           unsigned int C_col_inc,
 
  239           unsigned int C_row_size,
 
  240           unsigned int C_col_size,
 
  241           unsigned int C_internal_rows,
 
  242           unsigned int C_internal_cols)
 
  253   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
  255   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
  257   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
  259   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  260   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  262   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  263   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  268     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
  269     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
  271     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  272     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  273       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  274       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  275       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  276       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  277       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  278       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  279       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  280       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  281       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  282       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  283       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  284       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  285       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  286       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  287       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  288       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  293   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
  294     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
  299 template<
typename NumericT>
 
  303           unsigned int A_row_start,
 
  304           unsigned int A_col_start,
 
  305           unsigned int A_row_inc,
 
  306           unsigned int A_col_inc,
 
  307           unsigned int A_row_size,
 
  308           unsigned int A_col_size,
 
  309           unsigned int A_internal_rows,
 
  310           unsigned int A_internal_cols,
 
  312           unsigned int B_row_start,
 
  313           unsigned int B_col_start,
 
  314           unsigned int B_row_inc,
 
  315           unsigned int B_col_inc,
 
  316           unsigned int B_row_size,
 
  317           unsigned int B_col_size,
 
  318           unsigned int B_internal_rows,
 
  319           unsigned int B_internal_cols,
 
  322           unsigned int C_row_start,
 
  323           unsigned int C_col_start,
 
  324           unsigned int C_row_inc,
 
  325           unsigned int C_col_inc,
 
  326           unsigned int C_row_size,
 
  327           unsigned int C_col_size,
 
  328           unsigned int C_internal_rows,
 
  329           unsigned int C_internal_cols)
 
  340   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
  342   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
  343   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
  344   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
  346   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  347   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  349   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  350   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  355     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
  356     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
  358     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  359     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  360       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  361       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  362       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  363       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  364       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  365       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  366       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  367       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  368       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  369       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  370       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  371       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  372       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  373       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  374       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  375       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  380   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
  381     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
  393 template<
typename NumericT>
 
  397           unsigned int A_row_start,
 
  398           unsigned int A_col_start,
 
  399           unsigned int A_row_inc,
 
  400           unsigned int A_col_inc,
 
  401           unsigned int A_row_size,
 
  402           unsigned int A_col_size,
 
  403           unsigned int A_internal_rows,
 
  404           unsigned int A_internal_cols,
 
  406           unsigned int B_row_start,
 
  407           unsigned int B_col_start,
 
  408           unsigned int B_row_inc,
 
  409           unsigned int B_col_inc,
 
  410           unsigned int B_row_size,
 
  411           unsigned int B_col_size,
 
  412           unsigned int B_internal_rows,
 
  413           unsigned int B_internal_cols,
 
  416           unsigned int C_row_start,
 
  417           unsigned int C_col_start,
 
  418           unsigned int C_row_inc,
 
  419           unsigned int C_col_inc,
 
  420           unsigned int C_row_size,
 
  421           unsigned int C_col_size,
 
  422           unsigned int C_internal_rows,
 
  423           unsigned int C_internal_cols)
 
  434   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
  435   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
  436   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
  438   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
  440   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  441   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  443   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  444   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  449     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
  450     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
  452     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  453     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  454       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  455       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  456       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  457       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  458       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  459       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  460       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  461       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  462       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  463       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  464       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  465       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  466       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  467       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  468       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  469       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  474   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
  475     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
  480 template<
typename NumericT>
 
  484           unsigned int A_row_start,
 
  485           unsigned int A_col_start,
 
  486           unsigned int A_row_inc,
 
  487           unsigned int A_col_inc,
 
  488           unsigned int A_row_size,
 
  489           unsigned int A_col_size,
 
  490           unsigned int A_internal_rows,
 
  491           unsigned int A_internal_cols,
 
  493           unsigned int B_row_start,
 
  494           unsigned int B_col_start,
 
  495           unsigned int B_row_inc,
 
  496           unsigned int B_col_inc,
 
  497           unsigned int B_row_size,
 
  498           unsigned int B_col_size,
 
  499           unsigned int B_internal_rows,
 
  500           unsigned int B_internal_cols,
 
  503           unsigned int C_row_start,
 
  504           unsigned int C_col_start,
 
  505           unsigned int C_row_inc,
 
  506           unsigned int C_col_inc,
 
  507           unsigned int C_row_size,
 
  508           unsigned int C_col_size,
 
  509           unsigned int C_internal_rows,
 
  510           unsigned int C_internal_cols)
 
  521   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
  522   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
  523   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
  524   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
  525   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
  527   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  528   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  530   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  531   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  536     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
  537     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
  539     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  540     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  541       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  542       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  543       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  544       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  545       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  546       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  547       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  548       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  549       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  550       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  551       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  552       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  553       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  554       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  555       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  556       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  561   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
  562     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
  567 template<
typename NumericT>
 
  571           unsigned int A_row_start,
 
  572           unsigned int A_col_start,
 
  573           unsigned int A_row_inc,
 
  574           unsigned int A_col_inc,
 
  575           unsigned int A_row_size,
 
  576           unsigned int A_col_size,
 
  577           unsigned int A_internal_rows,
 
  578           unsigned int A_internal_cols,
 
  580           unsigned int B_row_start,
 
  581           unsigned int B_col_start,
 
  582           unsigned int B_row_inc,
 
  583           unsigned int B_col_inc,
 
  584           unsigned int B_row_size,
 
  585           unsigned int B_col_size,
 
  586           unsigned int B_internal_rows,
 
  587           unsigned int B_internal_cols,
 
  590           unsigned int C_row_start,
 
  591           unsigned int C_col_start,
 
  592           unsigned int C_row_inc,
 
  593           unsigned int C_col_inc,
 
  594           unsigned int C_row_size,
 
  595           unsigned int C_col_size,
 
  596           unsigned int C_internal_rows,
 
  597           unsigned int C_internal_cols)
 
  608   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
  610   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
  612   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
  614   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  615   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  617   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  618   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  623     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
  624     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
  626     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  627     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  628       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  629       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  630       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  631       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  632       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  633       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  634       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  635       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  636       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  637       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  638       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  639       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  640       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  641       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  642       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  643       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  648   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
  649     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
  654 template<
typename NumericT>
 
  658           unsigned int A_row_start,
 
  659           unsigned int A_col_start,
 
  660           unsigned int A_row_inc,
 
  661           unsigned int A_col_inc,
 
  662           unsigned int A_row_size,
 
  663           unsigned int A_col_size,
 
  664           unsigned int A_internal_rows,
 
  665           unsigned int A_internal_cols,
 
  667           unsigned int B_row_start,
 
  668           unsigned int B_col_start,
 
  669           unsigned int B_row_inc,
 
  670           unsigned int B_col_inc,
 
  671           unsigned int B_row_size,
 
  672           unsigned int B_col_size,
 
  673           unsigned int B_internal_rows,
 
  674           unsigned int B_internal_cols,
 
  677           unsigned int C_row_start,
 
  678           unsigned int C_col_start,
 
  679           unsigned int C_row_inc,
 
  680           unsigned int C_col_inc,
 
  681           unsigned int C_row_size,
 
  682           unsigned int C_col_size,
 
  683           unsigned int C_internal_rows,
 
  684           unsigned int C_internal_cols)
 
  695   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
  697   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
  698   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
  699   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
  701   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  702   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
  704   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  705   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  710     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
  711     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
  713     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  714     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  715       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  716       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  717       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  718       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  719       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  720       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  721       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  722       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  723       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  724       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  725       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  726       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  727       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  728       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  729       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  730       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  735   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
  736     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
  749 template<
typename NumericT>
 
  753           unsigned int A_row_start,
 
  754           unsigned int A_col_start,
 
  755           unsigned int A_row_inc,
 
  756           unsigned int A_col_inc,
 
  757           unsigned int A_row_size,
 
  758           unsigned int A_col_size,
 
  759           unsigned int A_internal_rows,
 
  760           unsigned int A_internal_cols,
 
  762           unsigned int B_row_start,
 
  763           unsigned int B_col_start,
 
  764           unsigned int B_row_inc,
 
  765           unsigned int B_col_inc,
 
  766           unsigned int B_row_size,
 
  767           unsigned int B_col_size,
 
  768           unsigned int B_internal_rows,
 
  769           unsigned int B_internal_cols,
 
  772           unsigned int C_row_start,
 
  773           unsigned int C_col_start,
 
  774           unsigned int C_row_inc,
 
  775           unsigned int C_col_inc,
 
  776           unsigned int C_row_size,
 
  777           unsigned int C_col_size,
 
  778           unsigned int C_internal_rows,
 
  779           unsigned int C_internal_cols)
 
  790   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
  791   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
  792   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
  793   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
  794   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
  796   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  797   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
  799   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  800   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  805     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
  806     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
  808     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  809     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  810       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  811       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  812       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  813       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  814       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  815       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  816       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  817       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  818       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  819       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  820       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  821       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  822       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  823       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  824       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  825       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  830   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
  831     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
  836 template<
typename NumericT>
 
  840           unsigned int A_row_start,
 
  841           unsigned int A_col_start,
 
  842           unsigned int A_row_inc,
 
  843           unsigned int A_col_inc,
 
  844           unsigned int A_row_size,
 
  845           unsigned int A_col_size,
 
  846           unsigned int A_internal_rows,
 
  847           unsigned int A_internal_cols,
 
  849           unsigned int B_row_start,
 
  850           unsigned int B_col_start,
 
  851           unsigned int B_row_inc,
 
  852           unsigned int B_col_inc,
 
  853           unsigned int B_row_size,
 
  854           unsigned int B_col_size,
 
  855           unsigned int B_internal_rows,
 
  856           unsigned int B_internal_cols,
 
  859           unsigned int C_row_start,
 
  860           unsigned int C_col_start,
 
  861           unsigned int C_row_inc,
 
  862           unsigned int C_col_inc,
 
  863           unsigned int C_row_size,
 
  864           unsigned int C_col_size,
 
  865           unsigned int C_internal_rows,
 
  866           unsigned int C_internal_cols)
 
  877   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
  878   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
  879   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
  881   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
  883   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  884   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
  886   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  887   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  892     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
  893     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
  895     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  896     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  897       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  898       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  899       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  900       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  901       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  902       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  903       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  904       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  905       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  906       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  907       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  908       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  909       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  910       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  911       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  912       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  917   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
  918     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
  923 template<
typename NumericT>
 
  927           unsigned int A_row_start,
 
  928           unsigned int A_col_start,
 
  929           unsigned int A_row_inc,
 
  930           unsigned int A_col_inc,
 
  931           unsigned int A_row_size,
 
  932           unsigned int A_col_size,
 
  933           unsigned int A_internal_rows,
 
  934           unsigned int A_internal_cols,
 
  936           unsigned int B_row_start,
 
  937           unsigned int B_col_start,
 
  938           unsigned int B_row_inc,
 
  939           unsigned int B_col_inc,
 
  940           unsigned int B_row_size,
 
  941           unsigned int B_col_size,
 
  942           unsigned int B_internal_rows,
 
  943           unsigned int B_internal_cols,
 
  946           unsigned int C_row_start,
 
  947           unsigned int C_col_start,
 
  948           unsigned int C_row_inc,
 
  949           unsigned int C_col_inc,
 
  950           unsigned int C_row_size,
 
  951           unsigned int C_col_size,
 
  952           unsigned int C_internal_rows,
 
  953           unsigned int C_internal_cols)
 
  964   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
  966   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
  967   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
  968   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
  970   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
  971   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
  973   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
  974   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
  979     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
  980     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
  982     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
  983     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
  984       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  985       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  986       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  987       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  988       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  989       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  990       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  991       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  992       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  993       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  994       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  995       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  996       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  997       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  998       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
  999       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1004   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 1005     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 1010 template<
typename NumericT>
 
 1014           unsigned int A_row_start,
 
 1015           unsigned int A_col_start,
 
 1016           unsigned int A_row_inc,
 
 1017           unsigned int A_col_inc,
 
 1018           unsigned int A_row_size,
 
 1019           unsigned int A_col_size,
 
 1020           unsigned int A_internal_rows,
 
 1021           unsigned int A_internal_cols,
 
 1023           unsigned int B_row_start,
 
 1024           unsigned int B_col_start,
 
 1025           unsigned int B_row_inc,
 
 1026           unsigned int B_col_inc,
 
 1027           unsigned int B_row_size,
 
 1028           unsigned int B_col_size,
 
 1029           unsigned int B_internal_rows,
 
 1030           unsigned int B_internal_cols,
 
 1033           unsigned int C_row_start,
 
 1034           unsigned int C_col_start,
 
 1035           unsigned int C_row_inc,
 
 1036           unsigned int C_col_inc,
 
 1037           unsigned int C_row_size,
 
 1038           unsigned int C_col_size,
 
 1039           unsigned int C_internal_rows,
 
 1040           unsigned int C_internal_cols)
 
 1051   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
 1053   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 1055   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 1057   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
 1058   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 1060   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1061   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1066     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 1067     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 1069     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1070     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1071       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1072       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1073       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1074       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1075       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1076       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1077       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1078       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1079       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1080       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1081       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1082       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1083       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1084       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1085       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1086       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1091   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 1092     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 1104 template<
typename NumericT>
 
 1108           unsigned int A_row_start,
 
 1109           unsigned int A_col_start,
 
 1110           unsigned int A_row_inc,
 
 1111           unsigned int A_col_inc,
 
 1112           unsigned int A_row_size,
 
 1113           unsigned int A_col_size,
 
 1114           unsigned int A_internal_rows,
 
 1115           unsigned int A_internal_cols,
 
 1117           unsigned int B_row_start,
 
 1118           unsigned int B_col_start,
 
 1119           unsigned int B_row_inc,
 
 1120           unsigned int B_col_inc,
 
 1121           unsigned int B_row_size,
 
 1122           unsigned int B_col_size,
 
 1123           unsigned int B_internal_rows,
 
 1124           unsigned int B_internal_cols,
 
 1127           unsigned int C_row_start,
 
 1128           unsigned int C_col_start,
 
 1129           unsigned int C_row_inc,
 
 1130           unsigned int C_col_inc,
 
 1131           unsigned int C_row_size,
 
 1132           unsigned int C_col_size,
 
 1133           unsigned int C_internal_rows,
 
 1134           unsigned int C_internal_cols)
 
 1145   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
 1146   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
 1147   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
 1148   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
 1149   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 1151   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
 1152   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 1154   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1155   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1160     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 1161     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 1163     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1164     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1165       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1166       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1167       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1168       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1169       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1170       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1171       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1172       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1173       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1174       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1175       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1176       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1177       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1178       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1179       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1180       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1185   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 1186     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 1191 template<
typename NumericT>
 
 1195           unsigned int A_row_start,
 
 1196           unsigned int A_col_start,
 
 1197           unsigned int A_row_inc,
 
 1198           unsigned int A_col_inc,
 
 1199           unsigned int A_row_size,
 
 1200           unsigned int A_col_size,
 
 1201           unsigned int A_internal_rows,
 
 1202           unsigned int A_internal_cols,
 
 1204           unsigned int B_row_start,
 
 1205           unsigned int B_col_start,
 
 1206           unsigned int B_row_inc,
 
 1207           unsigned int B_col_inc,
 
 1208           unsigned int B_row_size,
 
 1209           unsigned int B_col_size,
 
 1210           unsigned int B_internal_rows,
 
 1211           unsigned int B_internal_cols,
 
 1214           unsigned int C_row_start,
 
 1215           unsigned int C_col_start,
 
 1216           unsigned int C_row_inc,
 
 1217           unsigned int C_col_inc,
 
 1218           unsigned int C_row_size,
 
 1219           unsigned int C_col_size,
 
 1220           unsigned int C_internal_rows,
 
 1221           unsigned int C_internal_cols)
 
 1232   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
 
 1233   vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
 
 1234   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 1236   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 1238   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
 1239   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 1241   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1242   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1247     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 1248     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 1250     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1251     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1252       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1253       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1254       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1255       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1256       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1257       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1258       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1259       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1260       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1261       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1262       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1263       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1264       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1265       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1266       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1267       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1272   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 1273     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 1278 template<
typename NumericT>
 
 1282           unsigned int A_row_start,
 
 1283           unsigned int A_col_start,
 
 1284           unsigned int A_row_inc,
 
 1285           unsigned int A_col_inc,
 
 1286           unsigned int A_row_size,
 
 1287           unsigned int A_col_size,
 
 1288           unsigned int A_internal_rows,
 
 1289           unsigned int A_internal_cols,
 
 1291           unsigned int B_row_start,
 
 1292           unsigned int B_col_start,
 
 1293           unsigned int B_row_inc,
 
 1294           unsigned int B_col_inc,
 
 1295           unsigned int B_row_size,
 
 1296           unsigned int B_col_size,
 
 1297           unsigned int B_internal_rows,
 
 1298           unsigned int B_internal_cols,
 
 1301           unsigned int C_row_start,
 
 1302           unsigned int C_col_start,
 
 1303           unsigned int C_row_inc,
 
 1304           unsigned int C_col_inc,
 
 1305           unsigned int C_row_size,
 
 1306           unsigned int C_col_size,
 
 1307           unsigned int C_internal_rows,
 
 1308           unsigned int C_internal_cols)
 
 1319   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
 1321   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
 1322   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
 1323   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 1325   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
 1326   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 1328   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1329   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1334     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 1335     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 1337     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1338     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1339       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1340       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1341       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1342       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1343       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1344       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1345       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1346       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1347       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1348       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1349       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1350       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1351       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1352       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1353       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1354       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1359   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 1360     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 1365 template<
typename NumericT>
 
 1369           unsigned int A_row_start,
 
 1370           unsigned int A_col_start,
 
 1371           unsigned int A_row_inc,
 
 1372           unsigned int A_col_inc,
 
 1373           unsigned int A_row_size,
 
 1374           unsigned int A_col_size,
 
 1375           unsigned int A_internal_rows,
 
 1376           unsigned int A_internal_cols,
 
 1378           unsigned int B_row_start,
 
 1379           unsigned int B_col_start,
 
 1380           unsigned int B_row_inc,
 
 1381           unsigned int B_col_inc,
 
 1382           unsigned int B_row_size,
 
 1383           unsigned int B_col_size,
 
 1384           unsigned int B_internal_rows,
 
 1385           unsigned int B_internal_cols,
 
 1388           unsigned int C_row_start,
 
 1389           unsigned int C_col_start,
 
 1390           unsigned int C_row_inc,
 
 1391           unsigned int C_col_inc,
 
 1392           unsigned int C_row_size,
 
 1393           unsigned int C_col_size,
 
 1394           unsigned int C_internal_rows,
 
 1395           unsigned int C_internal_cols)
 
 1406   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
 
 1408   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 1410   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 1412   vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
 
 1413   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 1415   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1416   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1421     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 1422     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 1424     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1425     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1426       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1427       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1428       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1429       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1430       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1431       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1432       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1433       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1434       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1435       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1436       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1437       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1438       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1439       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1440       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1441       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1446   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 1447     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 1463 template<
typename NumericT>
 
 1467           unsigned int A_row_start,
 
 1468           unsigned int A_col_start,
 
 1469           unsigned int A_row_inc,
 
 1470           unsigned int A_col_inc,
 
 1471           unsigned int A_row_size,
 
 1472           unsigned int A_col_size,
 
 1473           unsigned int A_internal_rows,
 
 1474           unsigned int A_internal_cols,
 
 1476           unsigned int B_row_start,
 
 1477           unsigned int B_col_start,
 
 1478           unsigned int B_row_inc,
 
 1479           unsigned int B_col_inc,
 
 1480           unsigned int B_row_size,
 
 1481           unsigned int B_col_size,
 
 1482           unsigned int B_internal_rows,
 
 1483           unsigned int B_internal_cols,
 
 1486           unsigned int C_row_start,
 
 1487           unsigned int C_col_start,
 
 1488           unsigned int C_row_inc,
 
 1489           unsigned int C_col_inc,
 
 1490           unsigned int C_row_size,
 
 1491           unsigned int C_col_size,
 
 1492           unsigned int C_internal_rows,
 
 1493           unsigned int C_internal_cols)
 
 1504   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 1506   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
 1508   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 1510   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 1511   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 1513   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1514   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1519     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 1520     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 1522     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1523     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1524       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1525       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1526       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1527       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1528       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1529       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1530       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1531       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1532       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1533       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1534       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1535       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1536       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1537       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1538       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1539       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1544   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 1545     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 1550 template<
typename NumericT>
 
 1554           unsigned int A_row_start,
 
 1555           unsigned int A_col_start,
 
 1556           unsigned int A_row_inc,
 
 1557           unsigned int A_col_inc,
 
 1558           unsigned int A_row_size,
 
 1559           unsigned int A_col_size,
 
 1560           unsigned int A_internal_rows,
 
 1561           unsigned int A_internal_cols,
 
 1563           unsigned int B_row_start,
 
 1564           unsigned int B_col_start,
 
 1565           unsigned int B_row_inc,
 
 1566           unsigned int B_col_inc,
 
 1567           unsigned int B_row_size,
 
 1568           unsigned int B_col_size,
 
 1569           unsigned int B_internal_rows,
 
 1570           unsigned int B_internal_cols,
 
 1573           unsigned int C_row_start,
 
 1574           unsigned int C_col_start,
 
 1575           unsigned int C_row_inc,
 
 1576           unsigned int C_col_inc,
 
 1577           unsigned int C_row_size,
 
 1578           unsigned int C_col_size,
 
 1579           unsigned int C_internal_rows,
 
 1580           unsigned int C_internal_cols)
 
 1591   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 1593   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
 1594   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
 1595   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 1597   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 1598   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 1600   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1601   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1606     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 1607     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 1609     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1610     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1611       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1612       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1613       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1614       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1615       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1616       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1617       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1618       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1619       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1620       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1621       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1622       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1623       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1624       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1625       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1626       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1631   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 1632     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 1637 template<
typename NumericT>
 
 1641           unsigned int A_row_start,
 
 1642           unsigned int A_col_start,
 
 1643           unsigned int A_row_inc,
 
 1644           unsigned int A_col_inc,
 
 1645           unsigned int A_row_size,
 
 1646           unsigned int A_col_size,
 
 1647           unsigned int A_internal_rows,
 
 1648           unsigned int A_internal_cols,
 
 1650           unsigned int B_row_start,
 
 1651           unsigned int B_col_start,
 
 1652           unsigned int B_row_inc,
 
 1653           unsigned int B_col_inc,
 
 1654           unsigned int B_row_size,
 
 1655           unsigned int B_col_size,
 
 1656           unsigned int B_internal_rows,
 
 1657           unsigned int B_internal_cols,
 
 1660           unsigned int C_row_start,
 
 1661           unsigned int C_col_start,
 
 1662           unsigned int C_row_inc,
 
 1663           unsigned int C_col_inc,
 
 1664           unsigned int C_row_size,
 
 1665           unsigned int C_col_size,
 
 1666           unsigned int C_internal_rows,
 
 1667           unsigned int C_internal_cols)
 
 1678   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 1679   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 1680   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
 1682   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 1684   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 1685   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 1687   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1688   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1693     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 1694     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 1696     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1697     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1698       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1699       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1700       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1701       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1702       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1703       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1704       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1705       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1706       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1707       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1708       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1709       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1710       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1711       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1712       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1713       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1718   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 1719     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 1724 template<
typename NumericT>
 
 1728           unsigned int A_row_start,
 
 1729           unsigned int A_col_start,
 
 1730           unsigned int A_row_inc,
 
 1731           unsigned int A_col_inc,
 
 1732           unsigned int A_row_size,
 
 1733           unsigned int A_col_size,
 
 1734           unsigned int A_internal_rows,
 
 1735           unsigned int A_internal_cols,
 
 1737           unsigned int B_row_start,
 
 1738           unsigned int B_col_start,
 
 1739           unsigned int B_row_inc,
 
 1740           unsigned int B_col_inc,
 
 1741           unsigned int B_row_size,
 
 1742           unsigned int B_col_size,
 
 1743           unsigned int B_internal_rows,
 
 1744           unsigned int B_internal_cols,
 
 1747           unsigned int C_row_start,
 
 1748           unsigned int C_col_start,
 
 1749           unsigned int C_row_inc,
 
 1750           unsigned int C_col_inc,
 
 1751           unsigned int C_row_size,
 
 1752           unsigned int C_col_size,
 
 1753           unsigned int C_internal_rows,
 
 1754           unsigned int C_internal_cols)
 
 1765   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 1766   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 1767   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
 1768   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
 1769   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 1771   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 1772   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 1774   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1775   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1780     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 1781     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 1783     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1784     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1785       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1786       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1787       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1788       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1789       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1790       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1791       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1792       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1793       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1794       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1795       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1796       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1797       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1798       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1799       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1800       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1805   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 1806     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 1819 template<
typename NumericT>
 
 1823           unsigned int A_row_start,
 
 1824           unsigned int A_col_start,
 
 1825           unsigned int A_row_inc,
 
 1826           unsigned int A_col_inc,
 
 1827           unsigned int A_row_size,
 
 1828           unsigned int A_col_size,
 
 1829           unsigned int A_internal_rows,
 
 1830           unsigned int A_internal_cols,
 
 1832           unsigned int B_row_start,
 
 1833           unsigned int B_col_start,
 
 1834           unsigned int B_row_inc,
 
 1835           unsigned int B_col_inc,
 
 1836           unsigned int B_row_size,
 
 1837           unsigned int B_col_size,
 
 1838           unsigned int B_internal_rows,
 
 1839           unsigned int B_internal_cols,
 
 1842           unsigned int C_row_start,
 
 1843           unsigned int C_col_start,
 
 1844           unsigned int C_row_inc,
 
 1845           unsigned int C_col_inc,
 
 1846           unsigned int C_row_size,
 
 1847           unsigned int C_col_size,
 
 1848           unsigned int C_internal_rows,
 
 1849           unsigned int C_internal_cols)
 
 1860   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 1862   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
 1864   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 1866   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 1867   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 1869   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1870   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1875     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 1876     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 1878     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1879     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1880       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1881       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1882       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1883       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1884       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1885       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1886       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1887       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1888       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1889       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1890       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1891       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1892       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1893       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1894       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1895       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1900   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 1901     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 1906 template<
typename NumericT>
 
 1910           unsigned int A_row_start,
 
 1911           unsigned int A_col_start,
 
 1912           unsigned int A_row_inc,
 
 1913           unsigned int A_col_inc,
 
 1914           unsigned int A_row_size,
 
 1915           unsigned int A_col_size,
 
 1916           unsigned int A_internal_rows,
 
 1917           unsigned int A_internal_cols,
 
 1919           unsigned int B_row_start,
 
 1920           unsigned int B_col_start,
 
 1921           unsigned int B_row_inc,
 
 1922           unsigned int B_col_inc,
 
 1923           unsigned int B_row_size,
 
 1924           unsigned int B_col_size,
 
 1925           unsigned int B_internal_rows,
 
 1926           unsigned int B_internal_cols,
 
 1929           unsigned int C_row_start,
 
 1930           unsigned int C_col_start,
 
 1931           unsigned int C_row_inc,
 
 1932           unsigned int C_col_inc,
 
 1933           unsigned int C_row_size,
 
 1934           unsigned int C_col_size,
 
 1935           unsigned int C_internal_rows,
 
 1936           unsigned int C_internal_cols)
 
 1947   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 1949   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
 1950   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
 1951   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 1953   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 1954   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 1956   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 1957   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 1962     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 1963     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 1965     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 1966     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 1967       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1968       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1969       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1970       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1971       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1972       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1973       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1974       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1975       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1976       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1977       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1978       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1979       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1980       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1981       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1982       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 1987   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 1988     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 1993 template<
typename NumericT>
 
 1997           unsigned int A_row_start,
 
 1998           unsigned int A_col_start,
 
 1999           unsigned int A_row_inc,
 
 2000           unsigned int A_col_inc,
 
 2001           unsigned int A_row_size,
 
 2002           unsigned int A_col_size,
 
 2003           unsigned int A_internal_rows,
 
 2004           unsigned int A_internal_cols,
 
 2006           unsigned int B_row_start,
 
 2007           unsigned int B_col_start,
 
 2008           unsigned int B_row_inc,
 
 2009           unsigned int B_col_inc,
 
 2010           unsigned int B_row_size,
 
 2011           unsigned int B_col_size,
 
 2012           unsigned int B_internal_rows,
 
 2013           unsigned int B_internal_cols,
 
 2016           unsigned int C_row_start,
 
 2017           unsigned int C_col_start,
 
 2018           unsigned int C_row_inc,
 
 2019           unsigned int C_col_inc,
 
 2020           unsigned int C_row_size,
 
 2021           unsigned int C_col_size,
 
 2022           unsigned int C_internal_rows,
 
 2023           unsigned int C_internal_cols)
 
 2034   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 2035   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 2036   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
 
 2038   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 2040   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2041   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 2043   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2044   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2049     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 2050     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 2052     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2053     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2054       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2055       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2056       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2057       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2058       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2059       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2060       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2061       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2062       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2063       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2064       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2065       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2066       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2067       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2068       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2069       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2074   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 2075     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 2080 template<
typename NumericT>
 
 2084           unsigned int A_row_start,
 
 2085           unsigned int A_col_start,
 
 2086           unsigned int A_row_inc,
 
 2087           unsigned int A_col_inc,
 
 2088           unsigned int A_row_size,
 
 2089           unsigned int A_col_size,
 
 2090           unsigned int A_internal_rows,
 
 2091           unsigned int A_internal_cols,
 
 2093           unsigned int B_row_start,
 
 2094           unsigned int B_col_start,
 
 2095           unsigned int B_row_inc,
 
 2096           unsigned int B_col_inc,
 
 2097           unsigned int B_row_size,
 
 2098           unsigned int B_col_size,
 
 2099           unsigned int B_internal_rows,
 
 2100           unsigned int B_internal_cols,
 
 2103           unsigned int C_row_start,
 
 2104           unsigned int C_col_start,
 
 2105           unsigned int C_row_inc,
 
 2106           unsigned int C_col_inc,
 
 2107           unsigned int C_row_size,
 
 2108           unsigned int C_col_size,
 
 2109           unsigned int C_internal_rows,
 
 2110           unsigned int C_internal_cols)
 
 2121   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 2122   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 2123   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
 
 2124   vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
 
 2125   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 2127   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2128   vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
 
 2130   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2131   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2136     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 2137     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 2139     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2140     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2141       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2142       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2143       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2144       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2145       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2146       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2147       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2148       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2149       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2150       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2151       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2152       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2153       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2154       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2155       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2156       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2161   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 2162     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 2178 template<
typename NumericT>
 
 2182           unsigned int A_row_start,
 
 2183           unsigned int A_col_start,
 
 2184           unsigned int A_row_inc,
 
 2185           unsigned int A_col_inc,
 
 2186           unsigned int A_row_size,
 
 2187           unsigned int A_col_size,
 
 2188           unsigned int A_internal_rows,
 
 2189           unsigned int A_internal_cols,
 
 2191           unsigned int B_row_start,
 
 2192           unsigned int B_col_start,
 
 2193           unsigned int B_row_inc,
 
 2194           unsigned int B_col_inc,
 
 2195           unsigned int B_row_size,
 
 2196           unsigned int B_col_size,
 
 2197           unsigned int B_internal_rows,
 
 2198           unsigned int B_internal_cols,
 
 2201           unsigned int C_row_start,
 
 2202           unsigned int C_col_start,
 
 2203           unsigned int C_row_inc,
 
 2204           unsigned int C_col_inc,
 
 2205           unsigned int C_row_size,
 
 2206           unsigned int C_col_size,
 
 2207           unsigned int C_internal_rows,
 
 2208           unsigned int C_internal_cols)
 
 2219   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 2221   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
 2222   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
 2223   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 2225   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2226   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2228   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2229   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2234     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 2235     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 2237     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2238     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2239       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2240       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2241       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2242       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2243       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2244       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2245       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2246       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2247       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2248       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2249       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2250       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2251       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2252       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2253       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2254       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2259   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 2260     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 2265 template<
typename NumericT>
 
 2269           unsigned int A_row_start,
 
 2270           unsigned int A_col_start,
 
 2271           unsigned int A_row_inc,
 
 2272           unsigned int A_col_inc,
 
 2273           unsigned int A_row_size,
 
 2274           unsigned int A_col_size,
 
 2275           unsigned int A_internal_rows,
 
 2276           unsigned int A_internal_cols,
 
 2278           unsigned int B_row_start,
 
 2279           unsigned int B_col_start,
 
 2280           unsigned int B_row_inc,
 
 2281           unsigned int B_col_inc,
 
 2282           unsigned int B_row_size,
 
 2283           unsigned int B_col_size,
 
 2284           unsigned int B_internal_rows,
 
 2285           unsigned int B_internal_cols,
 
 2288           unsigned int C_row_start,
 
 2289           unsigned int C_col_start,
 
 2290           unsigned int C_row_inc,
 
 2291           unsigned int C_col_inc,
 
 2292           unsigned int C_row_size,
 
 2293           unsigned int C_col_size,
 
 2294           unsigned int C_internal_rows,
 
 2295           unsigned int C_internal_cols)
 
 2306   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 2308   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 2310   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 2312   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2313   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2315   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2316   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2321     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 2322     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 2324     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2325     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2326       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2327       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2328       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2329       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2330       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2331       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2332       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2333       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2334       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2335       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2336       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2337       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2338       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2339       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2340       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2341       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2346   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 2347     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 2352 template<
typename NumericT>
 
 2356           unsigned int A_row_start,
 
 2357           unsigned int A_col_start,
 
 2358           unsigned int A_row_inc,
 
 2359           unsigned int A_col_inc,
 
 2360           unsigned int A_row_size,
 
 2361           unsigned int A_col_size,
 
 2362           unsigned int A_internal_rows,
 
 2363           unsigned int A_internal_cols,
 
 2365           unsigned int B_row_start,
 
 2366           unsigned int B_col_start,
 
 2367           unsigned int B_row_inc,
 
 2368           unsigned int B_col_inc,
 
 2369           unsigned int B_row_size,
 
 2370           unsigned int B_col_size,
 
 2371           unsigned int B_internal_rows,
 
 2372           unsigned int B_internal_cols,
 
 2375           unsigned int C_row_start,
 
 2376           unsigned int C_col_start,
 
 2377           unsigned int C_row_inc,
 
 2378           unsigned int C_col_inc,
 
 2379           unsigned int C_row_size,
 
 2380           unsigned int C_col_size,
 
 2381           unsigned int C_internal_rows,
 
 2382           unsigned int C_internal_cols)
 
 2393   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 2394   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 2395   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
 2396   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
 2397   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 2399   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2400   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2402   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2403   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2408     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 2409     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 2411     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2412     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2413       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2414       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2415       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2416       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2417       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2418       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2419       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2420       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2421       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2422       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2423       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2424       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2425       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2426       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2427       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2428       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2433   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 2434     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 2439 template<
typename NumericT>
 
 2443           unsigned int A_row_start,
 
 2444           unsigned int A_col_start,
 
 2445           unsigned int A_row_inc,
 
 2446           unsigned int A_col_inc,
 
 2447           unsigned int A_row_size,
 
 2448           unsigned int A_col_size,
 
 2449           unsigned int A_internal_rows,
 
 2450           unsigned int A_internal_cols,
 
 2452           unsigned int B_row_start,
 
 2453           unsigned int B_col_start,
 
 2454           unsigned int B_row_inc,
 
 2455           unsigned int B_col_inc,
 
 2456           unsigned int B_row_size,
 
 2457           unsigned int B_col_size,
 
 2458           unsigned int B_internal_rows,
 
 2459           unsigned int B_internal_cols,
 
 2462           unsigned int C_row_start,
 
 2463           unsigned int C_col_start,
 
 2464           unsigned int C_row_inc,
 
 2465           unsigned int C_col_inc,
 
 2466           unsigned int C_row_size,
 
 2467           unsigned int C_col_size,
 
 2468           unsigned int C_internal_rows,
 
 2469           unsigned int C_internal_cols)
 
 2480   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 2481   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 2482   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 2484   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 2486   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2487   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2489   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2490   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2495     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 2496     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 2498     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2499     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2500       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2501       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2502       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2503       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2504       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2505       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2506       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2507       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2508       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2509       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2510       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2511       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2512       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2513       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2514       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2515       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2520   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 2521     C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
 
 2535 template<
typename NumericT>
 
 2539           unsigned int A_row_start,
 
 2540           unsigned int A_col_start,
 
 2541           unsigned int A_row_inc,
 
 2542           unsigned int A_col_inc,
 
 2543           unsigned int A_row_size,
 
 2544           unsigned int A_col_size,
 
 2545           unsigned int A_internal_rows,
 
 2546           unsigned int A_internal_cols,
 
 2548           unsigned int B_row_start,
 
 2549           unsigned int B_col_start,
 
 2550           unsigned int B_row_inc,
 
 2551           unsigned int B_col_inc,
 
 2552           unsigned int B_row_size,
 
 2553           unsigned int B_col_size,
 
 2554           unsigned int B_internal_rows,
 
 2555           unsigned int B_internal_cols,
 
 2558           unsigned int C_row_start,
 
 2559           unsigned int C_col_start,
 
 2560           unsigned int C_row_inc,
 
 2561           unsigned int C_col_inc,
 
 2562           unsigned int C_row_size,
 
 2563           unsigned int C_col_size,
 
 2564           unsigned int C_internal_rows,
 
 2565           unsigned int C_internal_cols)
 
 2576   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 2578   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
 2579   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
 2580   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 2582   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2583   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2585   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2586   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2591     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 2592     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 2594     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2595     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2596       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2597       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2598       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2599       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2600       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2601       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2602       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2603       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2604       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2605       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2606       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2607       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2608       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2609       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2610       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2611       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2616   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 2617     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 2622 template<
typename NumericT>
 
 2626           unsigned int A_row_start,
 
 2627           unsigned int A_col_start,
 
 2628           unsigned int A_row_inc,
 
 2629           unsigned int A_col_inc,
 
 2630           unsigned int A_row_size,
 
 2631           unsigned int A_col_size,
 
 2632           unsigned int A_internal_rows,
 
 2633           unsigned int A_internal_cols,
 
 2635           unsigned int B_row_start,
 
 2636           unsigned int B_col_start,
 
 2637           unsigned int B_row_inc,
 
 2638           unsigned int B_col_inc,
 
 2639           unsigned int B_row_size,
 
 2640           unsigned int B_col_size,
 
 2641           unsigned int B_internal_rows,
 
 2642           unsigned int B_internal_cols,
 
 2645           unsigned int C_row_start,
 
 2646           unsigned int C_col_start,
 
 2647           unsigned int C_row_inc,
 
 2648           unsigned int C_col_inc,
 
 2649           unsigned int C_row_size,
 
 2650           unsigned int C_col_size,
 
 2651           unsigned int C_internal_rows,
 
 2652           unsigned int C_internal_cols)
 
 2663   vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
 
 2665   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 2667   vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
 
 2669   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2670   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2672   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2673   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2678     bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
 
 2679     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 2681     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2682     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2683       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2684       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2685       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2686       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2687       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2688       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2689       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2690       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2691       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2692       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2693       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2694       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2695       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2696       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2697       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2698       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2703   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 2704     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 2709 template<
typename NumericT>
 
 2713           unsigned int A_row_start,
 
 2714           unsigned int A_col_start,
 
 2715           unsigned int A_row_inc,
 
 2716           unsigned int A_col_inc,
 
 2717           unsigned int A_row_size,
 
 2718           unsigned int A_col_size,
 
 2719           unsigned int A_internal_rows,
 
 2720           unsigned int A_internal_cols,
 
 2722           unsigned int B_row_start,
 
 2723           unsigned int B_col_start,
 
 2724           unsigned int B_row_inc,
 
 2725           unsigned int B_col_inc,
 
 2726           unsigned int B_row_size,
 
 2727           unsigned int B_col_size,
 
 2728           unsigned int B_internal_rows,
 
 2729           unsigned int B_internal_cols,
 
 2732           unsigned int C_row_start,
 
 2733           unsigned int C_col_start,
 
 2734           unsigned int C_row_inc,
 
 2735           unsigned int C_col_inc,
 
 2736           unsigned int C_row_size,
 
 2737           unsigned int C_col_size,
 
 2738           unsigned int C_internal_rows,
 
 2739           unsigned int C_internal_cols)
 
 2750   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 2751   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 2752   vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
 
 2753   vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
 
 2754   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 2756   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2757   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2759   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2760   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2765     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 2766     bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
 
 2768     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2769     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2770       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2771       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2772       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2773       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2774       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2775       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2776       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2777       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2778       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2779       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2780       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2781       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2782       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2783       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2784       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2785       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2790   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
 
 2791     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
 2796 template<
typename NumericT>
 
 2800           unsigned int A_row_start,
 
 2801           unsigned int A_col_start,
 
 2802           unsigned int A_row_inc,
 
 2803           unsigned int A_col_inc,
 
 2804           unsigned int A_row_size,
 
 2805           unsigned int A_col_size,
 
 2806           unsigned int A_internal_rows,
 
 2807           unsigned int A_internal_cols,
 
 2809           unsigned int B_row_start,
 
 2810           unsigned int B_col_start,
 
 2811           unsigned int B_row_inc,
 
 2812           unsigned int B_col_inc,
 
 2813           unsigned int B_row_size,
 
 2814           unsigned int B_col_size,
 
 2815           unsigned int B_internal_rows,
 
 2816           unsigned int B_internal_cols,
 
 2819           unsigned int C_row_start,
 
 2820           unsigned int C_col_start,
 
 2821           unsigned int C_row_inc,
 
 2822           unsigned int C_col_inc,
 
 2823           unsigned int C_row_size,
 
 2824           unsigned int C_col_size,
 
 2825           unsigned int C_internal_rows,
 
 2826           unsigned int C_internal_cols)
 
 2837   vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
 
 2838   vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
 
 2839   vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
 
 2841   vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
 
 2843   vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
 
 2844   vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
 
 2846   vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
 
 2847   vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
 
 2852     bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
 
 2853     bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
 
 2855     NumericT * bufAptr = bufA + row_thread_id_times_block_size;
 
 2856     NumericT * bufBptr = bufB + col_thread_id_times_block_size;
 
 2857       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2858       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2859       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2860       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2861       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2862       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2863       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2864       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2865       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2866       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2867       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2868       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2869       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2870       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2871       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2872       Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
 
 2877   if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
 
 2878     C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
 
__global__ void matrix_matrix_col_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
This file provides the forward declarations for the main types used within ViennaCL. 
__global__ void matrix_matrix_col_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)