1 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_ 
   53   template<
typename NumericT>
 
   55             const unsigned int * row_indices,
 
   56             const unsigned int * column_indices,
 
   62     for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
   64                       row += gridDim.x * blockDim.x)
 
   67       unsigned int row_end = row_indices[
row+1];
 
   72           for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
 
   73             value = 
max(value, fabs(elements[i]));
 
   77           for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
 
   78             value += fabs(elements[i]);
 
   82           for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
 
   83             value += elements[i] * elements[i];
 
   88           for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
 
   90             if (column_indices[i] == 
row)
 
  106   template<
typename NumericT, 
unsigned int AligmentV>
 
  111     csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  112                                                 viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  113                                                 viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  115                                                 static_cast<unsigned int>(mat.
size1()),
 
  116                                                 static_cast<unsigned int>(info_selector)
 
  123     template<
typename NumericT>
 
  129     template<
typename NumericT>
 
  137 template<
unsigned int SubWarpSizeV, 
typename AlphaBetaHandlerT, 
typename NumericT>
 
  139           const unsigned int * row_indices,
 
  140           const unsigned int * column_indices,
 
  143           unsigned int start_x,
 
  147           unsigned int start_result,
 
  148           unsigned int inc_result,
 
  149           unsigned int size_result,
 
  152   __shared__ 
NumericT shared_elements[512];
 
  154   const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
 
  155   const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
 
  156   const unsigned int block_start = blockIdx.x * block_increment;
 
  157   const unsigned int block_stop  = 
min(block_start + block_increment, size_result);
 
  159   for (
unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
 
  161                     row += blockDim.x / SubWarpSizeV)
 
  164     unsigned int row_end = row_indices[
row+1];
 
  165     for (
unsigned int i = row_indices[
row] + id_in_row; i < row_end; i += SubWarpSizeV)
 
  166       dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
 
  168     shared_elements[threadIdx.x] = 
dot_prod;
 
  169     if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
 
  170     if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
 
  171     if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
 
  172     if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
 
  173     if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
 
  176       AlphaBetaHandlerT::apply(result[
row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
 
  181 template<
typename AlphaBetaHandlerT, 
typename NumericT>
 
  183           const unsigned int * row_indices,
 
  184           const unsigned int * column_indices,
 
  185           const unsigned int * row_blocks,
 
  187           unsigned int num_blocks,
 
  189           unsigned int start_x,
 
  193           unsigned int start_result,
 
  194           unsigned int inc_result,
 
  195           unsigned int size_result,
 
  198   __shared__ 
NumericT     shared_elements[1024];
 
  200   for (
unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
 
  202     unsigned int row_start = row_blocks[block_id];
 
  203     unsigned int row_stop  = row_blocks[block_id + 1];
 
  204     unsigned int element_start = row_indices[row_start];
 
  205     unsigned int element_stop = row_indices[row_stop];
 
  206     unsigned int rows_to_process = row_stop - row_start;
 
  208     if (rows_to_process > 1)  
 
  211       for (
unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
 
  212         shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];
 
  217       for (
unsigned int row = row_start + threadIdx.x; 
row < row_stop; 
row += blockDim.x)
 
  220         unsigned int thread_row_start = row_indices[
row]     - element_start;
 
  221         unsigned int thread_row_stop  = row_indices[
row + 1] - element_start;
 
  222         for (
unsigned int i = thread_row_start; i < thread_row_stop; ++i)
 
  223           dot_prod += shared_elements[i];
 
  224         AlphaBetaHandlerT::apply(result[
row * inc_result + start_result], alpha, dot_prod, beta);
 
  231       shared_elements[threadIdx.x] = 0;
 
  232       for (
unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
 
  233         shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];
 
  240           shared_elements[threadIdx.x] += shared_elements[threadIdx.x+
stride];
 
  243       if (threadIdx.x == 0)
 
  244         AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
 
  262 template<
class NumericT, 
unsigned int AlignmentV>
 
  269   static bool first = 
true;
 
  270   static bool is_maxwell = 
false;
 
  280     int device_index = 0;
 
  282     cudaError_t err_flag = cudaGetDevice(&device_index);
 
  283     if (err_flag == cudaSuccess)
 
  285       err_flag = cudaGetDeviceProperties(&prop, device_index);
 
  286       if (err_flag == cudaSuccess && prop.major >= 5)
 
  292   if (is_maxwell && 
double(mat.
nnz()) / 
double(mat.
size1()) > 6.4) 
 
  295       compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(   
 
  296                                                                     viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  297                                                                     viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  298                                                                     viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  300                                                                     static_cast<unsigned int>(vec.
start()),
 
  301                                                                     static_cast<unsigned int>(vec.
stride()),
 
  304                                                                     static_cast<unsigned int>(result.
start()),
 
  305                                                                     static_cast<unsigned int>(result.
stride()),
 
  306                                                                     static_cast<unsigned int>(result.
size()),
 
  310       compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>(   
 
  311                                                                     viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  312                                                                     viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  313                                                                     viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  315                                                                     static_cast<unsigned int>(vec.
start()),
 
  316                                                                     static_cast<unsigned int>(vec.
stride()),
 
  319                                                                     static_cast<unsigned int>(result.
start()),
 
  320                                                                     static_cast<unsigned int>(result.
stride()),
 
  321                                                                     static_cast<unsigned int>(result.
size()),
 
  326   else if (!is_maxwell && 
double(mat.
nnz()) / 
double(mat.
size1()) > 12.0) 
 
  329       compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(   
 
  330                                                                    viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  331                                                                    viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  332                                                                    viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  334                                                                    static_cast<unsigned int>(vec.
start()),
 
  335                                                                    static_cast<unsigned int>(vec.
stride()),
 
  338                                                                    static_cast<unsigned int>(result.
start()),
 
  339                                                                    static_cast<unsigned int>(result.
stride()),
 
  340                                                                    static_cast<unsigned int>(result.
size()),
 
  344       compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>(   
 
  345                                                                    viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  346                                                                    viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  347                                                                    viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  349                                                                    static_cast<unsigned int>(vec.
start()),
 
  350                                                                    static_cast<unsigned int>(vec.
stride()),
 
  353                                                                    static_cast<unsigned int>(result.
start()),
 
  354                                                                    static_cast<unsigned int>(result.
stride()),
 
  355                                                                    static_cast<unsigned int>(result.
size()),
 
  363       compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  364                                                               viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  365                                                               viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
  366                                                               viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  367                                                               static_cast<unsigned int>(mat.
blocks1()),
 
  369                                                               static_cast<unsigned int>(vec.
start()),
 
  370                                                               static_cast<unsigned int>(vec.
stride()),
 
  373                                                               static_cast<unsigned int>(result.
start()),
 
  374                                                               static_cast<unsigned int>(result.
stride()),
 
  375                                                               static_cast<unsigned int>(result.
size()),
 
  379       compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
  380                                                               viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
  381                                                               viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
  382                                                               viennacl::cuda_arg<NumericT>(mat.
handle()),
 
  383                                                               static_cast<unsigned int>(mat.
blocks1()),
 
  385                                                               static_cast<unsigned int>(vec.
start()),
 
  386                                                               static_cast<unsigned int>(vec.
stride()),
 
  389                                                               static_cast<unsigned int>(result.
start()),
 
  390                                                               static_cast<unsigned int>(result.
stride()),
 
  391                                                               static_cast<unsigned int>(result.
size()),
 
  402 template<
typename LayoutT>
 
  405   static __device__ 
unsigned int apply(
unsigned int i, 
unsigned int j,
 
  406                                 unsigned int row_start, 
unsigned int row_inc,
 
  407                                 unsigned int col_start, 
unsigned int col_inc,
 
  408                                 unsigned int internal_rows, 
unsigned int internal_cols)
 
  410     return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
 
  418   static __device__ 
unsigned int apply(
unsigned int i, 
unsigned int j,
 
  419                                 unsigned int row_start, 
unsigned int row_inc,
 
  420                                 unsigned int col_start, 
unsigned int col_inc,
 
  421                                 unsigned int internal_rows, 
unsigned int internal_cols)
 
  423     return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
 
  429 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
  431           const unsigned int * sp_mat_row_indices,
 
  432           const unsigned int * sp_mat_col_indices,
 
  435           unsigned int d_mat_row_start,
 
  436           unsigned int d_mat_col_start,
 
  437           unsigned int d_mat_row_inc,
 
  438           unsigned int d_mat_col_inc,
 
  439           unsigned int d_mat_row_size,
 
  440           unsigned int d_mat_col_size,
 
  441           unsigned int d_mat_internal_rows,
 
  442           unsigned int d_mat_internal_cols,
 
  444           unsigned int result_row_start,
 
  445           unsigned int result_col_start,
 
  446           unsigned int result_row_inc,
 
  447           unsigned int result_col_inc,
 
  448           unsigned int result_row_size,
 
  449           unsigned int result_col_size,
 
  450           unsigned int result_internal_rows,
 
  451           unsigned int result_internal_cols)
 
  453   for (
unsigned int row  = blockIdx.x; 
row  < result_row_size; 
row += gridDim.x)
 
  455     unsigned int row_start = sp_mat_row_indices[
row];
 
  456     unsigned int row_end = sp_mat_row_indices[
row+1];
 
  458     for ( 
unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
 
  462       for (
unsigned int k = row_start; k < row_end; k++)
 
  464         unsigned int j = sp_mat_col_indices[k];
 
  466         NumericT y = d_mat[ DMatIndexT::apply(j, col,
 
  467                                               d_mat_row_start, d_mat_row_inc,
 
  468                                               d_mat_col_start, d_mat_col_inc,
 
  469                                               d_mat_internal_rows, d_mat_internal_cols) ];
 
  474       result[ResultIndexT::apply(
row, col,
 
  475                                  result_row_start, result_row_inc,
 
  476                                  result_col_start, result_col_inc,
 
  477                                  result_internal_rows, result_internal_cols)] = r;
 
  491 template<
typename NumericT, 
unsigned int AlignmentV>
 
  499                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  500                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  501                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  520                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  521                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  522                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  541                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  542                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  543                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  562                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  563                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  564                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  583 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
  585           const unsigned int * sp_mat_row_indices,
 
  586           const unsigned int * sp_mat_col_indices,
 
  589           unsigned int d_mat_row_start,
 
  590           unsigned int d_mat_col_start,
 
  591           unsigned int d_mat_row_inc,
 
  592           unsigned int d_mat_col_inc,
 
  593           unsigned int d_mat_row_size,
 
  594           unsigned int d_mat_col_size,
 
  595           unsigned int d_mat_internal_rows,
 
  596           unsigned int d_mat_internal_cols,
 
  598           unsigned int result_row_start,
 
  599           unsigned int result_col_start,
 
  600           unsigned int result_row_inc,
 
  601           unsigned int result_col_inc,
 
  602           unsigned int result_row_size,
 
  603           unsigned int result_col_size,
 
  604           unsigned int result_internal_rows,
 
  605           unsigned int result_internal_cols)
 
  607   for (
unsigned int row  = blockIdx.x; 
row  < result_row_size; 
row += gridDim.x)
 
  609     unsigned int row_start = sp_mat_row_indices[
row];
 
  610     unsigned int row_end = sp_mat_row_indices[
row+1];
 
  612     for ( 
unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
 
  616       for (
unsigned int k = row_start; k < row_end; k++)
 
  618         unsigned int j = sp_mat_col_indices[k];
 
  620         NumericT y = d_mat[ DMatIndexT::apply(col, j,
 
  621                                               d_mat_row_start, d_mat_row_inc,
 
  622                                               d_mat_col_start, d_mat_col_inc,
 
  623                                               d_mat_internal_rows, d_mat_internal_cols) ];
 
  628       result [ ResultIndexT::apply(
row, col,
 
  629                                    result_row_start, result_row_inc,
 
  630                                    result_col_start, result_col_inc,
 
  631                                    result_internal_rows, result_internal_cols) ] = r;
 
  646 template<
typename NumericT, 
unsigned int AlignmentV>
 
  654   if (d_mat.lhs().row_major() && result.
row_major())
 
  657                                                 (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  658                                                  viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  659                                                  viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  675   else if (d_mat.lhs().row_major() && !result.
row_major())
 
  678                                                 (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  679                                                  viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  680                                                  viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  696   else if (!d_mat.lhs().row_major() && result.
row_major())
 
  699                                                 (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  700                                                  viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  701                                                  viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  720                                                 (viennacl::cuda_arg<unsigned int>(sp_mat.
handle1()),
 
  721                                                  viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
  722                                                  viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
  745 template<
typename NumericT>
 
  747           const unsigned int * row_indices,
 
  748           const unsigned int * column_indices,
 
  753   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  755                     row += gridDim.x * blockDim.x)
 
  758     unsigned int row_end = row_indices[
row+1];
 
  759     for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
 
  761       unsigned int col_index = column_indices[i];
 
  762       if (col_index == 
row)
 
  778 template<
typename SparseMatrixT, 
typename NumericT>
 
  784   csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
 
  785                                          viennacl::cuda_arg<unsigned int>(mat.handle2()),
 
  786                                          viennacl::cuda_arg<NumericT>(mat.handle()),
 
  788                                          static_cast<unsigned int>(mat.size1())
 
  799 template<
typename SparseMatrixT, 
typename NumericT>
 
  805   csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
 
  806                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
 
  807                                     viennacl::cuda_arg<NumericT>(mat.handle()),
 
  809                                     static_cast<unsigned int>(mat.size1())
 
  821 template<
typename SparseMatrixT, 
typename NumericT>
 
  827   csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
 
  828                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
 
  829                                     viennacl::cuda_arg<NumericT>(mat.handle()),
 
  831                                     static_cast<unsigned int>(mat.size1())
 
  842 template<
typename SparseMatrixT, 
typename NumericT>
 
  848   csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
 
  849                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
 
  850                                     viennacl::cuda_arg<NumericT>(mat.handle()),
 
  852                                     static_cast<unsigned int>(mat.size1())
 
  866 template<
typename SparseMatrixT, 
typename NumericT>
 
  872   csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
lhs().handle1()),
 
  873                                           viennacl::cuda_arg<unsigned int>(mat.
lhs().handle2()),
 
  874                                           viennacl::cuda_arg<NumericT>(mat.
lhs().handle()),
 
  876                                           static_cast<unsigned int>(mat.
lhs().size1())
 
  887 template<
typename SparseMatrixT, 
typename NumericT>
 
  895   compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
lhs().handle1()),
 
  896                                                 viennacl::cuda_arg<unsigned int>(mat.
lhs().handle2()),
 
  897                                                 viennacl::cuda_arg<NumericT>(mat.
lhs().handle()),
 
  899                                                 static_cast<unsigned int>(mat.
size1())
 
  902   csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
lhs().handle1()),
 
  903                                           viennacl::cuda_arg<unsigned int>(mat.
lhs().handle2()),
 
  904                                           viennacl::cuda_arg<NumericT>(mat.
lhs().handle()),
 
  907                                           static_cast<unsigned int>(mat.
lhs().size1())
 
  918 template<
typename SparseMatrixT, 
typename NumericT>
 
  924   csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
lhs().handle1()),
 
  925                                                 viennacl::cuda_arg<unsigned int>(mat.
lhs().handle2()),
 
  926                                                 viennacl::cuda_arg<NumericT>(mat.
lhs().handle()),
 
  928                                                 static_cast<unsigned int>(mat.
lhs().size1())
 
  939 template<
typename SparseMatrixT, 
typename NumericT>
 
  947   compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
lhs().handle1()),
 
  948                                                 viennacl::cuda_arg<unsigned int>(mat.
lhs().handle2()),
 
  949                                                 viennacl::cuda_arg<NumericT>(mat.
lhs().handle()),
 
  951                                                 static_cast<unsigned int>(mat.
size1())
 
  954   csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
lhs().handle1()),
 
  955                                            viennacl::cuda_arg<unsigned int>(mat.
lhs().handle2()),
 
  956                                            viennacl::cuda_arg<NumericT>(mat.
lhs().handle()),
 
  959                                            static_cast<unsigned int>(mat.
lhs().size1())
 
  969   template<
typename NumericT, 
unsigned int AlignmentV>
 
  978     csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
 
  979                                                          viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
 
  980                                                          viennacl::cuda_arg<NumericT>(L.lhs().handle()),
 
  981                                                          viennacl::cuda_arg<unsigned int>(block_indices),
 
  983                                                          static_cast<unsigned int>(L.lhs().size1())
 
  988   template<
typename NumericT, 
unsigned int AlignmentV>
 
  997     csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
 
  998                                                      viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
 
  999                                                      viennacl::cuda_arg<NumericT>(U.lhs().handle()),
 
 1001                                                      viennacl::cuda_arg<unsigned int>(block_indices),
 
 1003                                                      static_cast<unsigned int>(U.lhs().size1())
 
 1015 template<
typename NumericT>
 
 1017           const unsigned int * row_jumper,
 
 1018           const unsigned int * row_indices,
 
 1019           const unsigned int * column_indices,
 
 1021           unsigned int nonzero_rows,
 
 1023           unsigned int start_x,
 
 1027           unsigned int start_result,
 
 1028           unsigned int inc_result,
 
 1029           unsigned int size_result,
 
 1032   for (
unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
 
 1034                     i += gridDim.x * blockDim.x)
 
 1037     unsigned int row_end = row_jumper[i+1];
 
 1038     for (
unsigned int j = row_jumper[i]; j < row_end; ++j)
 
 1039       dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
 
 1041     unsigned int index = row_indices[i] * inc_result + start_result;
 
 1042     if (beta != 0) result[index] += alpha * 
dot_prod;
 
 1043     else           result[index]  = alpha * 
dot_prod;
 
 1056 template<
typename NumericT>
 
 1063   if (beta < 0 || beta > 0)
 
 1068   compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
 1069                                                             viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 1070                                                             viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 1071                                                             viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 1072                                                             static_cast<unsigned int>(mat.
nnz1()),
 
 1074                                                             static_cast<unsigned int>(vec.
start()),
 
 1075                                                             static_cast<unsigned int>(vec.
stride()),
 
 1078                                                             static_cast<unsigned int>(result.
start()),
 
 1079                                                             static_cast<unsigned int>(result.
stride()),
 
 1080                                                             static_cast<unsigned int>(result.
size()),
 
 1094   template<
typename NumericT>
 
 1097                                           const unsigned int * group_boundaries,
 
 1099                                           unsigned int option)
 
 1101     __shared__ 
unsigned int shared_rows[128];
 
 1102     __shared__ 
NumericT inter_results[128];
 
 1106     unsigned int last_index  = blockDim.x - 1;
 
 1107     unsigned int group_start = group_boundaries[blockIdx.x];
 
 1108     unsigned int group_end   = group_boundaries[blockIdx.x + 1];
 
 1109     unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   
 
 1111     unsigned int local_index = 0;
 
 1113     for (
unsigned int k = 0; k < k_end; ++k)
 
 1115       local_index = group_start + k * blockDim.x + threadIdx.x;
 
 1117       tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
 
 1118       val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
 
 1121       if (threadIdx.x == 0 && k > 0)
 
 1123         if (tmp.x == shared_rows[last_index])
 
 1129               val = 
max(val, fabs(inter_results[last_index]));
 
 1133               val = fabs(val) + inter_results[last_index];
 
 1137               val = sqrt(val * val + inter_results[last_index]);
 
 1151               result[shared_rows[last_index]] = inter_results[last_index];
 
 1155               result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
 
 1164       shared_rows[threadIdx.x] = tmp.x;
 
 1169           inter_results[threadIdx.x] = val;
 
 1172           inter_results[threadIdx.x] = fabs(val);
 
 1175           inter_results[threadIdx.x] = val * val;
 
 1183         NumericT left = (threadIdx.x >= 
stride && tmp.x == shared_rows[threadIdx.x - 
stride]) ? inter_results[threadIdx.x - 
stride] : 0;
 
 1189             inter_results[threadIdx.x] = 
max(inter_results[threadIdx.x], left);
 
 1193             inter_results[threadIdx.x] += left;
 
 1197             inter_results[threadIdx.x] += left;
 
 1207       if (threadIdx.x != last_index &&
 
 1208           shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
 
 1209           inter_results[threadIdx.x] != 0)
 
 1211         result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
 
 1217     if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
 
 1218       result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
 
 1221   template<
typename NumericT, 
unsigned int AlignmentV>
 
 1226     coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle12()),
 
 1227                                          viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 1228                                          viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 1230                                          static_cast<unsigned int>(info_selector)
 
 1238 template<
typename NumericT>
 
 1241                                                  const unsigned int * group_boundaries,
 
 1243                                                  unsigned int start_x,
 
 1247                                                  unsigned int start_result,
 
 1248                                                  unsigned int inc_result,
 
 1251   __shared__ 
unsigned int shared_rows[128];
 
 1252   __shared__ 
NumericT inter_results[128];
 
 1256   unsigned int group_start = group_boundaries[blockIdx.x];
 
 1257   unsigned int group_end   = group_boundaries[blockIdx.x + 1];
 
 1258   unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   
 
 1260   unsigned int local_index = 0;
 
 1262   for (
unsigned int k = 0; k < k_end; ++k)
 
 1264     local_index = group_start + k * blockDim.x + threadIdx.x;
 
 1266     tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
 
 1267     val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
 
 1270     if (threadIdx.x == 0 && k > 0)
 
 1272       if (tmp.x == shared_rows[blockDim.x-1])
 
 1273         val += inter_results[blockDim.x-1];
 
 1275         result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
 
 1277         result[shared_rows[blockDim.x-1] * inc_result + start_result]  = alpha * inter_results[blockDim.x-1];
 
 1282     shared_rows[threadIdx.x] = tmp.x;
 
 1283     inter_results[threadIdx.x] = val;
 
 1289       left = (threadIdx.x >= 
stride && tmp.x == shared_rows[threadIdx.x - 
stride]) ? inter_results[threadIdx.x - 
stride] : 0;
 
 1291       inter_results[threadIdx.x] += left;
 
 1296     if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
 
 1297         shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
 
 1299       if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
 
 1300       else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
 
 1306   if (local_index + 1 == group_end) {
 
 1307     if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
 
 1308     else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
 
 1321 template<
typename NumericT, 
unsigned int AlignmentV>
 
 1328   if (beta < 0 || beta > 0)
 
 1333   coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle12()),
 
 1334                                                 viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 1335                                                 viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 1337                                                 static_cast<unsigned int>(vec.
start()),
 
 1338                                                 static_cast<unsigned int>(vec.
stride()),
 
 1341                                                 static_cast<unsigned int>(result.
start()),
 
 1342                                                 static_cast<unsigned int>(result.
stride()),
 
 1351 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
 1354                                                    const unsigned int * group_boundaries,
 
 1356                                                    unsigned int d_mat_row_start,
 
 1357                                                    unsigned int d_mat_col_start,
 
 1358                                                    unsigned int d_mat_row_inc,
 
 1359                                                    unsigned int d_mat_col_inc,
 
 1360                                                    unsigned int d_mat_row_size,
 
 1361                                                    unsigned int d_mat_col_size,
 
 1362                                                    unsigned int d_mat_internal_rows,
 
 1363                                                    unsigned int d_mat_internal_cols,
 
 1365                                                    unsigned int result_row_start,
 
 1366                                                    unsigned int result_col_start,
 
 1367                                                    unsigned int result_row_inc,
 
 1368                                                    unsigned int result_col_inc,
 
 1369                                                    unsigned int result_row_size,
 
 1370                                                    unsigned int result_col_size,
 
 1371                                                    unsigned int result_internal_rows,
 
 1372                                                    unsigned int result_internal_cols)
 
 1374   __shared__ 
unsigned int shared_rows[128];
 
 1375   __shared__ 
NumericT inter_results[128];
 
 1379   unsigned int group_start = group_boundaries[blockIdx.x];
 
 1380   unsigned int group_end   = group_boundaries[blockIdx.x + 1];
 
 1381   unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   
 
 1383   unsigned int local_index = 0;
 
 1385   for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
 
 1387     for (
unsigned int k = 0; k < k_end; ++k)
 
 1389       local_index = group_start + k * blockDim.x + threadIdx.x;
 
 1391       tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
 
 1392       val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
 
 1393                                                                                         d_mat_row_start, d_mat_row_inc,
 
 1394                                                                                         d_mat_col_start, d_mat_col_inc,
 
 1395                                                                                         d_mat_internal_rows, d_mat_internal_cols) ] : 0;
 
 1398       if (threadIdx.x == 0 && k > 0)
 
 1400         if (tmp.x == shared_rows[blockDim.x-1])
 
 1401           val += inter_results[blockDim.x-1];
 
 1403           result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
 
 1404                                      result_row_start, result_row_inc,
 
 1405                                      result_col_start, result_col_inc,
 
 1406                                      result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
 
 1411       shared_rows[threadIdx.x] = tmp.x;
 
 1412       inter_results[threadIdx.x] = val;
 
 1418         left = (threadIdx.x >= 
stride && tmp.x == shared_rows[threadIdx.x - 
stride]) ? inter_results[threadIdx.x - 
stride] : 0;
 
 1420         inter_results[threadIdx.x] += left;
 
 1425       if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
 
 1426           shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
 
 1428         result[ResultIndexT::apply(tmp.x, result_col,
 
 1429                                    result_row_start, result_row_inc,
 
 1430                                    result_col_start, result_col_inc,
 
 1431                                    result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
 
 1437     if (local_index + 1 == group_end)
 
 1438       result[ResultIndexT::apply(tmp.x, result_col,
 
 1439                                  result_row_start, result_row_inc,
 
 1440                                  result_col_start, result_col_inc,
 
 1441                                  result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
 
 1454 template<
typename NumericT, 
unsigned int AlignmentV>
 
 1462                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1463                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1464                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1483                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1484                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1485                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1504                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1505                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1506                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1525                                                   (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1526                                                    viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1527                                                    viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1546 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
 1549                                                      const unsigned int * group_boundaries,
 
 1551                                                      unsigned int d_mat_row_start,
 
 1552                                                      unsigned int d_mat_col_start,
 
 1553                                                      unsigned int d_mat_row_inc,
 
 1554                                                      unsigned int d_mat_col_inc,
 
 1555                                                      unsigned int d_mat_row_size,
 
 1556                                                      unsigned int d_mat_col_size,
 
 1557                                                      unsigned int d_mat_internal_rows,
 
 1558                                                      unsigned int d_mat_internal_cols,
 
 1560                                                      unsigned int result_row_start,
 
 1561                                                      unsigned int result_col_start,
 
 1562                                                      unsigned int result_row_inc,
 
 1563                                                      unsigned int result_col_inc,
 
 1564                                                      unsigned int result_row_size,
 
 1565                                                      unsigned int result_col_size,
 
 1566                                                      unsigned int result_internal_rows,
 
 1567                                                      unsigned int result_internal_cols)
 
 1569   __shared__ 
unsigned int shared_rows[128];
 
 1570   __shared__ 
NumericT inter_results[128];
 
 1574   unsigned int group_start = group_boundaries[blockIdx.x];
 
 1575   unsigned int group_end   = group_boundaries[blockIdx.x + 1];
 
 1576   unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   
 
 1578   unsigned int local_index = 0;
 
 1580   for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
 
 1582     for (
unsigned int k = 0; k < k_end; ++k)
 
 1584       local_index = group_start + k * blockDim.x + threadIdx.x;
 
 1586       tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
 
 1587       val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
 
 1588                                                                                         d_mat_row_start, d_mat_row_inc,
 
 1589                                                                                         d_mat_col_start, d_mat_col_inc,
 
 1590                                                                                         d_mat_internal_rows, d_mat_internal_cols)] : 0;
 
 1593       if (threadIdx.x == 0 && k > 0)
 
 1595         if (tmp.x == shared_rows[blockDim.x-1])
 
 1596           val += inter_results[blockDim.x-1];
 
 1598           result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
 
 1599                                      result_row_start, result_row_inc,
 
 1600                                      result_col_start, result_col_inc,
 
 1601                                      result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
 
 1606       shared_rows[threadIdx.x] = tmp.x;
 
 1607       inter_results[threadIdx.x] = val;
 
 1613         left = (threadIdx.x >= 
stride && tmp.x == shared_rows[threadIdx.x - 
stride]) ? inter_results[threadIdx.x - 
stride] : 0;
 
 1615         inter_results[threadIdx.x] += left;
 
 1620       if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
 
 1621           shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
 
 1623         result[ ResultIndexT::apply(tmp.x, result_col,
 
 1624                                     result_row_start, result_row_inc,
 
 1625                                     result_col_start, result_col_inc,
 
 1626                                     result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
 
 1632     if (local_index + 1 == group_end)
 
 1633       result[ ResultIndexT::apply(tmp.x, result_col,
 
 1634                                   result_row_start, result_row_inc,
 
 1635                                   result_col_start, result_col_inc,
 
 1636                                   result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
 
 1648 template<
typename NumericT, 
unsigned int AlignmentV>
 
 1655   if (d_mat.lhs().row_major() && result.
row_major())
 
 1658                                                     (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1659                                                      viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1660                                                      viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1676   else if (d_mat.lhs().row_major() && !result.
row_major())
 
 1679                                                     (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1680                                                      viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1681                                                      viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1697   else if (!d_mat.lhs().row_major() && result.
row_major())
 
 1700                                                     (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1701                                                      viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1702                                                      viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1721                                                     (viennacl::cuda_arg<unsigned int>(sp_mat.
handle12()),
 
 1722                                                      viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1723                                                      viennacl::cuda_arg<unsigned int>(sp_mat.
handle3()),
 
 1746 template<
typename AlphaBetaHandlerT, 
typename NumericT>
 
 1750                                           unsigned int start_x,
 
 1754                                           unsigned int start_result,
 
 1755                                           unsigned int inc_result,
 
 1757                                           unsigned int row_num,
 
 1758                                           unsigned int col_num,
 
 1759                                           unsigned int internal_row_num,
 
 1760                                           unsigned int items_per_row,
 
 1761                                           unsigned int aligned_items_per_row
 
 1764   unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
 
 1765   unsigned int glb_sz = gridDim.x * blockDim.x;
 
 1767   for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
 
 1771     unsigned int offset = row_id;
 
 1772     for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
 
 1778         int col = coords[offset];
 
 1779         sum += x[col * inc_x + start_x] * val;
 
 1783     AlphaBetaHandlerT::apply(result[row_id * inc_result + start_result], alpha, sum, beta);
 
 1796 template<
typename NumericT, 
unsigned int AlignmentV>
 
 1804     ell_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 1805                                             viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 1807                                             static_cast<unsigned int>(vec.
start()),
 
 1808                                             static_cast<unsigned int>(vec.
stride()),
 
 1811                                             static_cast<unsigned int>(result.
start()),
 
 1812                                             static_cast<unsigned int>(result.
stride()),
 
 1814                                             static_cast<unsigned int>(mat.
size1()),
 
 1815                                             static_cast<unsigned int>(mat.
size2()),
 
 1817                                             static_cast<unsigned int>(mat.
maxnnz()),
 
 1821     ell_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 1822                                             viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 1824                                             static_cast<unsigned int>(vec.
start()),
 
 1825                                             static_cast<unsigned int>(vec.
stride()),
 
 1828                                             static_cast<unsigned int>(result.
start()),
 
 1829                                             static_cast<unsigned int>(result.
stride()),
 
 1831                                             static_cast<unsigned int>(mat.
size1()),
 
 1832                                             static_cast<unsigned int>(mat.
size2()),
 
 1834                                             static_cast<unsigned int>(mat.
maxnnz()),
 
 1840 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
 1843                                             unsigned int sp_mat_row_num,
 
 1844                                             unsigned int sp_mat_col_num,
 
 1845                                             unsigned int sp_mat_internal_row_num,
 
 1846                                             unsigned int sp_mat_items_per_row,
 
 1847                                             unsigned int sp_mat_aligned_items_per_row,
 
 1849                                             unsigned int d_mat_row_start,
 
 1850                                             unsigned int d_mat_col_start,
 
 1851                                             unsigned int d_mat_row_inc,
 
 1852                                             unsigned int d_mat_col_inc,
 
 1853                                             unsigned int d_mat_row_size,
 
 1854                                             unsigned int d_mat_col_size,
 
 1855                                             unsigned int d_mat_internal_rows,
 
 1856                                             unsigned int d_mat_internal_cols,
 
 1858                                             unsigned int result_row_start,
 
 1859                                             unsigned int result_col_start,
 
 1860                                             unsigned int result_row_inc,
 
 1861                                             unsigned int result_col_inc,
 
 1862                                             unsigned int result_row_size,
 
 1863                                             unsigned int result_col_size,
 
 1864                                             unsigned int result_internal_rows,
 
 1865                                             unsigned int result_internal_cols)
 
 1867   unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
 
 1868   unsigned int glb_sz = gridDim.x * blockDim.x;
 
 1870   for ( 
unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
 
 1872     unsigned int row = rc % sp_mat_row_num;
 
 1873     unsigned int col = rc / sp_mat_row_num;
 
 1875     unsigned int offset = 
row;
 
 1878     for (
unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
 
 1880       unsigned int j = sp_mat_coords[offset];
 
 1885         NumericT y = d_mat[ DMatIndexT::apply(j, col,
 
 1886                                               d_mat_row_start, d_mat_row_inc,
 
 1887                                               d_mat_col_start, d_mat_col_inc,
 
 1888                                               d_mat_internal_rows, d_mat_internal_cols) ];
 
 1893     result [ ResultIndexT::apply(row, col,
 
 1894                                  result_row_start, result_row_inc,
 
 1895                                  result_col_start, result_col_inc,
 
 1896                                  result_internal_rows, result_internal_cols) ] = r;
 
 1910 template<
typename NumericT, 
unsigned int AlignmentV>
 
 1918                                            (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 1919                                             viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1920                                             static_cast<unsigned int>(sp_mat.
size1()),
 
 1921                                             static_cast<unsigned int>(sp_mat.
size2()),
 
 1923                                             static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 1942                                            (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 1943                                             viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1944                                             static_cast<unsigned int>(sp_mat.
size1()),
 
 1945                                             static_cast<unsigned int>(sp_mat.
size2()),
 
 1947                                             static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 1966                                            (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 1967                                             viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1968                                             static_cast<unsigned int>(sp_mat.
size1()),
 
 1969                                             static_cast<unsigned int>(sp_mat.
size2()),
 
 1971                                             static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 1990                                            (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 1991                                             viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 1992                                             static_cast<unsigned int>(sp_mat.
size1()),
 
 1993                                             static_cast<unsigned int>(sp_mat.
size2()),
 
 1995                                             static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 2013 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT >
 
 2016                                             unsigned int sp_mat_row_num,
 
 2017                                             unsigned int sp_mat_col_num,
 
 2018                                             unsigned int sp_mat_internal_row_num,
 
 2019                                             unsigned int sp_mat_items_per_row,
 
 2020                                             unsigned int sp_mat_aligned_items_per_row,
 
 2022                                             unsigned int d_mat_row_start,
 
 2023                                             unsigned int d_mat_col_start,
 
 2024                                             unsigned int d_mat_row_inc,
 
 2025                                             unsigned int d_mat_col_inc,
 
 2026                                             unsigned int d_mat_row_size,
 
 2027                                             unsigned int d_mat_col_size,
 
 2028                                             unsigned int d_mat_internal_rows,
 
 2029                                             unsigned int d_mat_internal_cols,
 
 2031                                             unsigned int result_row_start,
 
 2032                                             unsigned int result_col_start,
 
 2033                                             unsigned int result_row_inc,
 
 2034                                             unsigned int result_col_inc,
 
 2035                                             unsigned int result_row_size,
 
 2036                                             unsigned int result_col_size,
 
 2037                                             unsigned int result_internal_rows,
 
 2038                                             unsigned int result_internal_cols)
 
 2040   unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
 
 2041   unsigned int glb_sz = gridDim.x * blockDim.x;
 
 2043   for ( 
unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
 
 2045     unsigned int row = rc % sp_mat_row_num;
 
 2046     unsigned int col = rc / sp_mat_row_num;
 
 2048     unsigned int offset = 
row;
 
 2051     for (
unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
 
 2053       unsigned int j = sp_mat_coords[offset];
 
 2058         NumericT y = d_mat[ DMatIndexT::apply(col, j,
 
 2059                                               d_mat_row_start, d_mat_row_inc,
 
 2060                                               d_mat_col_start, d_mat_col_inc,
 
 2061                                               d_mat_internal_rows, d_mat_internal_cols) ];
 
 2066     result [ ResultIndexT::apply(row, col,
 
 2067                                  result_row_start, result_row_inc,
 
 2068                                  result_col_start, result_col_inc,
 
 2069                                  result_internal_rows, result_internal_cols) ] = r;
 
 2083 template<
typename NumericT, 
unsigned int AlignmentV>
 
 2090   if (d_mat.lhs().row_major() && result.
row_major())
 
 2093                                               (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 2094                                                viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 2095                                                static_cast<unsigned int>(sp_mat.
size1()),
 
 2096                                                static_cast<unsigned int>(sp_mat.
size2()),
 
 2098                                                static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 2115   else if (d_mat.lhs().row_major() && !result.
row_major())
 
 2118                                               (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 2119                                                viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 2120                                                static_cast<unsigned int>(sp_mat.
size1()),
 
 2121                                                static_cast<unsigned int>(sp_mat.
size2()),
 
 2123                                                static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 2140   else if (!d_mat.lhs().row_major() && result.
row_major())
 
 2143                                               (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 2144                                                viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 2145                                                static_cast<unsigned int>(sp_mat.
size1()),
 
 2146                                                static_cast<unsigned int>(sp_mat.
size2()),
 
 2148                                                static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 2168                                               (viennacl::cuda_arg<unsigned int>(sp_mat.
handle2()),
 
 2169                                                viennacl::cuda_arg<NumericT>(sp_mat.
handle()),
 
 2170                                                static_cast<unsigned int>(sp_mat.
size1()),
 
 2171                                                static_cast<unsigned int>(sp_mat.
size2()),
 
 2173                                                static_cast<unsigned int>(sp_mat.
maxnnz()),
 
 2196 template<
typename AlphaBetaHandlerT, 
typename NumericT>
 
 2198                                                  const unsigned int * column_indices,
 
 2199                                                  const unsigned int * block_start,
 
 2202                                                  unsigned int start_x,
 
 2204                                                  unsigned int size_x,
 
 2207                                                  unsigned int start_result,
 
 2208                                                  unsigned int inc_result,
 
 2209                                                  unsigned int size_result,
 
 2211                                                  unsigned int block_size)
 
 2213   unsigned int blocks_per_threadblock = blockDim.x / block_size;
 
 2214   unsigned int id_in_block = threadIdx.x % block_size;
 
 2215   unsigned int num_blocks = (size_result - 1) / block_size + 1;
 
 2216   unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
 
 2217   unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
 
 2219   for (
unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
 
 2221     unsigned int row         = block_idx * block_size + id_in_block;
 
 2222     unsigned int offset      = block_start[block_idx];
 
 2223     unsigned int num_columns = columns_per_block[block_idx];
 
 2226     for (
unsigned int item_id = 0; item_id < num_columns; item_id++)
 
 2228       unsigned int index = offset + item_id * block_size + id_in_block;
 
 2231       sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
 
 2234     if (row < size_result)
 
 2235       AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, sum, beta);
 
 2247 template<
typename NumericT, 
typename IndexT>
 
 2255     sliced_ell_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
 2256                                                    viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2257                                                    viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2258                                                    viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2260                                                    static_cast<unsigned int>(vec.
start()),
 
 2261                                                    static_cast<unsigned int>(vec.
stride()),
 
 2262                                                    static_cast<unsigned int>(vec.
size()),
 
 2265                                                    static_cast<unsigned int>(result.
start()),
 
 2266                                                    static_cast<unsigned int>(result.
stride()),
 
 2267                                                    static_cast<unsigned int>(result.
size()),
 
 2272     sliced_ell_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.
handle1()),
 
 2273                                                    viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2274                                                    viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2275                                                    viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2277                                                    static_cast<unsigned int>(vec.
start()),
 
 2278                                                    static_cast<unsigned int>(vec.
stride()),
 
 2279                                                    static_cast<unsigned int>(vec.
size()),
 
 2282                                                    static_cast<unsigned int>(result.
start()),
 
 2283                                                    static_cast<unsigned int>(result.
stride()),
 
 2284                                                    static_cast<unsigned int>(result.
size()),
 
 2297 template<
typename AlphaBetaHandlerT, 
typename NumericT>
 
 2300                                           const unsigned int * csr_rows,
 
 2301                                           const unsigned int * csr_cols,
 
 2304                                           unsigned int start_x,
 
 2308                                           unsigned int start_result,
 
 2309                                           unsigned int inc_result,
 
 2311                                           unsigned int row_num,
 
 2312                                           unsigned int internal_row_num,
 
 2313                                           unsigned int items_per_row,
 
 2314                                           unsigned int aligned_items_per_row
 
 2317   unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
 
 2318   unsigned int glb_sz = gridDim.x * blockDim.x;
 
 2320   for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
 
 2324     unsigned int offset = row_id;
 
 2325     for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
 
 2327       NumericT val = ell_elements[offset];
 
 2332         int col = ell_coords[offset];
 
 2333         sum += (x[col * inc_x + start_x] * val);
 
 2337     unsigned int col_begin = csr_rows[row_id];
 
 2338     unsigned int col_end   = csr_rows[row_id + 1];
 
 2340     for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
 
 2341       sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
 
 2343     AlphaBetaHandlerT::apply(result[row_id * inc_result + start_result], alpha, sum, beta);
 
 2357 template<
typename NumericT, 
unsigned int AlignmentV>
 
 2365     hyb_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2366                                             viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2367                                             viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2368                                             viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2369                                             viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2371                                             static_cast<unsigned int>(vec.
start()),
 
 2372                                             static_cast<unsigned int>(vec.
stride()),
 
 2375                                             static_cast<unsigned int>(result.
start()),
 
 2376                                             static_cast<unsigned int>(result.
stride()),
 
 2378                                             static_cast<unsigned int>(mat.
size1()),
 
 2380                                             static_cast<unsigned int>(mat.
ell_nnz()),
 
 2384     hyb_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2385                                             viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2386                                             viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2387                                             viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2388                                             viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2390                                             static_cast<unsigned int>(vec.
start()),
 
 2391                                             static_cast<unsigned int>(vec.
stride()),
 
 2394                                             static_cast<unsigned int>(result.
start()),
 
 2395                                             static_cast<unsigned int>(result.
stride()),
 
 2397                                             static_cast<unsigned int>(mat.
size1()),
 
 2399                                             static_cast<unsigned int>(mat.
ell_nnz()),
 
 2407 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
 2410                                           const unsigned int * csr_rows,
 
 2411                                           const unsigned int * csr_cols,
 
 2413                                           unsigned int row_num,
 
 2414                                           unsigned int internal_row_num,
 
 2415                                           unsigned int items_per_row,
 
 2416                                           unsigned int aligned_items_per_row,
 
 2418                                           unsigned int d_mat_row_start,
 
 2419                                           unsigned int d_mat_col_start,
 
 2420                                           unsigned int d_mat_row_inc,
 
 2421                                           unsigned int d_mat_col_inc,
 
 2422                                           unsigned int d_mat_row_size,
 
 2423                                           unsigned int d_mat_col_size,
 
 2424                                           unsigned int d_mat_internal_rows,
 
 2425                                           unsigned int d_mat_internal_cols,
 
 2427                                           unsigned int result_row_start,
 
 2428                                           unsigned int result_col_start,
 
 2429                                           unsigned int result_row_inc,
 
 2430                                           unsigned int result_col_inc,
 
 2431                                           unsigned int result_row_size,
 
 2432                                           unsigned int result_col_size,
 
 2433                                           unsigned int result_internal_rows,
 
 2434                                           unsigned int result_internal_cols)
 
 2436   unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
 
 2437   unsigned int glb_sz = gridDim.x * blockDim.x;
 
 2439   for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
 
 2441     for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
 
 2445       unsigned int offset = row_id;
 
 2446       for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
 
 2448         NumericT val = ell_elements[offset];
 
 2452           sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
 
 2453                                          d_mat_row_start, d_mat_row_inc,
 
 2454                                          d_mat_col_start, d_mat_col_inc,
 
 2455                                          d_mat_internal_rows, d_mat_internal_cols)] * val;
 
 2459       unsigned int col_begin = csr_rows[row_id];
 
 2460       unsigned int col_end   = csr_rows[row_id + 1];
 
 2462       for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
 
 2464         sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
 
 2465                                        d_mat_row_start, d_mat_row_inc,
 
 2466                                        d_mat_col_start, d_mat_col_inc,
 
 2467                                        d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
 
 2470       result[ResultIndexT::apply(row_id, result_col,
 
 2471                                  result_row_start, result_row_inc,
 
 2472                                  result_col_start, result_col_inc,
 
 2473                                  result_internal_rows, result_internal_cols)] = 
sum;
 
 2488 template<
typename NumericT, 
unsigned int AlignmentV>
 
 2496       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2497       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2498       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2499       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2500       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2501       static_cast<unsigned int>(mat.
size1()),
 
 2503       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2523       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2524       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2525       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2526       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2527       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2528       static_cast<unsigned int>(mat.
size1()),
 
 2530       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2550       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2551       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2552       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2553       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2554       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2555       static_cast<unsigned int>(mat.
size1()),
 
 2557       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2577       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2578       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2579       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2580       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2581       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2582       static_cast<unsigned int>(mat.
size1()),
 
 2584       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2605 template<
typename DMatIndexT, 
typename ResultIndexT, 
typename NumericT>
 
 2608                                           const unsigned int * csr_rows,
 
 2609                                           const unsigned int * csr_cols,
 
 2611                                           unsigned int row_num,
 
 2612                                           unsigned int internal_row_num,
 
 2613                                           unsigned int items_per_row,
 
 2614                                           unsigned int aligned_items_per_row,
 
 2616                                           unsigned int d_mat_row_start,
 
 2617                                           unsigned int d_mat_col_start,
 
 2618                                           unsigned int d_mat_row_inc,
 
 2619                                           unsigned int d_mat_col_inc,
 
 2620                                           unsigned int d_mat_row_size,
 
 2621                                           unsigned int d_mat_col_size,
 
 2622                                           unsigned int d_mat_internal_rows,
 
 2623                                           unsigned int d_mat_internal_cols,
 
 2625                                           unsigned int result_row_start,
 
 2626                                           unsigned int result_col_start,
 
 2627                                           unsigned int result_row_inc,
 
 2628                                           unsigned int result_col_inc,
 
 2629                                           unsigned int result_row_size,
 
 2630                                           unsigned int result_col_size,
 
 2631                                           unsigned int result_internal_rows,
 
 2632                                           unsigned int result_internal_cols)
 
 2634   unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
 
 2635   unsigned int glb_sz = gridDim.x * blockDim.x;
 
 2637   for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
 
 2639     for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
 
 2643       unsigned int offset = row_id;
 
 2644       for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
 
 2646         NumericT val = ell_elements[offset];
 
 2650           sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
 
 2651                                          d_mat_row_start, d_mat_row_inc,
 
 2652                                          d_mat_col_start, d_mat_col_inc,
 
 2653                                          d_mat_internal_rows, d_mat_internal_cols)] * val;
 
 2657       unsigned int col_begin = csr_rows[row_id];
 
 2658       unsigned int col_end   = csr_rows[row_id + 1];
 
 2660       for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
 
 2662         sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
 
 2663                                        d_mat_row_start, d_mat_row_inc,
 
 2664                                        d_mat_col_start, d_mat_col_inc,
 
 2665                                        d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
 
 2668       result[ResultIndexT::apply(row_id, result_col,
 
 2669                                  result_row_start, result_row_inc,
 
 2670                                  result_col_start, result_col_inc,
 
 2671                                  result_internal_rows, result_internal_cols)] = 
sum;
 
 2686 template<
typename NumericT, 
unsigned int AlignmentV>
 
 2693   if (d_mat.lhs().row_major() && result.
row_major())
 
 2696       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2697       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2698       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2699       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2700       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2701       static_cast<unsigned int>(mat.
size1()),
 
 2703       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2720   else if (d_mat.lhs().row_major() && !result.
row_major())
 
 2723       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2724       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2725       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2726       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2727       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2728       static_cast<unsigned int>(mat.
size1()),
 
 2730       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2747   else if (!d_mat.lhs().row_major() && result.
row_major())
 
 2750       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2751       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2752       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2753       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2754       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2755       static_cast<unsigned int>(mat.
size1()),
 
 2757       static_cast<unsigned int>(mat.
ell_nnz()),
 
 2777       viennacl::cuda_arg<unsigned int>(mat.
handle2()),
 
 2778       viennacl::cuda_arg<NumericT>(mat.
handle()),
 
 2779       viennacl::cuda_arg<unsigned int>(mat.
handle3()),
 
 2780       viennacl::cuda_arg<unsigned int>(mat.
handle4()),
 
 2781       viennacl::cuda_arg<NumericT>(mat.
handle5()),
 
 2782       static_cast<unsigned int>(mat.
size1()),
 
 2784       static_cast<unsigned int>(mat.
ell_nnz()),
 
vcl_size_t internal_ellnnz() const 
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Simple enable-if variant that uses the SFINAE pattern. 
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector. 
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, NumericT beta, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const 
const vcl_size_t & size1() const 
Returns the number of rows. 
const handle_type & handle2() const 
Returns the OpenCL handle to the column index array. 
const handle_type & handle1() const 
Returns the OpenCL handle to the row index array. 
const handle_type & handle() const 
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
const handle_type & handle12() const 
Returns the OpenCL handle to the (row, column) index array. 
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) 
A tag class representing a lower triangular matrix. 
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
void av(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
static __device__ void apply(NumericT &result, NumericT alpha, NumericT Ax, NumericT beta)
vcl_size_t internal_size1() const 
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, NumericT beta, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
Expression template class for representing a tree of expressions which ultimately result in a matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL. 
result_of::size_type< T >::type start1(T const &obj)
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
const handle_type & handle4() const 
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, NumericT beta)
vcl_size_t rows_per_block() const 
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication. 
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
static __device__ void apply(NumericT &result, NumericT alpha, NumericT Ax, NumericT beta)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
vcl_size_t size1() const 
Returns the size of the result vector. 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
const handle_type & handle1() const 
Returns the OpenCL handle to the row index array. 
Helper struct for accessing an element of a row- or column-major matrix. 
vcl_size_t internal_size1() const 
const vcl_size_t & nnz() const 
Returns the number of nonzero entries. 
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const 
Returns the stride within the buffer (in multiples of sizeof(NumericT)) 
const handle_type & handle2() const 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
result_of::size_type< T >::type start2(T const &obj)
Sparse matrix class using the ELLPACK format for storing the nonzeros. 
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta)
A tag class representing an upper triangular matrix. 
Sparse matrix class using the sliced ELLPACK with parameters C, . 
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const 
Returns the OpenCL handle to the row index array. 
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows c...
const handle_type & handle2() const 
Returns the OpenCL handle to the column index array. 
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta)
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const 
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta, unsigned int block_size)
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
const handle_type & handle3() const 
Returns the OpenCL handle to the group start index array. 
Implementations of direct triangular solvers for sparse matrices using CUDA. 
const handle_type & handle3() const 
Returns the OpenCL handle to the row block array. 
void clear()
Resets all entries to zero. Does not change the size of the vector. 
Common routines for CUDA execution. 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
NumericT max(std::vector< NumericT > const &v1)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta)
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const 
Returns the length of the vector (cf. std::vector) 
const vcl_size_t & nnz1() const 
Returns the number of nonzero entries. 
vcl_size_t ell_nnz() const 
A tag class representing a lower triangular matrix with unit diagonal. 
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices. 
A sparse square matrix in compressed sparse rows format. 
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
A tag for column-major storage of a dense matrix. 
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. 
const handle_type & handle5() const 
LHS & lhs() const 
Get left hand side operand. 
size_type start() const 
Returns the offset within the buffer. 
const vcl_size_t & blocks1() const 
Returns the internal number of row blocks for an adaptive SpMV. 
vcl_size_t internal_maxnnz() const 
Implementation of the ViennaCL scalar class. 
Implementations of NMF operations using CUDA. 
A tag class representing an upper triangular matrix with unit diagonal. 
NumericT min(std::vector< NumericT > const &v1)
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)