#ifndef VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
#define VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_

/** @file viennacl/linalg/cuda/spgemm_rmerge.hpp
    @brief Sparse matrix-matrix multiplication C = A * B on CUDA following the RMerge approach (row merging with subwarps)
*/

#include "viennacl/forward.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/context.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/host_based/common.hpp"
// Loads a value from the given address, routed through the read-only data cache
// (__ldg) on devices of compute capability 3.5 and higher
template<typename NumericT>
static inline __device__ NumericT load_and_cache(const NumericT *address)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
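// Usage sketch (illustrative only, mirroring the kernels below): stream through
// read-only CSR arrays with cached loads, e.g.
//   unsigned int row_B_start = load_and_cache(B_row_indices + row_B_index);
//   unsigned int row_B_end   = load_and_cache(B_row_indices + row_B_index + 1);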
 
// Rounds val up to the next power of two; stage 1 uses this to turn the longest
// row of A within a group into a subwarp size. Minimal loop-based sketch of the body:
template<typename IndexT>
__device__ IndexT round_to_next_power_of_2(IndexT val)
{
  IndexT result = 1;
  while (result < val)
    result *= 2;
  return result;
}
 
// Stage 1: for each group of rows (one group per CUDA block), determine the maximum
// number of nonzeros per row of A and per touched row of B; this fixes the subwarp
// size used by the later stages
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          IndexT * subwarpsize_per_group,
          IndexT * max_nnz_row_A_per_group,
          IndexT * max_nnz_row_B_per_group)
{
 
  unsigned int subwarpsize_in_thread = 0;
  unsigned int max_nnz_row_A = 0;
  unsigned int max_nnz_row_B = 0;
 
  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;   // ceiling division: each block handles a contiguous group of rows
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
 
  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
  {
    unsigned int A_row_start = A_row_indices[row];
    unsigned int A_row_end   = A_row_indices[row+1];
 
    unsigned int row_num = A_row_end - A_row_start;
    subwarpsize_in_thread = max(row_num, subwarpsize_in_thread);
    max_nnz_row_A = max(max_nnz_row_A, row_num);
    for (unsigned int j = A_row_start; j < A_row_end; ++j)
    {
 
      unsigned int col = A_col_indices[j];
      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
    }
  }
 
  // block-wide tree reduction of the three maxima in shared memory
  __shared__ unsigned int shared_subwarpsize[256];
  __shared__ unsigned int shared_max_nnz_row_A[256];
  __shared__ unsigned int shared_max_nnz_row_B[256];
 
  shared_subwarpsize[threadIdx.x]   = subwarpsize_in_thread;
  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
 
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_subwarpsize[threadIdx.x]   = max(  shared_subwarpsize[threadIdx.x],   shared_subwarpsize[threadIdx.x + stride]);
      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
    }
  }
 
  if (threadIdx.x == 0)
  {
    subwarpsize_per_group[blockIdx.x]   = round_to_next_power_of_2(shared_subwarpsize[0]);
    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
  }
}
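// Example of how stage 1 steers the later stages: if the longest row of A within a
// group has 9..16 nonzeros, the group reports a subwarp size of 16, and the host
// picks SubWarpSizeV for the stage 2/3 kernels from the maximum over all groups
// (capped at 32; anything larger triggers the A = A2 * G1 decomposition below).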
 
// Subwarp-wide minimum using butterfly (XOR) shuffles; each lane ends up with the minimum
template<unsigned int SubWarpSizeV, typename IndexT>
__device__ IndexT subwarp_minimum_shuffle(IndexT min_index)
{
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
  return min_index;
}
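// Illustrative trace (not from the original source), assuming SubWarpSizeV == 4:
//   lane:        0  1  2  3
//   min_index:   7  3  9  5
//   i = 2:       7  3  7  3   (each lane takes the min with lane XOR 2)
//   i = 1:       3  3  3  3   (each lane takes the min with lane XOR 1)
// After log2(SubWarpSizeV) steps every lane holds the subwarp-wide minimum.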
 
// Subwarp-wide minimum via shared memory. Each step combines a lane's slot with the
// slot i positions ahead (wrapping within the 512-entry buffer), so after the loop
// the subwarp's first lane (threadIdx.x - id_in_warp) holds the minimum over the
// whole subwarp, which is returned to all lanes.
template<unsigned int SubWarpSizeV, typename IndexT>
__device__ IndexT subwarp_minimum_shared(IndexT min_index, IndexT id_in_warp, IndexT *shared_buffer)
{
  shared_buffer[threadIdx.x] = min_index;
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    shared_buffer[threadIdx.x] = min(shared_buffer[threadIdx.x], shared_buffer[(threadIdx.x + i) % 512]);
  return shared_buffer[threadIdx.x - id_in_warp];
}
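// Note: the file provides both shuffle-based and shared-memory-based reductions.
// Warp shuffles require compute capability 3.0+ and move 32-bit values, so the
// kernels below use the shared-memory variants, which work for any NumericT.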
 
// Stage 2: determine the number of nonzeros per row of C = A * B.
// One subwarp processes one row of A, merging the sorted rows of B selected by
// the entries of that row of A.
template<unsigned int SubWarpSizeV, typename IndexT>
__global__ void compressed_matrix_gemm_stage_2(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          IndexT B_size2,
          IndexT * C_row_indices)
{
  __shared__ unsigned int shared_buffer[512];
 
  unsigned int num_warps  = blockDim.x / SubWarpSizeV;
  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;
 
  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
 
  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
 
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    // each lane of the subwarp fetches one entry of the current row of A,
    // i.e. one row of B to be merged
    unsigned int my_row_B = row_A_start + id_in_warp;
    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
 
    unsigned int num_nnz = 0;
    if (row_A_end - row_A_start > 1) // rows of A with at most one entry are handled in the else-branch
    {
      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

      while (1)
      {
        // find the smallest column index among the current fronts of all lanes
        unsigned int min_index = current_front_index;
        min_index = subwarp_minimum_shared<SubWarpSizeV>(min_index, id_in_warp, shared_buffer);

        if (min_index == B_size2) // all lanes have exhausted their rows of B
          break;

        // advance each lane whose front contributed the minimum
        if (current_front_index == min_index)
        {
          ++row_B_start;
          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
        }

        ++num_nnz;
      }
    }
    else // at most one row of B is involved, so its length is the result
      num_nnz = row_B_end - row_B_start;

    // write the number of entries in this row of C
    if (id_in_warp == 0)
      C_row_indices[row] = num_nnz;
  }
}
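// Worked example (illustrative): suppose a row of A has entries in columns {0, 2},
// row 0 of B has column indices {1, 4} and row 2 of B has {1, 7}. The subwarp fronts
// produce the minima 1, 4, 7 in three iterations (both lanes advance on 1), so
// num_nnz = 3 = |{1, 4, 7}|, the size of the union of the column indices.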
 
// Subwarp-wide sum using butterfly (XOR) shuffles.
// Caution: the value is squeezed through a 32-bit int in the shuffle, so this
// variant is not suitable for general floating point types; the kernels in this
// file therefore use the shared-memory variant below.
template<unsigned int SubWarpSizeV, typename NumericT>
__device__ NumericT subwarp_accumulate_shuffle(NumericT output_value)
{
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    output_value += __shfl_xor((int)output_value, (int)i);
  return output_value;
}
 
// Subwarp-wide sum via shared memory, analogous to subwarp_minimum_shared():
// after the loop, the subwarp's first lane (threadIdx.x - id_in_warp) holds the
// full sum, which is returned to all lanes.
template<unsigned int SubWarpSizeV, typename NumericT>
__device__ NumericT subwarp_accumulate_shared(NumericT output_value, unsigned int id_in_warp, NumericT *shared_buffer)
{
  shared_buffer[threadIdx.x] = output_value;
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    shared_buffer[threadIdx.x] += shared_buffer[(threadIdx.x + i) % 512];
  return shared_buffer[threadIdx.x - id_in_warp];
}
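// Illustrative trace, assuming SubWarpSizeV == 4 and the subwarp occupying
// threads 0..3 with values (1, 2, 3, 4):
//   i = 2:  buffer[0] becomes 1 + 3 = 4, buffer[1] becomes 2 + 4 = 6
//   i = 1:  buffer[0] becomes 4 + 6 = 10, the total
// All four lanes return buffer[0] == 10.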
 
// Stage 3: fill C with the actual column indices and values, reusing the merge
// from stage 2 and writing results through a small per-lane register buffer
template<unsigned int SubWarpSizeV, typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_stage_3(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          const NumericT * A_elements,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          const NumericT * B_elements,
          IndexT B_size2,
          IndexT const * C_row_indices,
          IndexT * C_col_indices,
          NumericT * C_elements)
{
  __shared__ unsigned int shared_indices[512];
  __shared__ NumericT     shared_values[512];
 
  unsigned int num_warps  = blockDim.x / SubWarpSizeV;
  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;
 
  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
 
  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
 
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    // rows of A with at most one entry are handled by all lanes jointly
    // (else-branch below), so in that case every lane points at the same row of B
    unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? id_in_warp : 0);
    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
    NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

    unsigned int index_in_C = C_row_indices[row];
 
    if (row_A_end - row_A_start > 1)
    {
      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
      NumericT     current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;

      unsigned int index_buffer = 0;
      NumericT     value_buffer = 0;
      unsigned int buffer_size = 0;

      while (1)
      {
        // find the smallest column index among the current fronts of all lanes
        unsigned int min_index = subwarp_minimum_shared<SubWarpSizeV>(current_front_index, id_in_warp, shared_indices);

        if (min_index == B_size2) // all lanes have exhausted their rows of B
          break;

        // sum the contributions of all lanes currently sitting on min_index
        NumericT output_value = (current_front_index == min_index) ? val_A * current_front_value : 0;
        output_value = subwarp_accumulate_shared<SubWarpSizeV>(output_value, id_in_warp, shared_values);

        // advance each lane whose front contributed the minimum
        if (current_front_index == min_index)
        {
          ++row_B_start;
          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
          current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;
        }

        // keep the result in registers: lane 'buffer_size' stores this entry
        index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
        value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
        ++buffer_size;

        // flush the register buffer to C once every lane holds one entry
        if (buffer_size == SubWarpSizeV)
        {
          C_col_indices[index_in_C + id_in_warp] = index_buffer;
          C_elements[index_in_C + id_in_warp]    = value_buffer;
        }
        index_in_C += (buffer_size == SubWarpSizeV) ? SubWarpSizeV : 0;
        buffer_size = (buffer_size == SubWarpSizeV) ?           0  : buffer_size;
      }
 
      // write any leftover entries still sitting in the register buffer
      if (id_in_warp < buffer_size)
      {
        C_col_indices[index_in_C + id_in_warp] = index_buffer;
        C_elements[index_in_C + id_in_warp]    = value_buffer;
      }
    }
    else
    {
      // at most one entry in this row of A: scale that row of B (if any) and copy it to C
      for (unsigned int i = row_B_start + id_in_warp; i < row_B_end; i += SubWarpSizeV)
      {
        C_col_indices[index_in_C + id_in_warp] = load_and_cache(B_col_indices + i);
        C_elements[index_in_C + id_in_warp]    = val_A * load_and_cache(B_elements + i);
        index_in_C += SubWarpSizeV;
      }
    }
  }
}
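// Illustrative walk-through of the buffering, assuming SubWarpSizeV == 4: the first
// merged entry is kept by lane 0, the second by lane 1, and so on. As soon as four
// entries are buffered, all lanes write their (index, value) pair to C in one
// coalesced burst of four consecutive positions, and index_in_C advances by 4.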
 
// Decomposition helper: compute the number of chunks each row of A is split into,
// such that no chunk holds more than max_per_row entries
template<typename IndexT>
__global__ void compressed_matrix_gemm_decompose_1(
          const IndexT * A_row_indices,
          IndexT A_size1,
          IndexT max_per_row,
          IndexT * chunks_per_row)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1);
  }
}
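// Example: with max_per_row == 8, a row with 0..8 entries yields one chunk,
// 9..16 entries yield two chunks, and in general a row with n > 8 entries
// yields (n - 1) / 8 + 1 chunks.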
 
// Build the matrix A2 for the decomposition A = A2 * G1: row i of A2 sums up the
// chunks that row i of A was split into, hence all entries of A2 equal one
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_A2(
          IndexT * A2_row_indices,
          IndexT * A2_col_indices,
          NumericT * A2_elements,
          IndexT A2_size1,
          IndexT * new_row_buffer)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int index_start = new_row_buffer[i];
    unsigned int index_stop  = new_row_buffer[i+1];

    A2_row_indices[i] = index_start;

    for (IndexT j = index_start; j < index_stop; ++j)
    {
      A2_col_indices[j] = j;
      A2_elements[j]    = NumericT(1); // unit weights: A2 only sums the chunk rows of G1
    }
  }

  // thread 0 writes the closing entry of the row pointer array
  if (threadIdx.x == 0 && blockIdx.x == 0)
    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
}
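// Example: if row i of A was split into three chunks and new_row_buffer[i] == 5,
// row i of A2 holds the value 1 in columns 5, 6 and 7; multiplying by A2 therefore
// sums rows 5, 6 and 7 of G1 * B back into row i of the final result.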
 
// Build the matrix G1 for the decomposition A = A2 * G1: G1 holds the rows of A,
// split into chunks of at most max_per_row entries. Column indices and values are
// simply copied from A; only the row pointer array is rewritten.
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_G1(
          IndexT * G1_row_indices,
          IndexT * G1_col_indices,
          NumericT * G1_elements,
          IndexT G1_size1,
          IndexT const * A_row_indices,
          IndexT const * A_col_indices,
          NumericT const * A_elements,
          IndexT A_size1,
          IndexT A_nnz,
          IndexT max_per_row,
          IndexT * new_row_buffer)
{
  // copy column indices and values over from A
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
  {
    G1_col_indices[i] = A_col_indices[i];
    G1_elements[i]    = A_elements[i];
  }

  // write the new row pointers: each chunk starts max_per_row entries after the previous one
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int old_start = A_row_indices[i];
    unsigned int new_start = new_row_buffer[i];
    unsigned int row_chunks = new_row_buffer[i+1] - new_start;

    for (IndexT j=0; j<row_chunks; ++j)
      G1_row_indices[new_start + j] = old_start + j * max_per_row;
  }

  // thread 0 writes the closing entry of the row pointer array
  if (threadIdx.x == 0 && blockIdx.x == 0)
    G1_row_indices[G1_size1] = A_row_indices[A_size1];
}
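// Worked example: a row of A with 20 entries starting at offset old_start and
// max_per_row == 8 becomes three G1 rows with start offsets old_start,
// old_start + 8 and old_start + 16, of lengths 8, 8 and 4, respectively.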
 
// Host entry point: carries out the sparse matrix-matrix product C = A * B
// using the RMerge approach
template<class NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
{
  C.resize(A.size1(), B.size2(), false);

  unsigned int blocknum  = 256;
  unsigned int threadnum = 128;
 
  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A));
  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A));
  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A));

  // Stage 1: determine the subwarp size and row-length bounds for each group of rows
  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
 
  // bring the per-group statistics computed in stage 1 back to host memory
  subwarp_sizes.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());

  max_nnz_row_A.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());

  max_nnz_row_B.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());
 
  unsigned int max_subwarp_size = 0;
  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
    max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
 
  unsigned int A_max_nnz_per_row = 0;
  for (std::size_t i=0; i<max_nnz_row_A.size(); ++i)
    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
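  // The thresholds below map the densest row of A to the chunk width used if a
  // decomposition becomes necessary: e.g. A_max_nnz_per_row == 100 selects
  // max_entries_in_G == 16, while anything above 256 keeps the full 32.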
 
  if (max_subwarp_size > 32)
  {
    // A has rows longer than a full warp can merge in one pass:
    // decompose A into A2 * G1, where G1 holds row chunks of at most max_entries_in_G entries
    unsigned int max_entries_in_G = 32;
    if (A_max_nnz_per_row <= 256)
      max_entries_in_G = 16;
    if (A_max_nnz_per_row <= 64)
      max_entries_in_G = 8;

    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
 
    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                static_cast<unsigned int>(max_entries_in_G),
                                                                viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");

    // exclusive scan turns the per-row chunk counts into row offsets for G1
    viennacl::linalg::exclusive_scan(exclusive_scan_helper, exclusive_scan_helper);

    unsigned int augmented_size = exclusive_scan_helper[A.size1()];
 
    // split A = A2 * G1
    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));

    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
                                                       static_cast<unsigned int>(A2.size1()),
                                                       viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");
 
    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
                                                       static_cast<unsigned int>(G1.size1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),
                                                       static_cast<unsigned int>(A.size1()),
                                                       static_cast<unsigned int>(A.nnz()),
                                                       static_cast<unsigned int>(max_entries_in_G),
                                                       viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");

    // multiply in two sweeps: tmp = G1 * B, then C = A2 * tmp
    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
    prod_impl(G1, B, tmp);
    prod_impl(A2, tmp, C);
    return;
  }
 
  // Stage 2: determine the number of nonzeros per row of C
  if (max_subwarp_size == 32)
  {
    compressed_matrix_gemm_stage_2<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
 
  else if (max_subwarp_size == 16)
  {
    compressed_matrix_gemm_stage_2<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
 
  else
  {
    compressed_matrix_gemm_stage_2<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                               static_cast<unsigned int>(A.size1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                               static_cast<unsigned int>(B.size2()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
 
  // Stage 2 wrote the number of nonzeros per row of C; turn these counts into
  // row start offsets by an exclusive scan on the host
  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());

  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = row_buffer[i];
    row_buffer.set(i, current_offset);
    current_offset += tmp;
  }
  row_buffer.set(C.size1(), current_offset);
  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());

  // allocate memory for the column indices and values of C
  C.reserve(current_offset, false);
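  // Example: if stage 2 produced the row counts (2, 0, 3), the scan rewrites the
  // row buffer to the offsets (0, 2, 2) with closing entry 5, and C is given room
  // for 5 nonzeros.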
 
  // Stage 3: fill C with the actual column indices and values
  if (max_subwarp_size == 32)
  {
    compressed_matrix_gemm_stage_3<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                viennacl::cuda_arg<NumericT>(B.handle()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                                viennacl::cuda_arg<NumericT>(C.handle()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
 
  else if (max_subwarp_size == 16)
  {
    compressed_matrix_gemm_stage_3<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                viennacl::cuda_arg<NumericT>(B.handle()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                                viennacl::cuda_arg<NumericT>(C.handle()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
 
  else
  {
    compressed_matrix_gemm_stage_3<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                               viennacl::cuda_arg<NumericT>(A.handle()),
                                                               static_cast<unsigned int>(A.size1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                               viennacl::cuda_arg<NumericT>(B.handle()),
                                                               static_cast<unsigned int>(B.size2()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                               viennacl::cuda_arg<NumericT>(C.handle()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
}

#endif
 