#ifndef VIENNACL_LINALG_CUDA_SPGEMM_HPP_
#define VIENNACL_LINALG_CUDA_SPGEMM_HPP_

#include <thrust/scan.h>
#include <thrust/device_ptr.h>

// ViennaCL support headers used below (list reconstructed from the identifiers referenced in this file):
#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/timer.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/host_based/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
/** @brief Loads the value at the given address, using the read-only data cache (__ldg) on compute capability 3.5 and later. */
template<typename NumericT>
static inline __device__ NumericT load_and_cache(const NumericT *address)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
 
// Rounds val up to the next power of two. Values above 32 map to 64, which stage 1
// uses as a flag that the matrix must be decomposed before multiplication.
// (Body reconstructed around the signature and its use in stage 1 below.)
template<typename IndexT>
__device__ IndexT round_to_next_power_of_2(IndexT val)
{
  if (val > 32)
    return 64; // flag: no subwarp size larger than 32 is possible
  else if (val > 16)
    return 32;
  else if (val > 8)
    return 16;
  else if (val > 4)
    return 8;
  else if (val > 2)
    return 4;
  else if (val > 1)
    return 2;
  else
    return 1;
}
 
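// Example: round_to_next_power_of_2(3) == 4 and round_to_next_power_of_2(17) == 32,
// while any value above 32 yields the decomposition flag 64.
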
//
// Stage 1: Determine the subwarp size to use and upper bounds on the row lengths of A and B.
//
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          IndexT *subwarpsize_per_group,
          IndexT *max_nnz_row_A_per_group,
          IndexT *max_nnz_row_B_per_group)
{
  unsigned int subwarpsize_in_thread = 0;
  unsigned int max_nnz_row_A = 0;
  unsigned int max_nnz_row_B = 0;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
  {
    unsigned int A_row_start = A_row_indices[row];
    unsigned int A_row_end   = A_row_indices[row+1];
    unsigned int row_num     = A_row_end - A_row_start;

    if (row_num > 32) // no subwarp can cover the row in one pass; aim for a subwarp size near sqrt(row_num)
    {
      unsigned int subwarp_sqrt = (unsigned int)sqrt(double(row_num)) + 1;
      subwarpsize_in_thread = max(subwarp_sqrt, subwarpsize_in_thread);
    }
    else
      subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);

    max_nnz_row_A = max(max_nnz_row_A, row_num);
    for (unsigned int j = A_row_start; j < A_row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
    }
  }

  // reduction to find the maxima over the whole thread block:
  __shared__ unsigned int shared_subwarpsize[256];
  __shared__ unsigned int shared_max_nnz_row_A[256];
  __shared__ unsigned int shared_max_nnz_row_B[256];

  shared_subwarpsize[threadIdx.x]   = subwarpsize_in_thread;
  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_subwarpsize[threadIdx.x]   = max(  shared_subwarpsize[threadIdx.x],   shared_subwarpsize[threadIdx.x + stride]);
      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    subwarpsize_per_group[blockIdx.x]   = round_to_next_power_of_2(shared_subwarpsize[0]);
    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
  }
}
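// Work distribution example (illustrative): with the default launch of 256 blocks in
// prod_impl() below, A_size1 = 10000 gives rows_per_group = (10000 - 1) / 256 + 1 = 40,
// so block 0 scans rows [0, 40), block 1 rows [40, 80), and so on.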
 
// Subwarp-cooperative multiway merge of the column indices of up to 'subwarpsize' rows
// of B; returns the size of their union (symbolic stage, no output written).
inline __device__ unsigned int merge_subwarp_symbolic(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int subwarpsize)
{
  unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

  unsigned int num_nnz = 0;
  while (1)
  {
    // determine the current minimum column index over the subwarp:
    unsigned int min_index = current_front_index;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));

    if (min_index == B_size2) // all lanes exhausted
      break;

    // advance the front in every lane that contributed the minimum:
    current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
                                                             : current_front_index;
    ++num_nnz;
  }

  return num_nnz;
}
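// Merge example (hypothetical): with subwarpsize = 2, lane 0 holding B-row columns
// {1, 4, 7} and lane 1 holding {4, 5}, successive iterations yield the minima
// 1, 4, 5, 7, so num_nnz = 4: exactly the size of the union of column indices,
// which is the length of the corresponding row of C.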
 
// As merge_subwarp_symbolic(), but also writes the merged, duplicate-free column
// indices to output_array, buffering them in registers for coalesced stores.
inline __device__ unsigned int merge_subwarp_symbolic_double(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2,
                                                             unsigned int *output_array, unsigned int id_in_warp, unsigned int subwarpsize)
{
  unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

  unsigned int num_nnz = 0;
  unsigned int index_buffer = 0;
  unsigned int buffer_size = 0;
  while (1)
  {
    // determine the current minimum column index over the subwarp:
    unsigned int min_index = current_front_index;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));

    if (min_index == B_size2) // all lanes exhausted
      break;

    // advance the front in every lane that contributed the minimum:
    current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
                                                             : current_front_index;

    // park the merged index in the register of lane 'buffer_size':
    index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
    ++buffer_size;

    if (buffer_size == subwarpsize) // register buffer full: flush with one coalesced write
    {
      output_array[id_in_warp] = index_buffer;
      output_array += subwarpsize;
      buffer_size = 0;
    }

    ++num_nnz;
  }

  // write the remaining entries from the register buffer:
  if (id_in_warp < buffer_size)
    output_array[id_in_warp] = index_buffer;

  return num_nnz;
}
 
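// Buffering example (illustrative): with subwarpsize = 4 and seven merged indices,
// lanes 0..3 first park four indices in registers and flush them in one coalesced
// store; the remaining three sit in lanes 0..2 and are written by the tail store.
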
//
// Stage 2: Determine the number of nonzeros per row of C = A * B.
//
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_2(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          IndexT B_size2,
          IndexT * C_row_indices,
          unsigned int *subwarpsize_array,
          unsigned int *max_row_size_A,
          unsigned int *max_row_size_B,
          unsigned int *scratchpad_offsets,
          unsigned int *scratchpad_indices)
{
  unsigned int subwarpsize = subwarpsize_array[blockIdx.x];

  unsigned int num_warps  =  blockDim.x / subwarpsize;
  unsigned int warp_id    = threadIdx.x / subwarpsize;
  unsigned int id_in_warp = threadIdx.x % subwarpsize;

  unsigned int scratchpad_rowlength     = max_row_size_B[blockIdx.x] * subwarpsize;
  unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
  unsigned int *subwarp_scratchpad_start = scratchpad_indices + scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    if (row_A_end - row_A_start > subwarpsize)
    {
      // long row of A: merge in batches of 'subwarpsize' rows of B into the scratchpad first
      unsigned int final_merge_start = 0;
      unsigned int final_merge_end   = 0;

      // first merge stage:
      unsigned int *subwarp_scratchpad = subwarp_scratchpad_start;
      unsigned int iter = 0;
      while (row_A_end > row_A_start)
      {
        unsigned int my_row_B = row_A_start + id_in_warp;
        unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
        unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
        unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;

        unsigned int nnz_in_merge = merge_subwarp_symbolic_double(row_B_start, row_B_end, B_col_indices, B_size2,
                                                                  subwarp_scratchpad, id_in_warp, subwarpsize);

        final_merge_start = (iter == id_in_warp) ? subwarp_scratchpad - scratchpad_indices : final_merge_start;
        final_merge_end   = (iter == id_in_warp) ? final_merge_start + nnz_in_merge        : final_merge_end;
        ++iter;

        row_A_start += subwarpsize;
        subwarp_scratchpad += scratchpad_rowlength; // write to next row in scratchpad
      }

      // second merge stage: union of the partial unions stored in the scratchpad
      unsigned int num_nnz = merge_subwarp_symbolic(final_merge_start, final_merge_end, scratchpad_indices, B_size2, subwarpsize);
      if (id_in_warp == 0)
        C_row_indices[row] = num_nnz;
    }
    else
    {
      // short row of A: merge directly from B
      unsigned int my_row_B = row_A_start + id_in_warp;
      unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
      unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
      unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;

      unsigned int num_nnz = merge_subwarp_symbolic(row_B_start, row_B_end, B_col_indices, B_size2, subwarpsize);
      if (id_in_warp == 0)
        C_row_indices[row] = num_nnz;
    }
  }
}
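// Scratchpad sizing example (illustrative): for a block with subwarpsize = 8,
// max_row_size_A = 20 and max_row_size_B = 12, each subwarp pass emits at most
// scratchpad_rowlength = 12 * 8 = 96 indices, and up to
// scratchpad_rows_per_warp = 20/8 + 1 = 3 passes may be needed, so each subwarp
// owns 3 * 96 consecutive scratchpad slots.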
 
//
// Stage 3 helper: merges the entries of up to 'subwarpsize' rows (indices and values),
// with each lane scaling its input row by its own scaling_factor. Values with equal
// column index are summed; returns the number of entries written.
//
template<typename NumericT>
__device__ unsigned int merge_subwarp_numeric(NumericT scaling_factor,
                                              unsigned int input_start, unsigned int input_end, const unsigned int *input_indices, const NumericT *input_values, unsigned int invalid_token,
                                              unsigned int *output_indices, NumericT *output_values,
                                              unsigned int id_in_warp, unsigned int subwarpsize)
{
  unsigned int current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
  NumericT     current_front_value = (input_start < input_end) ? load_and_cache(input_values  + input_start) : 0;

  unsigned int index_buffer = 0;
  NumericT     value_buffer = 0;
  unsigned int buffer_size = 0;
  unsigned int nnz_written = 0;
  while (1)
  {
    // determine the current minimum column index over the subwarp:
    unsigned int min_index = current_front_index;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));

    if (min_index == invalid_token) // all lanes exhausted
      break;

    // sum the contributions of all lanes currently holding min_index
    // (relies on a __shfl_xor overload for the value type):
    NumericT output_value = (current_front_index == min_index) ? scaling_factor * current_front_value : 0;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      output_value += __shfl_xor(output_value, (int)i);

    // advance the front in every lane that contributed:
    if (current_front_index == min_index)
    {
      ++input_start;
      current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
      current_front_value = (input_start < input_end) ? load_and_cache(input_values  + input_start) : 0;
    }

    // park the merged entry in the registers of lane 'buffer_size':
    index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
    value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
    ++buffer_size;

    if (buffer_size == subwarpsize) // register buffer full: flush with coalesced writes
    {
      output_indices[id_in_warp] = index_buffer; output_indices += subwarpsize;
      output_values[id_in_warp]  = value_buffer; output_values  += subwarpsize;
      buffer_size = 0;
    }

    ++nnz_written;
  }

  // write the remaining entries from the register buffer:
  if (id_in_warp < buffer_size)
  {
    output_indices[id_in_warp] = index_buffer;
    output_values[id_in_warp]  = value_buffer;
  }

  return nnz_written;
}
//
// Stage 3: Fill the column indices and values of C = A * B using the pattern from stage 2.
//
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_stage_3(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          const NumericT * A_elements,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          const NumericT * B_elements,
          IndexT B_size2,
          IndexT const * C_row_indices,
          IndexT * C_col_indices,
          NumericT * C_elements,
          unsigned int *subwarpsize_array,
          unsigned int *max_row_size_A,
          unsigned int *max_row_size_B,
          unsigned int *scratchpad_offsets,
          unsigned int *scratchpad_indices,
          NumericT *scratchpad_values)
{
  unsigned int subwarpsize = subwarpsize_array[blockIdx.x];

  unsigned int num_warps  =  blockDim.x / subwarpsize;
  unsigned int warp_id    = threadIdx.x / subwarpsize;
  unsigned int id_in_warp = threadIdx.x % subwarpsize;

  unsigned int scratchpad_rowlength     = max_row_size_B[blockIdx.x] * subwarpsize;
  unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
  unsigned int subwarp_scratchpad_shift = scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    if (row_A_end - row_A_start > subwarpsize)
    {
      // long row of A: merge in batches of 'subwarpsize' rows of B into the scratchpad first
      unsigned int final_merge_start = 0;
      unsigned int final_merge_end = 0;
      unsigned int iter = 0;
      unsigned int *scratchpad_indices_ptr = scratchpad_indices + subwarp_scratchpad_shift;
      NumericT     *scratchpad_values_ptr  = scratchpad_values  + subwarp_scratchpad_shift;

      while (row_A_start < row_A_end)
      {
        unsigned int my_row_B = row_A_start + id_in_warp;
        unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
        unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
        unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
        NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

        unsigned int nnz_written = merge_subwarp_numeric(val_A,
                                                         row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
                                                         scratchpad_indices_ptr, scratchpad_values_ptr,
                                                         id_in_warp, subwarpsize);

        if (iter == id_in_warp)
        {
          final_merge_start = scratchpad_indices_ptr - scratchpad_indices;
          final_merge_end   = final_merge_start + nnz_written;
        }
        ++iter;

        row_A_start += subwarpsize;
        scratchpad_indices_ptr += scratchpad_rowlength;
        scratchpad_values_ptr  += scratchpad_rowlength;
      }

      // final merge of the partial results (scratchpad values are already scaled by A):
      unsigned int index_in_C = C_row_indices[row];
      merge_subwarp_numeric(NumericT(1),
                            final_merge_start, final_merge_end, scratchpad_indices, scratchpad_values, B_size2,
                            C_col_indices + index_in_C, C_elements + index_in_C,
                            id_in_warp, subwarpsize);
    }
    else
    {
      // short row of A: merge directly from B into C
      unsigned int my_row_B = row_A_start + id_in_warp;
      unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
      unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
      unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
      NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

      unsigned int index_in_C = C_row_indices[row];
      merge_subwarp_numeric(val_A,
                            row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
                            C_col_indices + index_in_C, C_elements + index_in_C,
                            id_in_warp, subwarpsize);
    }
  }
}
 
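// Note: in the long-row branch each lane scales its row of B by its own val_A, so the
// scratchpad already holds correctly scaled partial rows; the final merge over the
// scratchpad therefore runs with scaling factor NumericT(1).
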
//
// Decomposition kernels: split A = A2 * G1 whenever rows of A are too long for a single subwarp.
//
// Determine the number of chunks (rows of G1) needed for each row of A:
template<typename IndexT>
__global__ void compressed_matrix_gemm_decompose_1(
          const IndexT * A_row_indices,
          IndexT A_size1,
          IndexT max_per_row,
          IndexT *chunks_per_row)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1);
  }
}
 
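// Example: max_per_row = 64 splits a row with 130 nonzeros into
// (130 - 1) / 64 + 1 = 3 chunks, while rows with fewer than 64 nonzeros stay whole.
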
// Fill A2, the recombination factor: row i of A2 holds one entry per chunk of row i of A.
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_A2(
          IndexT * A2_row_indices,
          IndexT * A2_col_indices,
          NumericT * A2_elements,
          IndexT A2_size1,
          IndexT *new_row_buffer)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int index_start = new_row_buffer[i];
    unsigned int index_stop  = new_row_buffer[i+1];

    A2_row_indices[i] = index_start;

    for (IndexT j = index_start; j < index_stop; ++j)
    {
      A2_col_indices[j] = j;
      A2_elements[j] = NumericT(1); // (reconstructed: plain summation of the chunk results)
    }
  }

  // write the last entry of the row buffer:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
}
 
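// Example (illustrative): if rows 0 and 1 of A are split into 1 and 3 chunks,
// new_row_buffer = [0, 1, 4, ...], so A2 holds an entry in column 0 for row 0 and
// entries in columns 1, 2, 3 for row 1; multiplying by A2 sums each row's chunk
// results back together.
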
// Fill G1, which holds the entries of A redistributed over rows of at most max_per_row nonzeros:
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_G1(
          IndexT * G1_row_indices,
          IndexT * G1_col_indices,
          NumericT * G1_elements,
          IndexT G1_size1,
          IndexT const *A_row_indices,
          IndexT const *A_col_indices,
          NumericT const *A_elements,
          IndexT A_size1,
          IndexT A_nnz,
          IndexT max_per_row,
          IndexT *new_row_buffer)
{
  // copy column indices and entries of A over to G1 unchanged:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
  {
    G1_col_indices[i] = A_col_indices[i];
    G1_elements[i]    = A_elements[i];
  }

  // compute the new row starts (each chunk of a row of A becomes one row of G1):
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int old_start  = A_row_indices[i];
    unsigned int new_start  = new_row_buffer[i];
    unsigned int row_chunks = new_row_buffer[i+1] - new_start;

    for (IndexT j=0; j<row_chunks; ++j)
      G1_row_indices[new_start + j] = old_start + j * max_per_row;
  }

  // write the last entry of the row buffer:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    G1_row_indices[G1_size1] = A_row_indices[A_size1];
}
 
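// Example: a row of A starting at offset 100 with 150 nonzeros and max_per_row = 64
// becomes three G1 rows starting at offsets 100, 164 and 228; together they hold
// exactly the entries of the original row, split into subwarp-sized pieces.
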
/** @brief Carries out sparse-matrix-sparse-matrix multiplication for CSR matrices using CUDA: C = A * B. */
template<class NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
{
  C.resize(A.size1(), B.size2(), false);

  unsigned int blocknum = 256;
  unsigned int threadnum = 128;
 
  // per-thread-block results of stage 1 (declarations reconstructed):
  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A));
  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A));
  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A));

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  viennacl::tools::timer timer;
#endif

  //
  // Stage 1: Determine subwarp sizes and upper bounds on the row lengths:
  //
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  timer.start();
#endif

  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  std::cout << "Stage 1 device: " << timer.get() << std::endl;
  timer.start();
#endif
 
  // bring the stage-1 results to the host for the scratchpad analysis:
  subwarp_sizes.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());

  max_nnz_row_A.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());

  max_nnz_row_B.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());

  viennacl::vector<unsigned int> scratchpad_offsets(blocknum, viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int * scratchpad_offsets_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(scratchpad_offsets.handle());

  unsigned int max_subwarp_size = 0;
  unsigned int A_max_nnz_per_row = 0;
  unsigned int scratchpad_offset = 0;

  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
  {
    max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);

    scratchpad_offsets_ptr[i] = scratchpad_offset;

    // worst-case scratchpad consumption for this thread block:
    unsigned int max_warp_reloads = max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1;
    unsigned int max_row_length_after_warp_merge = subwarp_sizes_ptr[i] * max_nnz_row_B_ptr[i];
    unsigned int warps_in_group = threadnum / subwarp_sizes_ptr[i];
    scratchpad_offset +=  max_warp_reloads
                        * max_row_length_after_warp_merge
                        * warps_in_group;
  }
 
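  // Example (illustrative): with threadnum = 128, a block reporting subwarpsize 16,
  // max_nnz_row_A = 40 and max_nnz_row_B = 10 contributes
  // (40/16 + 1) * (16*10) * (128/16) = 3 * 160 * 8 = 3840 scratchpad slots.
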
  if (max_subwarp_size > 32)
  {
    // Subwarps larger than 32 threads are not possible: decompose A = A2 * G1 such
    // that each row of G1 holds at most max_entries_in_G nonzeros.
    unsigned int max_entries_in_G = 1024;
    if (A_max_nnz_per_row <= 512*512)
      max_entries_in_G = 512;
    if (A_max_nnz_per_row <= 256*256)
      max_entries_in_G = 256;
    if (A_max_nnz_per_row <= 128*128)
      max_entries_in_G = 128;
    if (A_max_nnz_per_row <= 64*64)
      max_entries_in_G = 64;

    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                static_cast<unsigned int>(max_entries_in_G),
                                                                viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");

    // exclusive scan yields the first G1-row index for each row of A:
    thrust::exclusive_scan(thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)),
                           thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper) + exclusive_scan_helper.size()),
                           thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)));
 
    unsigned int augmented_size = exclusive_scan_helper[A.size1()];

    // set up the factors A = A2 * G1 (reconstructed: A2 recombines, G1 carries the split rows):
    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));

    // fill A2:
    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
                                                       static_cast<unsigned int>(A2.size1()),
                                                       viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");

    // fill G1:
    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
                                                       static_cast<unsigned int>(G1.size1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),
                                                       static_cast<unsigned int>(A.size1()),
                                                       static_cast<unsigned int>(A.nnz()),
                                                       static_cast<unsigned int>(max_entries_in_G),
                                                       viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");

    // compute C = A2 * (G1 * B) and return (reconstructed recursion):
    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
    prod_impl(G1, B, tmp); // rows of G1 are short enough now, so no further decomposition
    prod_impl(A2, tmp, C);
    return;
  }
 
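  // Decomposition sketch (hypothetical one-row matrix): if A has a single row with
  // 100000 nonzeros and max_entries_in_G = 512, G1 gets ceil(100000/512) = 196 short
  // rows and A2 is a 1 x 196 row of ones, so C = A2 * (G1 * B) reproduces A * B while
  // every kernel only ever sees rows short enough for subwarp processing.
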
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  std::cout << "Intermediate host stage: " << timer.get() << std::endl;
  timer.start();
#endif
 
  //
  // Stage 2: Determine the sparsity pattern of C:
  //
  // move the per-block data back to the device (reconstructed):
  subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
  max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
  max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));
  scratchpad_offsets.switch_memory_context(viennacl::traits::context(A));

  viennacl::vector<unsigned int> scratchpad_indices(scratchpad_offset, viennacl::traits::context(A));

  compressed_matrix_gemm_stage_2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                         static_cast<unsigned int>(A.size1()),
                                                         viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                         viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                         static_cast<unsigned int>(B.size2()),
                                                         viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                         viennacl::cuda_arg(subwarp_sizes),
                                                         viennacl::cuda_arg(max_nnz_row_A),
                                                         viennacl::cuda_arg(max_nnz_row_B),
                                                         viennacl::cuda_arg(scratchpad_offsets),
                                                         viennacl::cuda_arg(scratchpad_indices));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  std::cout << "Stage 2: " << timer.get() << std::endl;
  timer.start();
#endif
 
  // exclusive scan on the host over the row lengths written by stage 2,
  // turning them into the CSR row pointer of C:
  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());

  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = row_buffer[i];
    row_buffer.set(i, current_offset);
    current_offset += tmp;
  }
  row_buffer.set(C.size1(), current_offset);
  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());

  //
  // Stage 3: Compute the entries of C:
  //
  C.reserve(current_offset, false);
 
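  // Example: if stage 2 left the row lengths [2, 1, 3] in C_row_indices, the loop
  // rewrites them to the offsets [0, 2, 3] and appends the total 6, yielding the
  // standard CSR row pointer and the number of nonzeros to reserve for C.
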
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  std::cout << "Intermediate stage 2->3: " << timer.get() << std::endl;
  timer.start();
#endif
 
  // values scratchpad, mirroring scratchpad_indices (declaration reconstructed):
  viennacl::vector<NumericT> scratchpad_values(scratchpad_offset, viennacl::traits::context(A));

  compressed_matrix_gemm_stage_3<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          viennacl::cuda_arg<NumericT>(A.handle()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                          viennacl::cuda_arg<NumericT>(B.handle()),
                                                          static_cast<unsigned int>(B.size2()),
                                                          viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                          viennacl::cuda_arg<NumericT>(C.handle()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B),
                                                          viennacl::cuda_arg(scratchpad_offsets),
                                                          viennacl::cuda_arg(scratchpad_indices),
                                                          viennacl::cuda_arg(scratchpad_values));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  std::cout << "Stage 3: " << timer.get() << std::endl;
  std::cout << "----------" << std::endl;
#endif
}

} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif
 
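// Usage sketch (hypothetical caller code; user code normally goes through
// viennacl::linalg::prod rather than calling this backend directly):
//
//   viennacl::compressed_matrix<float> A(m, k), B(k, n), C;
//   /* ... fill A and B ... */
//   viennacl::linalg::cuda::prod_impl(A, B, C);   // C = A * B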