#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
template<typename NumericT>

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)

    result[i] += alpha * value_p;
    value_r   -= alpha * Ap[i];
    value_p    = value_r + beta * value_p;

    inner_prod_contrib += value_r * value_r;

  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_contrib;

      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

    inner_prod_buffer[blockIdx.x] = shared_array[0];
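// The pattern above: each thread accumulates a private contribution in a grid-stride
// loop, the block folds these contributions in shared memory, and one partial result
// per block is written to inner_prod_buffer; a second pass (host side or a follow-up
// kernel) sums the per-block partials. A minimal stand-alone sketch of that reduction
// step, using the hypothetical name block_reduce_sum_sketch (not part of ViennaCL):
//
//   template<typename NumericT>
//   __global__ void block_reduce_sum_sketch(NumericT const * x, NumericT * partial, unsigned int n)
//   {
//     __shared__ NumericT buf[256];                       // one slot per thread, blockDim.x == 256 assumed
//     NumericT sum = 0;
//     for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
//       sum += x[i] * x[i];                               // e.g. a squared-norm contribution
//     buf[threadIdx.x] = sum;
//     for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
//     {
//       __syncthreads();
//       if (threadIdx.x < stride)
//         buf[threadIdx.x] += buf[threadIdx.x + stride];
//     }
//     if (threadIdx.x == 0)
//       partial[blockIdx.x] = buf[0];                     // one partial sum per block
//   }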
 
template<typename NumericT>

template<unsigned int SubWarpSizeV, typename NumericT>

          const unsigned int * row_indices,
          const unsigned int * column_indices,

          unsigned int buffer_size)

  __shared__ NumericT shared_elements[256];

  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start = blockIdx.x * block_increment;
  const unsigned int block_stop  = min(block_start + block_increment, size);
 
  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;

                    row += blockDim.x / SubWarpSizeV)

    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * p[column_indices[i]];

    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

      Ap[row] = shared_elements[threadIdx.x];
      inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
      inner_prod_pAp  +=                       p[row] * shared_elements[threadIdx.x];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
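// In the blocked CSR kernel above, SubWarpSizeV consecutive threads share one matrix
// row and merge their partial dot products with an XOR ("butterfly") exchange: after
// log2(SubWarpSizeV) steps every thread of the sub-warp holds the full row sum. The
// kernel realizes this via shared_elements; an equivalent formulation for a sub-warp
// of 8 threads using warp shuffles (CUDA 9 or later) would look like the following
// illustrative sketch (hypothetical helper, not part of the ViennaCL API):
//
//   __device__ inline float subwarp8_sum_sketch(float v)
//   {
//     v += __shfl_xor_sync(0xffffffff, v, 1);
//     v += __shfl_xor_sync(0xffffffff, v, 2);
//     v += __shfl_xor_sync(0xffffffff, v, 4);
//     return v;   // every thread of the 8-thread group now holds the group sum
//   }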
 
template<typename NumericT>

          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,

          unsigned int num_blocks,

          unsigned int buffer_size)

  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)

    unsigned int row_start = row_blocks[block_id];
    unsigned int row_stop  = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)

      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];

      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)

        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];

        inner_prod_ApAp += dot_prod * dot_prod;

      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];

          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];

      if (threadIdx.x == 0)

        Ap[row_start] = shared_elements[0];
        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
        inner_prod_pAp  +=       p[row_start] * shared_elements[0];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
 
template<typename NumericT>

  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(A.nnz()) / double(A.size1()) > 6.4)
    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(

  if (double(A.nnz()) / double(A.size1()) > 12.0)
    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(
                                                                        viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                        viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                        viennacl::cuda_arg<NumericT>(A.handle()),

                                                                        buffer_size_per_vector

    pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                           viennacl::cuda_arg<NumericT>(A.handle()),
                                                           static_cast<unsigned int>(A.blocks1()),

                                                           buffer_size_per_vector);
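// Worked example of the dispatch heuristic above (illustrative): for a matrix with
// 2,000,000 nonzeros and 250,000 rows, nnz/size1 = 2,000,000 / 250,000 = 8 nonzeros
// per row on average. That exceeds the 6.4 threshold but not 12.0, so the
// SubWarpSizeV = 8 blocked kernel is the candidate; matrices averaging more than
// 12 nonzeros per row go to the 16-thread variant, while the adaptive row-blocks
// kernel serves as the general fallback path.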
 
template<typename NumericT>

                                                const unsigned int * group_boundaries,

                                                unsigned int buffer_size)

  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)

    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    if (threadIdx.x == 0 && k > 0)

      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];

        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];

    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;

      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;

      inter_results[threadIdx.x] += left;

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])

      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];

  if (local_index + 1 == group_end)

    NumericT Ap_entry = inter_results[threadIdx.x];
    Ap[tmp.x] = Ap_entry;
    inner_prod_ApAp += Ap_entry * Ap_entry;
    inner_prod_pAp  += Ap_entry * p[tmp.x];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
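// Worked example of the carry logic in the COO kernel above (illustrative): the
// kernel processes (row, column) pairs in chunks of blockDim.x entries, runs an
// intra-chunk segmented scan keyed on the row index (the `left` term only adds
// neighbours that belong to the same row), and writes a row's total to Ap when the
// next entry belongs to a different row. Suppose one chunk ends with entries for
// rows [.., 7, 7] and the next chunk starts with [7, 8, ..]: the last thread leaves
// the partial sum for row 7 in inter_results[blockDim.x-1]; in the next chunk,
// thread 0 detects that its entry also belongs to row 7 and adds that carry to its
// own value, so Ap[7] is written exactly once. Had the new chunk started with row 8,
// the carried value would have been flushed to Ap[7] immediately instead.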
 
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
                                               viennacl::cuda_arg<NumericT>(A.handle()),
                                               viennacl::cuda_arg<unsigned int>(A.handle3()),

                                               buffer_size_per_vector);
 
template<typename NumericT>

                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,

                                                unsigned int buffer_size)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)

      sum += val ? p[coords[offset]] * val : NumericT(0);

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
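// ELL layout sketch (illustrative): entries are stored column-major over a padded
// row count internal_row_num, so slot item_id of a given row lives at offset
// row + item_id * internal_row_num; padded slots carry the value 0 and are skipped
// by the `val ? ... : 0` test in the kernel above. Gathering one row therefore reads
//
//   NumericT sum = 0;
//   for (unsigned int item_id = 0; item_id < items_per_row; ++item_id)
//   {
//     NumericT val = elements[row + item_id * internal_row_num];   // 0 for padding
//     if (val != NumericT(0))
//       sum += val * p[coords[row + item_id * internal_row_num]];
//   }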
 
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<NumericT>(A.handle()),

                                                static_cast<unsigned int>(A.maxnnz()),

                                                buffer_size_per_vector);
 
template<typename NumericT>

                                                       const unsigned int * column_indices,
                                                       const unsigned int * block_start,

                                                       unsigned int block_size,

                                                       unsigned int buffer_size)

  unsigned int blocks_per_threadblock = blockDim.x / block_size;
  unsigned int id_in_block = threadIdx.x % block_size;
  unsigned int num_blocks = (size - 1) / block_size + 1;
  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;

  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)

    unsigned int row         = block_idx * block_size + id_in_block;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    for (unsigned int item_id = 0; item_id < num_columns; item_id++)

      unsigned int index = offset + item_id * block_size + id_in_block;

      sum += val ? (p[column_indices[index]] * val) : 0;

      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += sum * p[row];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
 
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),

                                                       buffer_size_per_vector);
 
template<typename NumericT>

                                                const unsigned int * csr_rows,
                                                const unsigned int * csr_cols,

                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,

                                                unsigned int buffer_size)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)

      NumericT val = ell_elements[offset];

      sum += val ? p[ell_coords[offset]] * val : NumericT(0);

    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)

      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
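// HYB note (illustrative): the hybrid format keeps up to items_per_row entries per
// row in a regular ELL slab and spills the remainder into CSR arrays
// (csr_rows/csr_cols/csr_elements). For example, a row with 11 nonzeros and
// items_per_row = 8 stores 8 entries in the ELL part and 3 in the CSR part; the
// kernel above accumulates both contributions into the same `sum` before updating
// Ap[row] and the <Ap,Ap>, <p,Ap> partial sums.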
 
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                viennacl::cuda_arg<unsigned int>(A.handle4()),
                                                viennacl::cuda_arg<NumericT>(A.handle5()),

                                                static_cast<unsigned int>(A.ell_nnz()),

                                                buffer_size_per_vector);
 
template<typename NumericT>

                                                   unsigned int chunk_size,
                                                   unsigned int chunk_offset)

  __shared__ NumericT shared_array[256];
  __shared__ NumericT shared_array_Ap_in_r0[256];

  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];

    if (threadIdx.x < stride) {
      shared_array[threadIdx.x]          += shared_array[threadIdx.x + stride];
      shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];

  alpha = shared_array[0] / shared_array_Ap_in_r0[0];

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)

    value_s = residual[i] - alpha * Ap[i];
    inner_prod_contrib += value_s * value_s;

  shared_array[threadIdx.x] = inner_prod_contrib;

      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
 
template<typename NumericT>

  unsigned int size = static_cast<unsigned int>(s.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
 
template<typename NumericT>

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)

    NumericT value_residual = residual[i];

    value_result   += alpha * value_p + omega * value_s;
    value_residual  = value_s - omega * value_As;
    value_p         = value_residual + beta * (value_p - omega * value_Ap);

    result[i]   = value_result;
    residual[i] = value_residual;

    inner_prod_r_r0star += value_residual * value_r0star;

  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_r_r0star;

      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = shared_array[0];
 
template<typename NumericT>

  (void)buffer_chunk_size;
  unsigned int size = static_cast<unsigned int>(result.size());
 
template<unsigned int SubWarpSizeV, typename NumericT>

          const unsigned int * row_indices,
          const unsigned int * column_indices,

          unsigned int buffer_size,
          unsigned int buffer_offset)

  __shared__ NumericT shared_elements[256];

  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start = blockIdx.x * block_increment;
  const unsigned int block_stop  = min(block_start + block_increment, size);

  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;

                    row += blockDim.x / SubWarpSizeV)

    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * p[column_indices[i]];

    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

      Ap[row] = shared_elements[threadIdx.x];
      inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
      inner_prod_pAp  +=                       p[row] * shared_elements[threadIdx.x];
      inner_prod_r0Ap +=                  r0star[row] * shared_elements[threadIdx.x];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
 
template<typename NumericT>

          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,

          unsigned int num_blocks,

          unsigned int buffer_size,
          unsigned int buffer_offset)

  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)

    unsigned int row_start = row_blocks[block_id];
    unsigned int row_stop  = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)

      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];

      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)

        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];

        inner_prod_ApAp += dot_prod * dot_prod;

      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];

        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];

      if (threadIdx.x == 0)

        Ap[row_start] = shared_elements[0];
        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
        inner_prod_pAp  +=       p[row_start] * shared_elements[0];
        inner_prod_r0Ap +=  r0star[row_start] * shared_elements[0];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;

    if (threadIdx.x < stride)

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
 
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(A.nnz()) / double(A.size1()) > 6.4)
    pipelined_bicgstab_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(

  if (double(A.nnz()) / double(A.size1()) > 12.0)
    pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(
                                                                        viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                        viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                        viennacl::cuda_arg<NumericT>(A.handle()),

    pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                 viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                 viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                                 viennacl::cuda_arg<NumericT>(A.handle()),
                                                                 static_cast<unsigned int>(A.blocks1()),
 
template<typename NumericT>

                                                const unsigned int * group_boundaries,

                                                unsigned int buffer_size,
                                                unsigned int buffer_offset)

  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)

    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    if (threadIdx.x == 0 && k > 0)

      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];

        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
        inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;

    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;

      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;

      inter_results[threadIdx.x] += left;

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])

      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;

  if (local_index + 1 == group_end)

    NumericT Ap_entry = inter_results[threadIdx.x];
    Ap[tmp.x] = Ap_entry;
    inner_prod_ApAp += Ap_entry * Ap_entry;
    inner_prod_pAp  += Ap_entry * p[tmp.x];
    inner_prod_r0Ap += Ap_entry * r0star[tmp.x];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;

    if (threadIdx.x < stride)

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
 
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
                                                     viennacl::cuda_arg<NumericT>(A.handle()),
                                                     viennacl::cuda_arg<unsigned int>(A.handle3()),
 
template<typename NumericT>

                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,

                                                unsigned int buffer_size,
                                                unsigned int buffer_offset)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)

      sum += val ? p[coords[offset]] * val : NumericT(0);

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;

    if (threadIdx.x < stride)

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
 
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                      viennacl::cuda_arg<NumericT>(A.handle()),

                                                      static_cast<unsigned int>(A.maxnnz()),
 
template<typename NumericT>

                                                             const unsigned int * column_indices,
                                                             const unsigned int * block_start,

                                                             unsigned int block_size,

                                                             unsigned int buffer_size,
                                                             unsigned int buffer_offset)

  unsigned int blocks_per_threadblock = blockDim.x / block_size;
  unsigned int id_in_block = threadIdx.x % block_size;
  unsigned int num_blocks = (size - 1) / block_size + 1;
  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;

  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)

    unsigned int row         = block_idx * block_size + id_in_block;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    for (unsigned int item_id = 0; item_id < num_columns; item_id++)

      unsigned int index = offset + item_id * block_size + id_in_block;

      sum += val ? (p[column_indices[index]] * val) : 0;

      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += sum * p[row];
      inner_prod_r0Ap += sum * r0star[row];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;

    if (threadIdx.x < stride)

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
 
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                             viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                             viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                             viennacl::cuda_arg<NumericT>(A.handle()),
 
template<typename NumericT>

                                                      const unsigned int * csr_rows,
                                                      const unsigned int * csr_cols,

                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,

                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)

      NumericT val = ell_elements[offset];

      sum += val ? p[ell_coords[offset]] * val : NumericT(0);

    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)

      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];

  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;

    if (threadIdx.x < stride)

      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
 
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                      viennacl::cuda_arg<NumericT>(A.handle()),
                                                      viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                      viennacl::cuda_arg<unsigned int>(A.handle4()),
                                                      viennacl::cuda_arg<NumericT>(A.handle5()),

                                                      static_cast<unsigned int>(A.ell_nnz()),
 
template <typename T>

                                                    unsigned int vk_offset,

                                                    unsigned int R_offset,
                                                    T const * inner_prod_buffer,
                                                    unsigned int chunk_size,
                                                    T * r_dot_vk_buffer,
                                                    unsigned int chunk_offset,

  __shared__ T shared_array[128];

  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];

    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  norm_vk = sqrt(shared_array[0]);

  T inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    T value_vk = vk[i + vk_offset] / norm_vk;

    inner_prod_contrib += residual[i] * value_vk;

    vk[i + vk_offset] = value_vk;

  shared_array[threadIdx.x] = inner_prod_contrib;

    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];

  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
    R_buffer[R_offset] = norm_vk;
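// What the kernel above computes, written out in host-style pseudocode (illustrative
// only; the device version reduces the <vk, vk> partial sums found in the second
// chunk of inner_prod_buffer and fuses the scaling with the <r, vk> reduction):
//
//   norm_vk            = sqrt(sum_i vk[i]^2);        // from the reduced inner_prod_buffer chunk
//   vk                 = vk / norm_vk;               // normalize the new Krylov vector
//   r_dot_vk           = sum_i residual[i] * vk[i];  // per-block partials go to r_dot_vk_buffer
//   R_buffer[R_offset] = norm_vk;                    // entry of the GMRES least-squares system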
 
template <typename T>

  unsigned int R_offset = offset_in_R;
  unsigned int chunk_size = buffer_chunk_size;
  unsigned int chunk_offset = buffer_chunk_offset;
  unsigned int size = v_k.size();
 
template <typename T>

                                                           T * vi_in_vk_buffer,
                                                           unsigned int chunk_size)

  __shared__ T shared_array[7*128];

  unsigned int k_base = 0;

    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = 0;

    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)

      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];

      if (threadIdx.x < stride) {
        for (unsigned int j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];

    if (threadIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];

    k_base += vecs_in_iteration;
 
template <typename T>

  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size = v_k_size;

  unsigned int k = param_k;

  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
 
template <typename T>

                                                           T const * vi_in_vk_buffer,
                                                           unsigned int chunk_size,

                                                           unsigned int krylov_dim,
                                                           T * inner_prod_buffer)

  __shared__ T shared_array[7*128];

  unsigned int k_base = 0;

    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];

      if (threadIdx.x < stride) {
        for (unsigned int j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];

    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)

      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;

    if (blockIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];

    k_base += vecs_in_iteration;

  shared_array[threadIdx.x] = vk_dot_vk;

    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
 
template <typename T>

  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size = v_k_size;

  unsigned int k = param_k;
  unsigned int krylov = krylov_dim;

  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
 
template <typename T>

                                                     T const * krylov_basis,

                                                     T const * coefficients,

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)

    T value_result = result[i] + coefficients[0] * residual[i];

    for (unsigned int j = 1; j < k; ++j)
      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];

    result[i] = value_result;
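// The loop above applies the GMRES solution update
//
//   result += coefficients[0] * residual
//           + sum over j = 1..k-1 of coefficients[j] * (column j-1 of krylov_basis)
//
// where column j-1 of the Krylov basis starts at offset (j-1) * internal_size in the
// contiguously stored basis array.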
 
template <typename T>

  unsigned int size = v_k_size;

  unsigned int k = param_k;
 
template <typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(A.nnz()) / double(A.size1()) > 6.4)
    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(

  if (double(A.nnz()) / double(A.size1()) > 12.0)
    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>(
                                                                        viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                        viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                        viennacl::cuda_arg<NumericT>(A.handle()),

                                                                        buffer_size_per_vector

    pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                           viennacl::cuda_arg<NumericT>(A.handle()),
                                                           static_cast<unsigned int>(A.blocks1()),

                                                           buffer_size_per_vector);
 
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
                                               viennacl::cuda_arg<T>(A.handle()),
                                               viennacl::cuda_arg<unsigned int>(A.handle3()),

                                               buffer_size_per_vector);
 
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<T>(A.handle()),

                                                static_cast<unsigned int>(A.maxnnz()),

                                                buffer_size_per_vector);
 
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                       viennacl::cuda_arg<T>(A.handle()),

                                                       buffer_size_per_vector);
 
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<T>(A.handle()),
                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                viennacl::cuda_arg<unsigned int>(A.handle4()),
                                                viennacl::cuda_arg<T>(A.handle5()),

                                                static_cast<unsigned int>(A.ell_nnz()),

                                                buffer_size_per_vector);
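// Usage sketch (illustrative, not a complete program): the wrappers in this file are
// normally invoked internally by the pipelined CG/BiCGStab/GMRES solvers, but a direct
// call looks like the following. The inner_prod_buffer holds three chunks of per-block
// partial results (hence the division by 3 above); 256 entries per chunk is assumed
// here to match the 256-block launches used in this file, and n denotes the system size.
//
//   viennacl::compressed_matrix<float> A;                 // assembled elsewhere
//   viennacl::vector<float> p(n), Ap(n);
//   viennacl::vector<float> inner_prod_buffer(3 * 256);
//
//   viennacl::linalg::cuda::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
//   // inner_prod_buffer now holds per-block partial sums of <Ap,Ap> and <p,Ap>
//   // in its second and third chunks, ready for the subsequent reduction kernel.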
 