#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP

/** @file viennacl/linalg/cuda/amg_operations.hpp
    @brief Implementations of routines for algebraic multigrid (AMG) using CUDA. Experimental.
*/

#include <cstdlib>
#include <cmath>

#include "viennacl/linalg/detail/amg/amg_base.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace amg
{
__global__ void amg_influence_trivial_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          unsigned int size1,
          unsigned int nnz,
          unsigned int *influences_row,
          unsigned int *influences_id,
          unsigned int *influences_values)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size1; i += global_size)
  {
    unsigned int tmp = row_indices[i];
    influences_row[i] = tmp;
    influences_values[i] = row_indices[i+1] - tmp;
  }

  for (unsigned int i = global_id; i < nnz; i += global_size)
    influences_id[i] = column_indices[i];

  if (global_id == 0)
    influences_row[size1] = nnz; // CSR finalizer entry
}
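// Note: with the trivial influence measure every connection counts as strong, so the
// influence arrays are just a copy of the CSR structure of A: influences_row and
// influences_id mirror the row and column arrays, and influences_values[i] holds the
// number of (strong) influences of row i.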
 
/** @brief Routine for taking all connections in the matrix as strong */
template<typename NumericT>
void amg_influence_trivial(compressed_matrix<NumericT> const & A,
                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
                           viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                             static_cast<unsigned int>(A.size1()),
                                             static_cast<unsigned int>(A.nnz()),
                                             viennacl::cuda_arg(amg_context.influence_jumper_),
                                             viennacl::cuda_arg(amg_context.influence_ids_),
                                             viennacl::cuda_arg(amg_context.influence_values_));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
}
 
/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
template<typename NumericT>
void amg_influence_advanced(compressed_matrix<NumericT> const & A,
                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
                            viennacl::linalg::amg_tag & tag)
{
  (void)A; (void)amg_context; (void)tag;
  throw std::runtime_error("not implemented yet");
}
 
/** @brief Dispatcher for influence processing */
template<typename NumericT>
void amg_influence(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  // TODO: dispatch based on the influence measure chosen in the tag. Currently only the trivial measure is available:
  amg_influence_trivial(A, amg_context, tag);
}

/** @brief Assign IDs to coarse points (host-side exclusive scan over the point types) */
inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
{
  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(),    amg_context.coarse_id_.size());
  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
  viennacl::backend::memory_read(amg_context.coarse_id_.handle(),   0, coarse_ids.raw_size(),  coarse_ids.get());

  unsigned int coarse_id = 0;
  for (std::size_t i=0; i<point_types.size(); ++i)
  {
    coarse_ids.set(i, coarse_id);
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      ++coarse_id;
  }

  amg_context.num_coarse_ = coarse_id;

  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
}
 
/** @brief CUDA kernel initializing the work vectors at each PMIS iteration */
template<typename IndexT>
__global__ void amg_pmis2_init_workdata(IndexT       *work_state,
                                        IndexT       *work_random,
                                        IndexT       *work_index,
                                        IndexT const *point_types,
                                        IndexT const *random_weights,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    switch (point_types[i])
    {
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break;
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE:      work_state[i] = 0; break;
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE:    work_state[i] = 2; break;
    default: break;
    }

    work_index[i]  = i;
    work_random[i] = random_weights[i];
  }
}
 
/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation */
template<typename IndexT>
__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
                                           IndexT const *work_random,
                                           IndexT const *work_index,
                                           IndexT       *work_state2,
                                           IndexT       *work_random2,
                                           IndexT       *work_index2,
                                           IndexT const *influences_row,
                                           IndexT const *influences_id,
                                           unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    // load:
    unsigned int state  = work_state[i];
    unsigned int random = work_random[i];
    unsigned int index  = work_index[i];

    // max over the neighborhood:
    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int influenced_point_id = influences_id[j];

      if (state < work_state[influenced_point_id])
      {
        state  = work_state[influenced_point_id];
        random = work_random[influenced_point_id];
        index  = work_index[influenced_point_id];
      }
      else if (state == work_state[influenced_point_id])
      {
        if (random < work_random[influenced_point_id])
        {
          state  = work_state[influenced_point_id];
          random = work_random[influenced_point_id];
          index  = work_index[influenced_point_id];
        }
        else if (random == work_random[influenced_point_id])
        {
          if (index < work_index[influenced_point_id])
          {
            state  = work_state[influenced_point_id];
            random = work_random[influenced_point_id];
            index  = work_index[influenced_point_id];
          }
        }
      }
    }

    // store:
    work_state2[i]  = state;
    work_random2[i] = random;
    work_index2[i]  = index;
  }
}
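// Note: the nested comparisons above implement a lexicographic max over the triple
// (state, random weight, node index). Coarse points (state 2) dominate undecided ones
// (state 1), which dominate fine points (state 0); ties in state are broken by the
// random weight, and remaining ties by the unique node index, so each neighborhood
// has exactly one well-defined maximum.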
 
/** @brief CUDA kernel for marking MIS and non-MIS nodes */
template<typename IndexT>
__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
                                         IndexT const *work_index,
                                         IndexT *point_types,
                                         IndexT *undecided_buffer,
                                         unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  unsigned int num_undecided = 0;
  for (unsigned int i = global_id; i < size; i += global_size)
  {
    unsigned int max_state  = work_state[i];
    unsigned int max_index  = work_index[i];

    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      if (i == max_index) // this point wins its neighborhood: new MIS node
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
      else if (max_state == 2) // a neighbor is a MIS node (cf. state mapping in amg_pmis2_init_workdata)
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      else
        num_undecided += 1;
    }
  }

  // sum up the number of undecided points within each block:
  __shared__ unsigned int shared_buffer[256];
  shared_buffer[threadIdx.x] = num_undecided;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride];
  }

  if (threadIdx.x == 0)
    undecided_buffer[blockIdx.x] = shared_buffer[0];
}
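// Note: the in-block reduction above requires blockDim.x to be a power of two no larger
// than 256 (the capacity of shared_buffer), and undecided_buffer must hold at least one
// entry per block. The <<<128, 128>>> launches used throughout this file satisfy both.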
 
/** @brief CUDA kernel for resetting non-MIS (i.e. fine) points to undecided, so that subsequent kernels can work on them again */
__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
  }
}
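// Note: after the MIS2 classification each point is either coarse (a MIS node) or fine.
// The kernel above resets all non-coarse points to undecided, so that the aggregation
// stages in amg_coarse_ag() below can assign each of them to a coarse neighbor.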
 
/** @brief AG (aggregation based) coarsening, stage 1: compute a maximal independent set with respect to distance 2 (MIS2) */
template<typename NumericT>
void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
                               viennacl::linalg::amg_tag & tag)
{
  (void)tag;

  // assign a random weight to each point (done on the host):
  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
  for (std::size_t i=0; i<random_weights.size(); ++i)
    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
  random_weights.switch_memory_context(viennacl::traits::context(A));

  // double-buffered work vectors for the (state, random weight, index) triple:
  viennacl::vector<unsigned int> work_state(A.size1(),   viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index(A.size1(),   viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_state2(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index2(A.size1(),  viennacl::traits::context(A));

  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
  viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());

  unsigned int pmis_iters = 0;
  while (num_undecided > 0)
  {
    ++pmis_iters;

    // init work vectors from the current point classification:
    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                          viennacl::cuda_arg(work_random),
                                          viennacl::cuda_arg(work_index),
                                          viennacl::cuda_arg(amg_context.point_types_),
                                          viennacl::cuda_arg(random_weights),
                                          static_cast<unsigned int>(A.size1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata");

    // max operations over neighborhoods (two sweeps for distance 2):
    for (unsigned int r = 0; r < 2; ++r)
    {
      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                               viennacl::cuda_arg(work_random),
                                               viennacl::cuda_arg(work_index),
                                               viennacl::cuda_arg(work_state2),
                                               viennacl::cuda_arg(work_random2),
                                               viennacl::cuda_arg(work_index2),
                                               viennacl::cuda_arg(amg_context.influence_jumper_),
                                               viennacl::cuda_arg(amg_context.influence_ids_),
                                               static_cast<unsigned int>(A.size1()));
      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");

      work_state  = work_state2;
      work_random = work_random2;
      work_index  = work_index2;
    }

    // mark MIS and non-MIS nodes:
    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                           viennacl::cuda_arg(work_index),
                                           viennacl::cuda_arg(amg_context.point_types_),
                                           viennacl::cuda_arg(undecided_buffer),
                                           static_cast<unsigned int>(A.size1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes");

    // get the number of undecided points on the host:
    viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
    num_undecided = 0;
    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
      num_undecided += undecided_buffer_host[i];
  }

  // reset non-coarse points to undecided for the subsequent aggregation stages:
  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                      static_cast<unsigned int>(amg_context.point_types_.size()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
}
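// Note: each iteration of the while-loop above runs two max-sweeps over the direct
// neighborhood, which propagates the winning (state, random, index) triple across all
// points within distance 2. Points winning their distance-2 neighborhood become coarse,
// points dominated by a coarse point become fine, and the loop repeats until no point
// is left undecided.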
 
/** @brief CUDA kernel for propagating coarse aggregate indices to the fine points each coarse point influences */
template<typename IndexT>
__global__ void amg_agg_propagate_coarse_indices(IndexT       *point_types,
                                                 IndexT       *coarse_ids,
                                                 IndexT const *influences_row,
                                                 IndexT const *influences_id,
                                                 unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
    {
      unsigned int coarse_index = coarse_ids[i];

      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        coarse_ids[influenced_point_id] = coarse_index; // set the aggregate index for the fine point

        if (influenced_point_id != i) // any write races between threads are harmless here
          point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      }
    }
  }
}
 
/** @brief CUDA kernel for merging remaining undecided points into a neighboring aggregate */
template<typename IndexT>
__global__ void amg_agg_merge_undecided(IndexT       *point_types,
                                        IndexT       *coarse_ids,
                                        IndexT const *influences_row,
                                        IndexT const *influences_id,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either a coarse or a fine point
        {
          coarse_ids[i] = coarse_ids[influenced_point_id];
          break;
        }
      }
    }
  }
}
 
/** @brief CUDA kernel for marking the points merged in the previous step as fine points */
__global__ void amg_agg_merge_undecided_2(unsigned int *point_types, unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
  }
}
 
/** @brief AG (aggregation based) coarsening (VIENNACL_AMG_COARSE_AG) */
template<typename NumericT>
void amg_coarse_ag(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  amg_influence_trivial(A, amg_context, tag);

  // Stage 1: build aggregates via MIS2:
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
  else
    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");

  enumerate_coarse_points(amg_context);

  // Stage 2: propagate coarse aggregate indices to the fine points each coarse point influences:
  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                                 viennacl::cuda_arg(amg_context.coarse_id_),
                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
                                                 viennacl::cuda_arg(amg_context.influence_ids_),
                                                 static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");

  // Stage 3: merge remaining undecided points into a neighboring aggregate:
  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                        viennacl::cuda_arg(amg_context.coarse_id_),
                                        viennacl::cuda_arg(amg_context.influence_jumper_),
                                        viennacl::cuda_arg(amg_context.influence_ids_),
                                        static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");

  // Stage 4: mark the merged points as fine points (coarse IDs were already set in Stage 3):
  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                          static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
}
 
/** @brief Calls the right coarsening procedure */
template<typename InternalT1>
void amg_coarse(InternalT1 & A,
                viennacl::linalg::detail::amg::amg_level_context & amg_context,
                viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_coarsening_method())
  {
  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
  default: throw std::runtime_error("not implemented yet");
  }
}
 
/** @brief CUDA kernel for filling the interpolation matrix for aggregation-based coarsening */
template<typename NumericT>
__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
                                       unsigned int *P_col_buffer,
                                       NumericT *P_elements,
                                       unsigned int *coarse_ids,
                                       unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    P_row_buffer[i] = i;
    P_col_buffer[i] = coarse_ids[i];
    P_elements[i]   = NumericT(1);
  }

  if (global_id == 0)
    P_row_buffer[size] = size; // CSR finalizer entry
}
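// Note: with plain aggregation each fine point i belongs to exactly one aggregate
// coarse_ids[i], so P carries exactly one unit entry per row and its CSR row-pointer
// array is simply the identity map (P_row_buffer[i] = i).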
 
/** @brief AG (aggregation based) interpolation (VIENNACL_INTERPOL_AG) */
template<typename NumericT>
void amg_interpol_ag(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
                                       viennacl::cuda_arg(amg_context.coarse_id_),
                                       static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");

  P.generate_row_block_information();
}
 
/** @brief CUDA kernel assembling the damped Jacobi smoother matrix needed for smoothed-aggregation interpolation */
template<typename NumericT>
__global__ void amg_interpol_sa_kernel(
          const unsigned int *A_row_indices,
          const unsigned int *A_col_indices,
          const NumericT *A_elements,
          unsigned int A_size1,
          unsigned int A_nnz,
          unsigned int *Jacobi_row_indices,
          unsigned int *Jacobi_col_indices,
          NumericT *Jacobi_elements,
          NumericT omega)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int row = global_id; row < A_size1; row += global_size)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    Jacobi_row_indices[row] = row_begin;

    // Step 1: extract the diagonal entry:
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      if (A_col_indices[j] == row)
      {
        diag = A_elements[j];
        break;
      }
    }

    // Step 2: write the entries of the Jacobi matrix:
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col_index = A_col_indices[j];
      Jacobi_col_indices[j] = col_index;

      if (col_index == row)
        Jacobi_elements[j] = NumericT(1) - omega;
      else
        Jacobi_elements[j] = - omega * A_elements[j] / diag;
    }
  }

  if (global_id == 0)
    Jacobi_row_indices[A_size1] = A_nnz; // CSR finalizer entry
}
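// Note: in matrix form the kernel above assembles the damped Jacobi iteration matrix
//
//   J = I - omega * D^{-1} * A,   with D = diag(A),
//
// reusing the sparsity pattern of A (Jacobi_row_indices is a copy of A_row_indices).
// The smoothed prolongation operator P = J * P_tentative is then formed in
// amg_interpol_sa() below.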
 
/** @brief Smoothed aggregation interpolation (VIENNACL_INTERPOL_SA) */
template<typename NumericT>
void amg_interpol_sa(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  // form the tentative operator:
  amg_interpol_ag(A, P_tentative, amg_context, tag);

  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));

  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                       static_cast<unsigned int>(A.size1()),
                                       static_cast<unsigned int>(A.nnz()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
                                       NumericT(tag.get_jacobi_weight()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");

  P = viennacl::linalg::prod(Jacobi, P_tentative);

  P.generate_row_block_information();
}
 
/** @brief Dispatcher for building the interpolation matrix */
template<typename MatrixT>
void amg_interpol(MatrixT const & A,
                  MatrixT & P,
                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
                  viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_interpolation_method())
  {
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag(A, P, amg_context, tag); break;
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa(A, P, amg_context, tag); break;
  default: throw std::runtime_error("Not implemented yet!");
  }
}
 
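// Hypothetical usage sketch (the names 'ctx' and 'tag' are placeholders): one AMG
// level setup with the routines above amounts to
//
//   viennacl::linalg::detail::amg::amg_level_context ctx;  // sized for A beforehand
//   viennacl::linalg::amg_tag tag;                         // selects coarsening/interpolation
//   amg_coarse(A, ctx, tag);        // influence computation + coarse/fine classification
//   viennacl::compressed_matrix<NumericT> P;
//   amg_interpol(A, P, ctx, tag);   // build the prolongation operator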
template<typename NumericT>
__global__ void compressed_matrix_assign_to_dense(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT *B,
          unsigned int B_row_start,     unsigned int B_col_start,
          unsigned int B_row_inc,       unsigned int B_col_inc,
          unsigned int B_row_size,      unsigned int B_col_size,
          unsigned int B_internal_rows, unsigned int B_internal_cols)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < B_row_size;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j<row_end; j++)
      B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j];
  }
}
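// Note: B is addressed through explicit start/stride/internal-size parameters, so the
// kernel also works on strided sub-matrix views of a padded row-major matrix: entry
// (i, j) of the view maps to the flat index
// (B_row_start + i*B_row_inc) * B_internal_cols + B_col_start + j*B_col_inc.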
 
template<typename NumericT, unsigned int AlignmentV>
void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     viennacl::matrix_base<NumericT> & B)
{
  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(B),
                                                  static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
}
 
template<typename NumericT>
__global__ void compressed_matrix_smooth_jacobi_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT weight,
          const NumericT * x_old,
          NumericT * x_new,
          const NumericT * rhs,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT sum  = NumericT(0);
    NumericT diag = NumericT(1);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == row)
        diag = elements[j];
      else
        sum += elements[j] * x_old[col];
    }
    x_new[row] = weight * (rhs[row] - sum) / diag + (NumericT(1) - weight) * x_old[row];
  }
}
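// Note: one sweep of the kernel above computes, for each row i,
//
//   x_new[i] = weight * (rhs[i] - sum_{j != i} A(i,j) * x_old[j]) / A(i,i)
//            + (1 - weight) * x_old[i],
//
// i.e. a damped Jacobi step with damping factor 'weight'.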
 
/** @brief Damped Jacobi smoother (CUDA version)
*
* @param iterations  Number of smoother iterations
* @param A           Operator matrix for the smoothing
* @param x           The vector smoothing is applied to
* @param x_backup    (Different) vector holding the same values as x
* @param rhs_smooth  The right hand side of the equation for the smoother
* @param weight      Damping factor. 0: no effect of the smoother. 1: undamped Jacobi iteration
*/
template<typename NumericT>
void smooth_jacobi(unsigned int iterations,
                   compressed_matrix<NumericT> const & A,
                   vector<NumericT> & x,
                   vector<NumericT> & x_backup,
                   vector<NumericT> const & rhs_smooth,
                   NumericT weight)
{
  for (unsigned int i=0; i<iterations; ++i)
  {
    x_backup = x;

    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                         static_cast<NumericT>(weight),
                                                         viennacl::cuda_arg(x_backup),
                                                         viennacl::cuda_arg(x),
                                                         viennacl::cuda_arg(rhs_smooth),
                                                         static_cast<unsigned int>(rhs_smooth.size()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
  }
}

} //namespace amg
} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif
 