1 #ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_ 
   46 template<
typename IndexT> 
 
   48           const IndexT * A_row_indices,
 
   49           const IndexT * A_col_indices,
 
   51           unsigned int * L_row_indices)
 
   53   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
   55                     row += gridDim.x * blockDim.x)
 
   57     unsigned int row_begin = A_row_indices[
row];
 
   58     unsigned int row_end   = A_row_indices[
row+1];
 
   60     unsigned int num_entries_L = 0;
 
   61     for (
unsigned int j=row_begin; j<row_end; ++j)
 
   63       unsigned int col = A_col_indices[j];
 
   68     L_row_indices[
row] = num_entries_L;
 
   72 template<
typename NumericT>
 
   74           unsigned int const *A_row_indices,
 
   75           unsigned int const *A_col_indices,
 
   79           unsigned int const *L_row_indices,
 
   80           unsigned int       *L_col_indices,
 
   83   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
   85                     row += gridDim.x * blockDim.x)
 
   87     unsigned int row_begin = A_row_indices[
row];
 
   88     unsigned int row_end   = A_row_indices[
row+1];
 
   90     unsigned int index_L = L_row_indices[
row];
 
   91     for (
unsigned int j = row_begin; j < row_end; ++j)
 
   93       unsigned int col = A_col_indices[j];
 
   98         L_col_indices[index_L] = col;
 
   99         L_elements[index_L]    = value;
 
  106 template<
typename NumericT>
 
  113   extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
 
  114                                    viennacl::cuda_arg<unsigned int>(A.
handle2()),
 
  115                                    static_cast<unsigned int>(A.
size1()),
 
  116                                    viennacl::cuda_arg<unsigned int>(L.
handle1())
 
  130   extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
 
  131                                    viennacl::cuda_arg<unsigned int>(A.
handle2()),
 
  132                                    viennacl::cuda_arg<NumericT>(A.
handle()),
 
  133                                    static_cast<unsigned int>(A.
size1()),
 
  134                                    viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  135                                    viennacl::cuda_arg<unsigned int>(L.
handle2()),
 
  136                                    viennacl::cuda_arg<NumericT>(L.
handle())
 
  147 template<
typename NumericT>
 
  149           unsigned int const *A_row_indices,
 
  150           unsigned int const *A_col_indices,
 
  152           unsigned int A_size1,
 
  156   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  158                     row += gridDim.x * blockDim.x)
 
  160     unsigned int row_begin = A_row_indices[
row];
 
  161     unsigned int row_end   = A_row_indices[
row+1];
 
  163     for (
unsigned int j = row_begin; j < row_end; ++j)
 
  165       unsigned int col = A_col_indices[j];
 
  168         D_elements[
row] = 
NumericT(1) / sqrt(fabs(A_elements[j]));
 
  176 template<
typename NumericT>
 
  178           unsigned int const *R_row_indices,
 
  179           unsigned int const *R_col_indices,
 
  181           unsigned int R_size1,
 
  185   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  187                     row += gridDim.x * blockDim.x)
 
  189     unsigned int row_begin = R_row_indices[
row];
 
  190     unsigned int row_end   = R_row_indices[
row+1];
 
  194     for (
unsigned int j = row_begin; j < row_end; ++j)
 
  195       R_elements[j] *= D_row * D_elements[R_col_indices[j]];
 
  202 template<
typename NumericT>
 
  209   ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
 
  210                                    viennacl::cuda_arg<unsigned int>(A.
handle2()),
 
  211                                    viennacl::cuda_arg<NumericT>(A.
handle()),
 
  212                                    static_cast<unsigned int>(A.
size1()),
 
  218   ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  219                                    viennacl::cuda_arg<unsigned int>(L.
handle2()),
 
  220                                    viennacl::cuda_arg<NumericT>(L.
handle()),
 
  221                                    static_cast<unsigned int>(L.
size1()),
 
  230 template<
typename NumericT>
 
  232           unsigned int const *L_row_indices,
 
  233           unsigned int const *L_col_indices,
 
  236           unsigned int L_size1,
 
  239   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  241                     row += gridDim.x * blockDim.x)
 
  246     unsigned int row_Li_start = L_row_indices[
row];
 
  247     unsigned int row_Li_end   = L_row_indices[
row + 1];
 
  249     for (
unsigned int i = row_Li_start; i < row_Li_end; ++i)
 
  251       unsigned int col = L_col_indices[i];
 
  253       unsigned int row_Lj_start = L_row_indices[col];
 
  254       unsigned int row_Lj_end   = L_row_indices[col + 1];
 
  257       unsigned int index_Lj = row_Lj_start;
 
  258       unsigned int col_Lj = L_col_indices[index_Lj];
 
  260       for (
unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
 
  262         unsigned int col_Li = L_col_indices[index_Li];
 
  265         while (col_Lj < col_Li)
 
  268           col_Lj = L_col_indices[index_Lj];
 
  271         if (col_Lj == col_Li)
 
  272           s -= L_backup[index_Li] * L_backup[index_Lj];
 
  276       L_elements[i] = (
row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]);  
 
  284 template<
typename NumericT>
 
  292   icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  293                                             viennacl::cuda_arg<unsigned int>(L.
handle2()),
 
  294                                             viennacl::cuda_arg<NumericT>(L.
handle()),
 
  295                                             viennacl::cuda_arg<NumericT>(L_backup),
 
  296                                             static_cast<unsigned int>(L.
size1()),
 
  298                                             viennacl::cuda_arg<NumericT>(aij_L.
handle())
 
  307 template<
typename IndexT> 
 
  309           const IndexT * A_row_indices,
 
  310           const IndexT * A_col_indices,
 
  311           unsigned int A_size1,
 
  313           unsigned int * L_row_indices,
 
  315           unsigned int * U_row_indices)
 
  317   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  319                     row += gridDim.x * blockDim.x)
 
  321     unsigned int row_begin = A_row_indices[
row];
 
  322     unsigned int row_end   = A_row_indices[
row+1];
 
  324     unsigned int num_entries_L = 0;
 
  325     unsigned int num_entries_U = 0;
 
  326     for (
unsigned int j=row_begin; j<row_end; ++j)
 
  328       unsigned int col = A_col_indices[j];
 
  335     L_row_indices[
row] = num_entries_L;
 
  336     U_row_indices[
row] = num_entries_U;
 
  340 template<
typename NumericT>
 
  342           unsigned int const *A_row_indices,
 
  343           unsigned int const *A_col_indices,
 
  345           unsigned int A_size1,
 
  347           unsigned int const *L_row_indices,
 
  348           unsigned int       *L_col_indices,
 
  351           unsigned int const *U_row_indices,
 
  352           unsigned int       *U_col_indices,
 
  355   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  357                     row += gridDim.x * blockDim.x)
 
  359     unsigned int row_begin = A_row_indices[
row];
 
  360     unsigned int row_end   = A_row_indices[
row+1];
 
  362     unsigned int index_L = L_row_indices[
row];
 
  363     unsigned int index_U = U_row_indices[
row];
 
  364     for (
unsigned int j = row_begin; j < row_end; ++j)
 
  366       unsigned int col = A_col_indices[j];
 
  371         L_col_indices[index_L] = col;
 
  372         L_elements[index_L]    = value;
 
  378         U_col_indices[index_U] = col;
 
  379         U_elements[index_U]    = value;
 
  386 template<
typename NumericT>
 
  394   extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
 
  395                                     viennacl::cuda_arg<unsigned int>(A.
handle2()),
 
  396                                     static_cast<unsigned int>(A.
size1()),
 
  397                                     viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  398                                     viennacl::cuda_arg<unsigned int>(U.
handle1())
 
  416   extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
 
  417                                     viennacl::cuda_arg<unsigned int>(A.
handle2()),
 
  418                                     viennacl::cuda_arg<NumericT>(A.
handle()),
 
  419                                     static_cast<unsigned int>(A.
size1()),
 
  420                                     viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  421                                     viennacl::cuda_arg<unsigned int>(L.
handle2()),
 
  422                                     viennacl::cuda_arg<NumericT>(L.
handle()),
 
  423                                     viennacl::cuda_arg<unsigned int>(U.
handle1()),
 
  424                                     viennacl::cuda_arg<unsigned int>(U.
handle2()),
 
  425                                     viennacl::cuda_arg<NumericT>(U.
handle())
 
  437 template<
typename NumericT>
 
  445   ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
 
  446                                    viennacl::cuda_arg<unsigned int>(A.
handle2()),
 
  447                                    viennacl::cuda_arg<NumericT>(A.
handle()),
 
  448                                    static_cast<unsigned int>(A.
size1()),
 
  449                                    viennacl::cuda_arg<NumericT>(D.handle())
 
  454   ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  455                                    viennacl::cuda_arg<unsigned int>(L.
handle2()),
 
  456                                    viennacl::cuda_arg<NumericT>(L.
handle()),
 
  457                                    static_cast<unsigned int>(L.
size1()),
 
  458                                    viennacl::cuda_arg<NumericT>(D.handle())
 
  463   ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.
handle1()),
 
  464                                    viennacl::cuda_arg<unsigned int>(U.
handle2()),
 
  465                                    viennacl::cuda_arg<NumericT>(U.
handle()),
 
  466                                    static_cast<unsigned int>(U.
size1()),
 
  467                                    viennacl::cuda_arg<NumericT>(D.handle())
 
  475 template<
typename NumericT>
 
  477           unsigned int const *L_row_indices,
 
  478           unsigned int const *L_col_indices,
 
  481           unsigned int L_size1,
 
  485           unsigned int const *U_trans_row_indices,
 
  486           unsigned int const *U_trans_col_indices,
 
  492   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  494                     row += gridDim.x * blockDim.x)
 
  499     unsigned int row_L_start = L_row_indices[
row];
 
  500     unsigned int row_L_end   = L_row_indices[
row + 1];
 
  502     for (
unsigned int j = row_L_start; j < row_L_end; ++j)
 
  504       unsigned int col = L_col_indices[j];
 
  509       unsigned int row_U_start = U_trans_row_indices[col];
 
  510       unsigned int row_U_end   = U_trans_row_indices[col + 1];
 
  513       unsigned int index_U = row_U_start;
 
  514       unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
 
  516       for (
unsigned int k = row_L_start; k < j; ++k)
 
  518         unsigned int col_L = L_col_indices[k];
 
  521         while (col_U < col_L)
 
  524           col_U = U_trans_col_indices[index_U];
 
  528           sum += L_backup[k] * U_trans_backup[index_U];
 
  532       L_elements[j] = (aij_L[j] - 
sum) / U_trans_backup[row_U_end - 1];  
 
  539     unsigned int row_U_start = U_trans_row_indices[
row];
 
  540     unsigned int row_U_end   = U_trans_row_indices[
row + 1];
 
  541     for (
unsigned int j = row_U_start; j < row_U_end; ++j)
 
  543       unsigned int col = U_trans_col_indices[j];
 
  545       row_L_start = L_row_indices[col];
 
  546       row_L_end   = L_row_indices[col + 1];
 
  549       unsigned int index_L = row_L_start;
 
  550       unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
 
  552       for (
unsigned int k = row_U_start; k < j; ++k)
 
  554         unsigned int col_U = U_trans_col_indices[k];
 
  557         while (col_L < col_U)
 
  560           col_L = L_col_indices[index_L];
 
  564           sum += L_backup[index_L] * U_trans_backup[k];
 
  568       U_trans_elements[j] = aij_U_trans[j] - 
sum;
 
  575 template<
typename NumericT>
 
  589   ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
 
  590                                             viennacl::cuda_arg<unsigned int>(L.
handle2()),
 
  591                                             viennacl::cuda_arg<NumericT>(L.
handle()),
 
  592                                             viennacl::cuda_arg<NumericT>(L_backup),
 
  593                                             static_cast<unsigned int>(L.
size1()),
 
  595                                             viennacl::cuda_arg<NumericT>(aij_L.
handle()),
 
  597                                             viennacl::cuda_arg<unsigned int>(U_trans.
handle1()),
 
  598                                             viennacl::cuda_arg<unsigned int>(U_trans.
handle2()),
 
  599                                             viennacl::cuda_arg<NumericT>(U_trans.
handle()),
 
  600                                             viennacl::cuda_arg<NumericT>(U_backup),
 
  602                                             viennacl::cuda_arg<NumericT>(aij_U_trans.
handle())
 
  610 template<
typename NumericT>
 
  612           unsigned int const *R_row_indices,
 
  613           unsigned int const *R_col_indices,
 
  615           unsigned int R_size1,
 
  619   for (
unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
 
  621                     row += gridDim.x * blockDim.x)
 
  623     unsigned int row_begin = R_row_indices[
row];
 
  624     unsigned int row_end   = R_row_indices[
row+1];
 
  628     for (
unsigned int j = row_begin; j < row_end; ++j)
 
  630       unsigned int col = R_col_indices[j];
 
  633         diag = R_elements[j];
 
  641     for (
unsigned int j = row_begin; j < row_end; ++j)
 
  642       R_elements[j] /= -diag;
 
  648 template<
typename NumericT>
 
  652   ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.
handle1()),
 
  653                                                viennacl::cuda_arg<unsigned int>(R.
handle2()),
 
  654                                                viennacl::cuda_arg<NumericT>(R.
handle()),
 
  655                                                static_cast<unsigned int>(R.
size1()),
 
  656                                                viennacl::cuda_arg<NumericT>(diag_R.
handle())
 
__global__ void ilu_chow_patel_sweep_kernel(unsigned int const *L_row_indices, unsigned int const *L_col_indices, NumericT *L_elements, NumericT const *L_backup, unsigned int L_size1, NumericT const *aij_L, unsigned int const *U_trans_row_indices, unsigned int const *U_trans_col_indices, NumericT *U_trans_elements, NumericT const *U_trans_backup, NumericT const *aij_U_trans)
CUDA kernel for one Chow-Patel-ILU sweep. 
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector. 
__global__ void extract_LU_kernel_2(unsigned int const *A_row_indices, unsigned int const *A_col_indices, NumericT const *A_elements, unsigned int A_size1, unsigned int const *L_row_indices, unsigned int *L_col_indices, NumericT *L_elements, unsigned int const *U_row_indices, unsigned int *U_col_indices, NumericT *U_elements)
Generic size and resize functionality for different vector and matrix types. 
void extract_LU(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L, compressed_matrix< NumericT > &U)
Implementations of vector operations. 
const vcl_size_t & size1() const 
Returns the number of rows. 
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
This file provides the forward declarations for the main types used within ViennaCL. 
__global__ void ilu_form_neumann_matrix_kernel(unsigned int const *R_row_indices, unsigned int const *R_col_indices, NumericT *R_elements, unsigned int R_size1, NumericT *D_elements)
Determines row and column increments for matrices and matrix proxies. 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
const handle_type & handle1() const 
Returns the OpenCL handle to the row index array. 
void extract_L(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L)
void ilu_form_neumann_matrix(compressed_matrix< NumericT > &R, vector< NumericT > &diag_R)
void generate_row_block_information()
Builds the row block information needed for fast sparse matrix-vector multiplications. 
__global__ void extract_L_kernel_2(unsigned int const *A_row_indices, unsigned int const *A_col_indices, NumericT const *A_elements, unsigned int A_size1, unsigned int const *L_row_indices, unsigned int *L_col_indices, NumericT *L_elements)
__global__ void ilu_scale_kernel_1(unsigned int const *A_row_indices, unsigned int const *A_col_indices, NumericT const *A_elements, unsigned int A_size1, NumericT *D_elements)
__global__ void extract_L_kernel_1(const IndexT *A_row_indices, const IndexT *A_col_indices, unsigned int A_size1, unsigned int *L_row_indices)
const handle_type & handle2() const 
Returns the OpenCL handle to the column index array. 
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
void ilu_scale(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L, compressed_matrix< NumericT > &U)
Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L a...
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc. 
void reserve(vcl_size_t new_nonzeros, bool preserve=true)
Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved...
void memory_copy(mem_handle const &src_buffer, mem_handle &dst_buffer, vcl_size_t src_offset, vcl_size_t dst_offset, vcl_size_t bytes_to_copy)
Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'ds...
void icc_scale(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L)
Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L a...
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object. 
__global__ void icc_chow_patel_sweep_kernel(unsigned int const *L_row_indices, unsigned int const *L_col_indices, NumericT *L_elements, NumericT const *L_backup, unsigned int L_size1, NumericT const *aij_L)
CUDA kernel for one Chow-Patel-ICC sweep. 
__global__ void ilu_scale_kernel_2(unsigned int const *R_row_indices, unsigned int const *R_col_indices, NumericT *R_elements, unsigned int R_size1, NumericT *D_elements)
Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the di...
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
vcl_size_t raw_size() const 
Returns the number of bytes of the currently active buffer. 
void exclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an exclusive scan. 
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
void memory_create(mem_handle &handle, vcl_size_t size_in_bytes, viennacl::context const &ctx, const void *host_ptr=NULL)
Creates an array of the specified size. If the second argument is provided, the buffer is initialized...
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. 
void ilu_chow_patel_sweep(compressed_matrix< NumericT > &L, vector< NumericT > const &aij_L, compressed_matrix< NumericT > &U_trans, vector< NumericT > const &aij_U_trans)
Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) ...
Implementation of the ViennaCL scalar class. 
__global__ void extract_LU_kernel_1(const IndexT *A_row_indices, const IndexT *A_col_indices, unsigned int A_size1, unsigned int *L_row_indices, unsigned int *U_row_indices)
const handle_type & handle() const 
Returns the memory handle. 
Simple enable-if variant that uses the SFINAE pattern. 
void icc_chow_patel_sweep(compressed_matrix< NumericT > &L, vector< NumericT > const &aij_L)
Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) ...