1 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_ 
    2 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_ 
   41 template<
typename NumericT>
 
   43           const unsigned int * row_indices,
 
   44           const unsigned int * column_indices,
 
   49   __shared__  
unsigned int col_index_buffer[128];
 
   50   __shared__  
NumericT element_buffer[128];
 
   51   __shared__  
NumericT vector_buffer[128];
 
   53   unsigned int nnz = row_indices[
size];
 
   54   unsigned int current_row = 0;
 
   55   unsigned int row_at_window_start = 0;
 
   56   NumericT current_vector_entry = vector[0];
 
   57   unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
 
   58   unsigned int next_row = row_indices[1];
 
   60   for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
 
   65       element_buffer[threadIdx.x] = elements[i];
 
   66       unsigned int tmp = column_indices[i];
 
   67       col_index_buffer[threadIdx.x] = tmp;
 
   68       vector_buffer[threadIdx.x] = vector[tmp];
 
   77       for (
unsigned int k=0; k<blockDim.x; ++k)
 
   79         if (current_row < size && i+k == next_row) 
 
   81           vector[current_row] = current_vector_entry;
 
   83           if (current_row < size) 
 
   85             next_row = row_indices[current_row+1];
 
   86             current_vector_entry = vector[current_row];
 
   90         if (current_row < size && col_index_buffer[k] < current_row) 
 
   92           if (col_index_buffer[k] < row_at_window_start) 
 
   93             current_vector_entry -= element_buffer[k] * vector_buffer[k];
 
   94           else if (col_index_buffer[k] < current_row) 
 
   95             current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
 
  100       row_at_window_start = current_row;
 
  109 template<
typename NumericT>
 
  111           const unsigned int * row_indices,
 
  112           const unsigned int * column_indices,
 
  117   __shared__  
unsigned int col_index_buffer[128];
 
  118   __shared__  
NumericT element_buffer[128];
 
  119   __shared__  
NumericT vector_buffer[128];
 
  121   unsigned int nnz = row_indices[
size];
 
  122   unsigned int current_row = 0;
 
  123   unsigned int row_at_window_start = 0;
 
  124   NumericT current_vector_entry = vector[0];
 
  126   unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
 
  127   unsigned int next_row = row_indices[1];
 
  129   for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
 
  134       element_buffer[threadIdx.x] = elements[i];
 
  135       unsigned int tmp = column_indices[i];
 
  136       col_index_buffer[threadIdx.x] = tmp;
 
  137       vector_buffer[threadIdx.x] = vector[tmp];
 
  143     if (threadIdx.x == 0)
 
  146       for (
unsigned int k=0; k<blockDim.x; ++k)
 
  148         if (current_row < size && i+k == next_row) 
 
  150           vector[current_row] = current_vector_entry / diagonal_entry;
 
  152           if (current_row < size) 
 
  154             next_row = row_indices[current_row+1];
 
  155             current_vector_entry = vector[current_row];
 
  159         if (current_row < size && col_index_buffer[k] < current_row) 
 
  161           if (col_index_buffer[k] < row_at_window_start) 
 
  162             current_vector_entry -= element_buffer[k] * vector_buffer[k];
 
  163           else if (col_index_buffer[k] < current_row) 
 
  164             current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
 
  166         else if (col_index_buffer[k] == current_row)
 
  167           diagonal_entry = element_buffer[k];
 
  171       row_at_window_start = current_row;
 
  179 template<
typename NumericT>
 
  181           const unsigned int * row_indices,
 
  182           const unsigned int * column_indices,
 
  187   __shared__  
unsigned int col_index_buffer[128];
 
  188   __shared__  
NumericT element_buffer[128];
 
  189   __shared__  
NumericT vector_buffer[128];
 
  191   unsigned int nnz = row_indices[
size];
 
  192   unsigned int current_row = size-1;
 
  193   unsigned int row_at_window_start = size-1;
 
  194   NumericT current_vector_entry = vector[size-1];
 
  195   unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
 
  196   unsigned int next_row = row_indices[size-1];
 
  198   unsigned int i = loop_end + threadIdx.x;
 
  204       element_buffer[threadIdx.x] = elements[i];
 
  205       unsigned int tmp = column_indices[i];
 
  206       col_index_buffer[threadIdx.x] = tmp;
 
  207       vector_buffer[threadIdx.x] = vector[tmp];
 
  213     if (threadIdx.x == 0)
 
  216       for (
unsigned int k2=0; k2<blockDim.x; ++k2)
 
  218         unsigned int k = (blockDim.x - k2) - 1;
 
  223         if (col_index_buffer[k] > row_at_window_start) 
 
  224           current_vector_entry -= element_buffer[k] * vector_buffer[k];
 
  225         else if (col_index_buffer[k] > current_row) 
 
  226           current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
 
  230           vector[current_row] = current_vector_entry;
 
  234             next_row = row_indices[current_row];
 
  235             current_vector_entry = vector[current_row];
 
  242       row_at_window_start = current_row;
 
  256 template<
typename NumericT>
 
  258           const unsigned int * row_indices,
 
  259           const unsigned int * column_indices,
 
  264   __shared__  
unsigned int col_index_buffer[128];
 
  265   __shared__  
NumericT element_buffer[128];
 
  266   __shared__  
NumericT vector_buffer[128];
 
  268   unsigned int nnz = row_indices[
size];
 
  269   unsigned int current_row = size-1;
 
  270   unsigned int row_at_window_start = size-1;
 
  271   NumericT current_vector_entry = vector[size-1];
 
  273   unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
 
  274   unsigned int next_row = row_indices[size-1];
 
  276   unsigned int i = loop_end + threadIdx.x;
 
  282       element_buffer[threadIdx.x] = elements[i];
 
  283       unsigned int tmp = column_indices[i];
 
  284       col_index_buffer[threadIdx.x] = tmp;
 
  285       vector_buffer[threadIdx.x] = vector[tmp];
 
  291     if (threadIdx.x == 0)
 
  294       for (
unsigned int k2=0; k2<blockDim.x; ++k2)
 
  296         unsigned int k = (blockDim.x - k2) - 1;
 
  301         if (col_index_buffer[k] > row_at_window_start) 
 
  302           current_vector_entry -= element_buffer[k] * vector_buffer[k];
 
  303         else if (col_index_buffer[k] > current_row) 
 
  304           current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
 
  305         else if (col_index_buffer[k] == current_row)
 
  306           diagonal_entry = element_buffer[k];
 
  310           vector[current_row] = current_vector_entry / diagonal_entry;
 
  314             next_row = row_indices[current_row];
 
  315             current_vector_entry = vector[current_row];
 
  322       row_at_window_start = current_row;
 
  341 template<
typename NumericT>
 
  343           const unsigned int * row_indices,
 
  344           const unsigned int * column_indices,
 
  353     unsigned int row_start = row_indices[
row];
 
  354     unsigned int row_stop  = row_indices[
row + 1];
 
  355     for (
unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
 
  357       unsigned int col_index = column_indices[entry_index];
 
  359         vector[col_index] -= result_entry * elements[entry_index];
 
  366 template<
typename NumericT>
 
  368           const unsigned int * row_indices,
 
  369           const unsigned int * column_indices,
 
  374   __shared__  
unsigned int row_index_lookahead[256];
 
  375   __shared__  
unsigned int row_index_buffer[256];
 
  377   unsigned int row_index;
 
  378   unsigned int col_index;
 
  380   unsigned int nnz = row_indices[
size];
 
  381   unsigned int row_at_window_start = 0;
 
  382   unsigned int row_at_window_end = 0;
 
  383   unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
 
  385   for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
 
  387     col_index    = (i < nnz) ? column_indices[i] : 0;
 
  388     matrix_entry = (i < nnz) ? elements[i]       : 0;
 
  389     row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < 
size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
 
  395       unsigned int row_index_inc = 0;
 
  396       while (i >= row_index_lookahead[row_index_inc + 1])
 
  398       row_index = row_at_window_start + row_index_inc;
 
  399       row_index_buffer[threadIdx.x] = row_index;
 
  404       row_index_buffer[threadIdx.x] = size - 1;
 
  409     row_at_window_start = row_index_buffer[0];
 
  410     row_at_window_end   = row_index_buffer[blockDim.x - 1];
 
  413     for (
unsigned int row = row_at_window_start; 
row <= row_at_window_end; ++
row)
 
  417       if ( (row_index == 
row) && (col_index > 
row) )
 
  418         vector[col_index] -= result_entry * matrix_entry;
 
  423     row_at_window_start = row_at_window_end;
 
  428 template<
typename NumericT>
 
  430           const unsigned int * row_indices,
 
  431           const unsigned int * column_indices,
 
  437   __shared__  
unsigned int row_index_lookahead[256];
 
  438   __shared__  
unsigned int row_index_buffer[256];
 
  440   unsigned int row_index;
 
  441   unsigned int col_index;
 
  443   unsigned int nnz = row_indices[
size];
 
  444   unsigned int row_at_window_start = 0;
 
  445   unsigned int row_at_window_end = 0;
 
  446   unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
 
  448   for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
 
  450     col_index    = (i < nnz) ? column_indices[i] : 0;
 
  451     matrix_entry = (i < nnz) ? elements[i]       : 0;
 
  452     row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < 
size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
 
  458       unsigned int row_index_inc = 0;
 
  459       while (i >= row_index_lookahead[row_index_inc + 1])
 
  461       row_index = row_at_window_start + row_index_inc;
 
  462       row_index_buffer[threadIdx.x] = row_index;
 
  467       row_index_buffer[threadIdx.x] = size - 1;
 
  472     row_at_window_start = row_index_buffer[0];
 
  473     row_at_window_end   = row_index_buffer[blockDim.x - 1];
 
  476     for (
unsigned int row = row_at_window_start; 
row <= row_at_window_end; ++
row)
 
  480       if ( (row_index == 
row) && (col_index > 
row) )
 
  481         vector[col_index] -= result_entry * matrix_entry;
 
  486     row_at_window_start = row_at_window_end;
 
  490   for (
unsigned int i = threadIdx.x; i < size; i += blockDim.x)
 
  491     vector[i] /= diagonal_entries[i];
 
  496 template<
typename NumericT>
 
  498           const unsigned int * row_indices,
 
  499           const unsigned int * column_indices,
 
  504   __shared__  
unsigned int row_index_lookahead[256];
 
  505   __shared__  
unsigned int row_index_buffer[256];
 
  507   unsigned int row_index;
 
  508   unsigned int col_index;
 
  510   unsigned int nnz = row_indices[
size];
 
  511   unsigned int row_at_window_start = 
size;
 
  512   unsigned int row_at_window_end;
 
  513   unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
 
  515   for (
unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
 
  517     unsigned int i = (nnz - i2) - 1;
 
  518     col_index    = (i2 < nnz) ? column_indices[i] : 0;
 
  519     matrix_entry = (i2 < nnz) ? elements[i]       : 0;
 
  520     row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
 
  526       unsigned int row_index_dec = 0;
 
  527       while (row_index_lookahead[row_index_dec] > i)
 
  529       row_index = row_at_window_start - row_index_dec;
 
  530       row_index_buffer[threadIdx.x] = row_index;
 
  535       row_index_buffer[threadIdx.x] = 0;
 
  540     row_at_window_start = row_index_buffer[0];
 
  541     row_at_window_end   = row_index_buffer[blockDim.x - 1];
 
  544     for (
unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
 
  546       unsigned int row = row_at_window_start - row2;
 
  549       if ( (row_index == row) && (col_index < 
row) )
 
  550         vector[col_index] -= result_entry * matrix_entry;
 
  555     row_at_window_start = row_at_window_end;
 
  562 template<
typename NumericT>
 
  564           const unsigned int * row_indices,
 
  565           const unsigned int * column_indices,
 
  574   for (
unsigned int row2 = 0; row2 < 
size; ++row2)
 
  576     unsigned int row = (size - row2) - 1;
 
  577     result_entry = vector[
row] / diagonal_entries[
row];
 
  579     unsigned int row_start = row_indices[
row];
 
  580     unsigned int row_stop  = row_indices[row + 1];
 
  581     for (
unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; ++entry_index)
 
  583       unsigned int col_index = column_indices[entry_index];
 
  585         vector[col_index] -= result_entry * elements[entry_index];
 
  590     if (threadIdx.x == 0)
 
  591       vector[
row] = result_entry;
 
  596 template<
typename NumericT>
 
  598           const unsigned int * row_indices,
 
  599           const unsigned int * column_indices,
 
  605   __shared__  
unsigned int row_index_lookahead[256];
 
  606   __shared__  
unsigned int row_index_buffer[256];
 
  608   unsigned int row_index;
 
  609   unsigned int col_index;
 
  611   unsigned int nnz = row_indices[
size];
 
  612   unsigned int row_at_window_start = 
size;
 
  613   unsigned int row_at_window_end;
 
  614   unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
 
  616   for (
unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
 
  618     unsigned int i = (nnz - i2) - 1;
 
  619     col_index    = (i2 < nnz) ? column_indices[i] : 0;
 
  620     matrix_entry = (i2 < nnz) ? elements[i]       : 0;
 
  621     row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
 
  627       unsigned int row_index_dec = 0;
 
  628       while (row_index_lookahead[row_index_dec] > i)
 
  630       row_index = row_at_window_start - row_index_dec;
 
  631       row_index_buffer[threadIdx.x] = row_index;
 
  636       row_index_buffer[threadIdx.x] = 0;
 
  641     row_at_window_start = row_index_buffer[0];
 
  642     row_at_window_end   = row_index_buffer[blockDim.x - 1];
 
  645     for (
unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
 
  647       unsigned int row = row_at_window_start - row2;
 
  650       if ( (row_index == row) && (col_index < 
row) )
 
  651         vector[col_index] -= result_entry * matrix_entry;
 
  656     row_at_window_start = row_at_window_end;
 
  661   for (
unsigned int i = threadIdx.x; i < size; i += blockDim.x)
 
  662     vector[i] /= diagonal_entries[i];
 
  667 template<
typename NumericT>
 
  669           const unsigned int * row_jumper_L,      
 
  670           const unsigned int * column_indices_L,
 
  672           const unsigned int * block_offsets,
 
  676   unsigned int col_start = block_offsets[2*blockIdx.x];
 
  677   unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
 
  678   unsigned int row_start = row_jumper_L[col_start];
 
  679   unsigned int row_stop;
 
  682   if (col_start >= col_stop)
 
  686   for (
unsigned int col = col_start; col < col_stop; ++col)
 
  688     result_entry = result[col];
 
  689     row_stop = row_jumper_L[col + 1];
 
  690     for (
unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
 
  691       result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
 
  692     row_start = row_stop; 
 
  699 template<
typename NumericT>
 
  701           const unsigned int * row_jumper_U,      
 
  702           const unsigned int * column_indices_U,
 
  705           const unsigned int * block_offsets,
 
  709   unsigned int col_start = block_offsets[2*blockIdx.x];
 
  710   unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
 
  711   unsigned int row_start;
 
  712   unsigned int row_stop;
 
  715   if (col_start >= col_stop)
 
  719   for (
unsigned int iter = 0; iter < col_stop - col_start; ++iter)
 
  721     unsigned int col = (col_stop - iter) - 1;
 
  722     result_entry = result[col] / diagonal_U[col];
 
  723     row_start = row_jumper_U[col];
 
  724     row_stop  = row_jumper_U[col + 1];
 
  725     for (
unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
 
  726       result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
 
  731   for (
unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
 
  732     result[col] /= diagonal_U[col];
 
__global__ void csr_unit_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_forward_kernel2(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_backward_kernel2(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *diagonal_entries, NumericT *vector, unsigned int size)
This file provides the forward declarations for the main types used within ViennaCL. 
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
__global__ void csr_trans_unit_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_block_trans_lu_backward(const unsigned int *row_jumper_U, const unsigned int *column_indices_U, const NumericT *elements_U, const NumericT *diagonal_U, const unsigned int *block_offsets, NumericT *result, unsigned int size)
__global__ void csr_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_unit_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void csr_trans_unit_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *diagonal_entries, NumericT *vector, unsigned int size)
__global__ void csr_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *diagonal_entries, NumericT *vector, unsigned int size)
__global__ void csr_block_trans_unit_lu_forward(const unsigned int *row_jumper_L, const unsigned int *column_indices_L, const NumericT *elements_L, const unsigned int *block_offsets, NumericT *result, unsigned int size)