1 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP 
    2 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP 
   40 template<
typename NumericT>
 
   43           unsigned int A_start1, 
unsigned int A_start2,
 
   44           unsigned int A_inc1,   
unsigned int A_inc2,
 
   45           unsigned int A_size1,  
unsigned int A_size2,
 
   46           unsigned int A_internal_size1, 
unsigned int A_internal_size2,
 
   50           unsigned int B_start1, 
unsigned int B_start2,
 
   51           unsigned int B_inc1,   
unsigned int B_inc2,
 
   52           unsigned int B_size1,  
unsigned int B_size2,
 
   53           unsigned int B_internal_size1, 
unsigned int B_internal_size2,
 
   61   for (
unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
 
   63     unsigned int row = A_size1 - 1 - row_cnt;
 
   72           B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
 
   73                                                                                                               : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
 
   75           B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
 
   76                                                                                                               : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
 
   83       temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
 
   85       temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
 
   88     for  (
unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
 
   91         entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
 
   93         entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
 
   96         B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
 
   98         B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
 
  106 template<
typename NumericT>
 
  109           unsigned int A_start1, 
unsigned int A_start2,
 
  110           unsigned int A_inc1,   
unsigned int A_inc2,
 
  111           unsigned int A_size1,  
unsigned int A_size2,
 
  112           unsigned int A_internal_size1, 
unsigned int A_internal_size2,
 
  116           unsigned int B_start1, 
unsigned int B_start2,
 
  117           unsigned int B_inc1,   
unsigned int B_inc2,
 
  118           unsigned int B_size1,  
unsigned int B_size2,
 
  119           unsigned int B_internal_size1, 
unsigned int B_internal_size2,
 
  127   for (
unsigned int row = 0; 
row < A_size1; ++
row)
 
  134       if (threadIdx.x == 0)
 
  137           B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
 
  138                                                                                                               : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
 
  140           B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
 
  141                                                                                                               : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
 
  148       temp = B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
 
  150       temp = B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
 
  153     for  (
unsigned int elim = 
row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
 
  156         entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)];
 
  158         entry_A = A[(elim * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1];
 
  161         B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
 
  163         B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
 
  176   template<
typename TagT>
 
  182   template<
typename TagT>
 
  188   template<
typename Matrix1T, 
typename Matrix2T, 
typename SolverTagT>
 
  191                           SolverTagT 
const & tag)
 
  196     dim3 grid(B.size2());
 
  252 template<
typename NumericT, 
typename SolverTagT>
 
  265 template<
typename NumericT>
 
  268           unsigned int A_start1, 
unsigned int A_start2,
 
  269           unsigned int A_inc1,   
unsigned int A_inc2,
 
  270           unsigned int A_size1,  
unsigned int A_size2,
 
  271           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  273           unsigned int v_start,
 
  277           unsigned int options)
 
  280   unsigned int unit_diagonal_flag  = (options & (1 << 0));
 
  282   unsigned int is_lower_solve      = (options & (1 << 2));
 
  284   for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    
 
  286     row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
 
  287     if (!unit_diagonal_flag)
 
  290       if (threadIdx.x == 0)
 
  291         v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
 
  296     temp = v[row * v_inc + v_start];
 
  298     for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
 
  299             elim < (is_lower_solve ? A_size1 : row);
 
  301       v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2)];
 
  306 template<
typename NumericT>
 
  309           unsigned int A_start1, 
unsigned int A_start2,
 
  310           unsigned int A_inc1,   
unsigned int A_inc2,
 
  311           unsigned int A_size1,  
unsigned int A_size2,
 
  312           unsigned int A_internal_size1,  
unsigned int A_internal_size2,
 
  314           unsigned int v_start,
 
  317           unsigned int options)
 
  320   unsigned int unit_diagonal_flag  = (options & (1 << 0));
 
  322   unsigned int is_lower_solve      = (options & (1 << 2));
 
  324   for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    
 
  326     row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
 
  327     if (!unit_diagonal_flag)
 
  330       if (threadIdx.x == 0)
 
  331         v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
 
  336     temp = v[row * v_inc + v_start];
 
  338     for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
 
  339             elim < (is_lower_solve ? A_size1 : row);
 
  341       v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1];
 
  353   template<
typename MatrixT, 
typename VectorT>
 
  356                                  unsigned int options)
 
  397 template<
typename NumericT, 
typename SolverTagT>
 
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
void inplace_solve_vector_impl(MatrixT const &mat, VectorT &vec, unsigned int options)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
__global__ void triangular_substitute_inplace_row_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Implementation of the dense matrix class. 
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
__global__ void matrix_matrix_lower_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool unit_diagonal)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) 
A tag class representing a lower triangular matrix. 
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL. 
result_of::size_type< T >::type start1(T const &obj)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void matrix_matrix_upper_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool unit_diagonal)
bool is_unit_solve(TagT const &tag)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
result_of::size_type< T >::type start2(T const &obj)
A tag class representing an upper triangular matrix. 
result_of::size_type< T >::type start(T const &obj)
__global__ void triangular_substitute_inplace_col_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void inplace_solve_impl(Matrix1T const &A, Matrix2T &B, SolverTagT const &tag)
Common routines for CUDA execution. 
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
A tag class representing a lower triangular matrix with unit diagonal. 
unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. 
A tag class representing an upper triangular matrix with unit diagonal. 
bool is_upper_solve(TagT const &tag)