1 #ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_ 
    2 #define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_ 
   41 #ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE 
   42   #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000 
   60   template<
typename NumericT>
 
   73     value_type   
const * elements    = detail::extract_raw_pointer<value_type>(A.
handle());
 
   74     unsigned int const *  row_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle1());
 
   75     unsigned int const *  col_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle2());
 
   76     value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
   78     value_type inner_prod_ApAp = 0;
 
   79     value_type inner_prod_pAp = 0;
 
   80     value_type inner_prod_Ap_r0star = 0;
 
   81     for (
long row = 0; row < static_cast<long>(A.
size1()); ++
row)
 
   88         dot_prod += elements[i] * p_buf[col_buffer[i]];
 
   92       inner_prod_ApAp += dot_prod * 
dot_prod;
 
   93       inner_prod_pAp  += val_p_diag * 
dot_prod;
 
   94       inner_prod_Ap_r0star += r0star ? dot_prod * r0star[
static_cast<vcl_size_t>(
row)] : value_type(0);
 
   97     data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
 
   98     data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
 
  100       data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
 
  111   template<
typename NumericT>
 
  124     value_type   
const * elements     = detail::extract_raw_pointer<value_type>(A.
handle());
 
  125     unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle12());
 
  126     value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  134       Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];
 
  138     value_type inner_prod_ApAp = 0;
 
  139     value_type inner_prod_pAp = 0;
 
  140     value_type inner_prod_Ap_r0star = 0;
 
  146       inner_prod_ApAp += value_Ap * value_Ap;
 
  147       inner_prod_pAp  += value_Ap * value_p;
 
  148       inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
 
  151     data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
 
  152     data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
 
  154       data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
 
  164   template<
typename NumericT>
 
  177     value_type   
const * elements     = detail::extract_raw_pointer<value_type>(A.
handle());
 
  178     unsigned int const * coords       = detail::extract_raw_pointer<unsigned int>(A.
handle2());
 
  179     value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  181     value_type inner_prod_ApAp = 0;
 
  182     value_type inner_prod_pAp = 0;
 
  183     value_type inner_prod_Ap_r0star = 0;
 
  189       for (
unsigned int item_id = 0; item_id < A.
internal_maxnnz(); ++item_id)
 
  192         value_type val = elements[offset];
 
  195           sum += (p_buf[coords[offset]] * val);
 
  199       inner_prod_ApAp += sum * 
sum;
 
  200       inner_prod_pAp  += val_p_diag * 
sum;
 
  201       inner_prod_Ap_r0star += r0star ? sum * r0star[
row] : value_type(0);
 
  204     data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
 
  205     data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
 
  207       data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
 
  217   template<
typename NumericT, 
typename IndexT>
 
  230     value_type 
const * elements          = detail::extract_raw_pointer<value_type>(A.
handle());
 
  231     IndexT     
const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.
handle1());
 
  232     IndexT     
const * column_indices    = detail::extract_raw_pointer<IndexT>(A.
handle2());
 
  233     IndexT     
const * block_start       = detail::extract_raw_pointer<IndexT>(A.
handle3());
 
  234     value_type         * data_buffer     = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  239     value_type inner_prod_ApAp = 0;
 
  240     value_type inner_prod_pAp = 0;
 
  241     value_type inner_prod_Ap_r0star = 0;
 
  242     for (
vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
 
  244       vcl_size_t current_columns_per_block = columns_per_block[block_idx];
 
  246       for (
vcl_size_t i=0; i<result_values.size(); ++i)
 
  247         result_values[i] = 0;
 
  249       for (IndexT column_entry_index = 0;
 
  250                   column_entry_index < current_columns_per_block;
 
  251                 ++column_entry_index)
 
  256         for (IndexT row_in_block = 0; row_in_block < A.
rows_per_block(); ++row_in_block)
 
  258           value_type val = elements[stride_start + row_in_block];
 
  260           result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
 
  265       for (IndexT row_in_block = 0; row_in_block < A.
rows_per_block(); ++row_in_block)
 
  270           value_type row_result = result_values[row_in_block];
 
  272           Ap_buf[
row] = row_result;
 
  273           inner_prod_ApAp += row_result * row_result;
 
  274           inner_prod_pAp  += p_buf[
row] * row_result;
 
  275           inner_prod_Ap_r0star += r0star ? row_result * r0star[
row] : value_type(0);
 
  280     data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
 
  281     data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
 
  283       data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
 
  293   template<
typename NumericT>
 
  303     typedef unsigned int index_type;
 
  307     value_type 
const * elements          = detail::extract_raw_pointer<value_type>(A.
handle());
 
  308     index_type 
const * coords            = detail::extract_raw_pointer<index_type>(A.
handle2());
 
  309     value_type 
const * csr_elements      = detail::extract_raw_pointer<value_type>(A.
handle5());
 
  310     index_type 
const * csr_row_buffer    = detail::extract_raw_pointer<index_type>(A.
handle3());
 
  311     index_type 
const * csr_col_buffer    = detail::extract_raw_pointer<index_type>(A.
handle4());
 
  312     value_type         * data_buffer     = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  314     value_type inner_prod_ApAp = 0;
 
  315     value_type inner_prod_pAp = 0;
 
  316     value_type inner_prod_Ap_r0star = 0;
 
  328         value_type val = elements[offset];
 
  331           sum += p_buf[coords[offset]] * val;
 
  340       for (
vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
 
  341         sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];
 
  344       inner_prod_ApAp += sum * 
sum;
 
  345       inner_prod_pAp  += val_p_diag * 
sum;
 
  346       inner_prod_Ap_r0star += r0star ? sum * r0star[
row] : value_type(0);
 
  349     data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
 
  350     data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
 
  352       data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
 
  366 template<
typename NumericT>
 
  377   value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
 
  378   value_type       * data_p      = detail::extract_raw_pointer<value_type>(p);
 
  379   value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
 
  380   value_type 
const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
 
  381   value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  386   value_type inner_prod_r = 0;
 
  387   for (
long i = 0; i < static_cast<long>(
size); ++i)
 
  389     value_type value_p = data_p[
static_cast<vcl_size_t>(i)];
 
  390     value_type value_r = data_r[
static_cast<vcl_size_t>(i)];
 
  393     data_result[
static_cast<vcl_size_t>(i)] += alpha * value_p;
 
  394     value_r -= alpha * data_Ap[
static_cast<vcl_size_t>(i)];
 
  395     value_p  = value_r + beta * value_p;
 
  396     inner_prod_r += value_r * value_r;
 
  402   data_buffer[0] = inner_prod_r;
 
  412 template<
typename NumericT>
 
  430 template<
typename NumericT>
 
  447 template<
typename NumericT>
 
  464 template<
typename NumericT, 
typename IndexT>
 
  483 template<
typename NumericT>
 
  503 template<
typename NumericT>
 
  513   value_type       * data_s      = detail::extract_raw_pointer<value_type>(s);
 
  514   value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
 
  515   value_type 
const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
 
  516   value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  522   value_type r_in_r0 = 0;
 
  523   value_type Ap_in_r0 = 0;
 
  524   for (
vcl_size_t i=0; i<buffer_chunk_size; ++i)
 
  526      r_in_r0 += data_buffer[i];
 
  527     Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
 
  529   value_type alpha = r_in_r0 / Ap_in_r0;
 
  532   value_type inner_prod_s = 0;
 
  533   for (
long i = 0; i < static_cast<long>(
size); ++i)
 
  535     value_type value_s  = data_s[
static_cast<vcl_size_t>(i)];
 
  537     value_s = data_r[
static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];
 
  538     inner_prod_s += value_s * value_s;
 
  543   data_buffer[buffer_chunk_offset] = inner_prod_s;
 
  553  template<
typename NumericT>
 
  563    value_type       * data_result   = detail::extract_raw_pointer<value_type>(result);
 
  564    value_type       * data_p        = detail::extract_raw_pointer<value_type>(p);
 
  565    value_type 
const * data_s        = detail::extract_raw_pointer<value_type>(s);
 
  566    value_type       * data_residual = detail::extract_raw_pointer<value_type>(residual);
 
  567    value_type 
const * data_As       = detail::extract_raw_pointer<value_type>(As);
 
  568    value_type 
const * data_Ap       = detail::extract_raw_pointer<value_type>(Ap);
 
  569    value_type 
const * data_r0star   = detail::extract_raw_pointer<value_type>(r0star);
 
  570    value_type       * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  574    value_type inner_prod_r_r0star = 0;
 
  575    for (
long i = 0; i < static_cast<long>(
size); ++i)
 
  578      value_type value_result   = data_result[index];
 
  579      value_type value_p        = data_p[index];
 
  580      value_type value_s        = data_s[index];
 
  581      value_type value_residual = data_residual[index];
 
  582      value_type value_As       = data_As[index];
 
  583      value_type value_Ap       = data_Ap[index];
 
  584      value_type value_r0star   = data_r0star[index];
 
  586      value_result   += alpha * value_p + omega * value_s;
 
  587      value_residual  = value_s - omega * value_As;
 
  588      value_p         = value_residual + beta * (value_p - omega * value_Ap);
 
  589      inner_prod_r_r0star += value_residual * value_r0star;
 
  591      data_result[index]   = value_result;
 
  592      data_residual[index] = value_residual;
 
  593      data_p[index]        = value_p;
 
  596    (void)buffer_chunk_size; 
 
  597    data_buffer[0] = inner_prod_r_r0star;
 
  606  template<
typename NumericT>
 
  615    NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
 
  626  template<
typename NumericT>
 
  635    NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
 
  646  template<
typename NumericT>
 
  655    NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
 
  666  template<
typename NumericT, 
typename IndexT>
 
  675    NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
 
  686  template<
typename NumericT>
 
  695    NumericT const * data_r0star   = detail::extract_raw_pointer<NumericT>(r0star);
 
  710 template <
typename T>
 
  720   typedef T        value_type;
 
  722   value_type       * data_v_k      = detail::extract_raw_pointer<value_type>(v_k);
 
  723   value_type 
const * data_residual = detail::extract_raw_pointer<value_type>(residual);
 
  724   value_type       * data_R        = detail::extract_raw_pointer<value_type>(R_buffer);
 
  725   value_type 
const * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
 
  726   value_type       * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);
 
  733   value_type norm_vk = 0;
 
  734   for (
vcl_size_t i=0; i<buffer_chunk_size; ++i)
 
  735    norm_vk += data_buffer[i + buffer_chunk_size];
 
  736   norm_vk = std::sqrt(norm_vk);
 
  737   data_R[offset_in_R] = norm_vk;
 
  740   value_type inner_prod_r_dot_vk = 0;
 
  741   for (
long i = 0; i < static_cast<long>(
size); ++i)
 
  743     value_type value_vk = data_v_k[
static_cast<vcl_size_t>(i) + vk_start] / norm_vk;
 
  745     inner_prod_r_dot_vk += data_residual[
static_cast<vcl_size_t>(i)] * value_vk;
 
  747     data_v_k[
static_cast<vcl_size_t>(i) + vk_start] = value_vk;
 
  750   data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
 
  759 template <
typename T>
 
  767   typedef T        value_type;
 
  769   value_type 
const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
 
  770   value_type       * data_inner_prod   = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
 
  774     data_inner_prod[j*buffer_chunk_size] = value_type(0);
 
  779     value_type value_vk = data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size];
 
  782       data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
 
  791 template <
typename T>
 
  802   typedef T        value_type;
 
  804   value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
 
  806   std::vector<T> values_vi_in_vk(k);
 
  809   for (std::size_t i=0; i<k; ++i)
 
  810     for (
vcl_size_t j=0; j<buffer_chunk_size; ++j)
 
  811       values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];
 
  815   value_type norm_vk = 0;
 
  818     value_type value_vk = data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size];
 
  821       value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];
 
  823     norm_vk += value_vk * value_vk;
 
  824     data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
 
  828   for (std::size_t i=0; i<k; ++i)
 
  829     R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];
 
  831   inner_prod_buffer[buffer_chunk_size] = norm_vk;
 
  835 template <
typename T>
 
  844   typedef T        value_type;
 
  846   value_type       * data_result       = detail::extract_raw_pointer<value_type>(result);
 
  847   value_type 
const * data_residual     = detail::extract_raw_pointer<value_type>(residual);
 
  848   value_type 
const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
 
  849   value_type 
const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);
 
  853     value_type value_result = data_result[i];
 
  855     value_result += data_coefficients[0] * data_residual[i];
 
  857       value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];
 
  859     data_result[i] = value_result;
 
  865 template <
typename MatrixType, 
typename T>
 
vcl_size_t internal_ellnnz() const 
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
void pipelined_gmres_normalize_vk(vector_base< T > &v_k, vector_base< T > const &residual, vector_base< T > &R_buffer, vcl_size_t offset_in_R, vector_base< T > const &inner_prod_buffer, vector_base< T > &r_dot_vk_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a vector normalization needed for an efficient pipelined GMRES algorithm. 
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector. 
Generic size and resize functionality for different vector and matrix types. 
const handle_type & handle3() const 
const vcl_size_t & size1() const 
Returns the number of rows. 
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
const handle_type & handle() const 
const handle_type & handle12() const 
Returns the OpenCL handle to the (row, column) index array. 
vcl_size_t internal_size1() const 
void pipelined_cg_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, NumericT beta, vector_base< NumericT > &inner_prod_buffer)
Performs a joint vector update operation needed for an efficient pipelined CG algorithm. 
void pipelined_gmres_gram_schmidt_stage2(vector_base< T > &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t k, vector_base< T > const &vi_in_vk_buffer, vector_base< T > &R_buffer, vcl_size_t krylov_dim, vector_base< T > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
Computes the second reduction stage for multiple inner products , i=0..k-1, then updates v_k -=  v_i and computes the first reduction stage for ||v_k||. 
This file provides the forward declarations for the main types used within ViennaCL. 
vcl_size_t nnz() const 
Returns the number of nonzero entries. 
Determines row and column increments for matrices and matrix proxies. 
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
const handle_type & handle4() const 
vcl_size_t rows_per_block() const 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
void pipelined_bicgstab_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab a...
const handle_type & handle1() const 
Returns the OpenCL handle to the row index array. 
vcl_size_t internal_size1() const 
void pipelined_bicgstab_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, NumericT omega, vector_base< NumericT > const &s, vector_base< NumericT > &residual, vector_base< NumericT > const &As, NumericT beta, vector_base< NumericT > const &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm...
const handle_type & handle2() const 
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
Sparse matrix class using the ELLPACK format for storing the nonzeros. 
Sparse matrix class using the sliced ELLPACK with parameters C, . 
result_of::size_type< T >::type start(T const &obj)
const handle_type & handle2() const 
Returns the OpenCL handle to the column index array. 
Common routines for single-threaded or OpenMP-enabled execution on CPU. 
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc. 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
void pipelined_cg_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > &inner_prod_buffer)
Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorit...
void pipelined_prod_impl(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, NumericT const *r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined C...
void pipelined_bicgstab_update_s(vector_base< NumericT > &s, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm...
size_type size() const 
Returns the length of the vector (cf. std::vector) 
void pipelined_gmres_update_result(vector_base< T > &result, vector_base< T > const &residual, vector_base< T > const &krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vector_base< T > const &coefficients, vcl_size_t k)
Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1}. 
const handle_type & handle5() const 
vcl_size_t internal_maxnnz() const 
Defines the action of certain unary and binary operators and its arguments (for host execution)...
void pipelined_gmres_gram_schmidt_stage1(vector_base< T > const &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t k, vector_base< T > &vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
Computes first reduction stage for multiple inner products , i=0..k-1. 
Implementation of the ViennaCL scalar class. 
const handle_type & handle() const 
Returns the memory handle. 
void pipelined_gmres_prod(MatrixType const &A, vector_base< T > const &p, vector_base< T > &Ap, vector_base< T > &inner_prod_buffer)
Simple enable-if variant that uses the SFINAE pattern. 
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...