31 template<
class T, 
class F>
 
   35   for (std::size_t i = 0; i < M.
size1(); ++i)
 
   36     for (std::size_t j = 0; j < M.
size2(); ++j)
 
   45   for (std::size_t i = 0; i < cx.size(); ++i)
 
   46     cx[i] = T(rand())/T(RAND_MAX);
 
   51 void bench(
size_t BLAS1_N, 
size_t BLAS2_M, 
size_t BLAS2_N, 
size_t BLAS3_M, 
size_t BLAS3_N, 
size_t BLAS3_K, std::string 
const & prefix)
 
   59   double time_previous, time_spent;
 
   61   double time_per_benchmark = 1;
 
   63 #define BENCHMARK_OP(OPERATION, NAME, PERF, INDEX) \ 
   65   viennacl::backend::finish();\ 
   69   while (time_spent < time_per_benchmark) \ 
   71     time_previous = timer.get(); \ 
   73     viennacl::backend::finish(); \ 
   74     time_spent += timer.get() - time_previous; \ 
   77   time_spent/=(double)Nruns; \ 
   78   std::cout << prefix << NAME " : " << PERF << " " INDEX << std::endl; \ 
   92     BENCHMARK_OP(x = y,                
"COPY", std::setprecision(3) << 
double(2*BLAS1_N*
sizeof(T))/time_spent * 1e-9, 
"GB/s")
 
   93     BENCHMARK_OP(x = y + alpha*x,      
"AXPY", std::setprecision(3) << 
double(3*BLAS1_N*
sizeof(T))/time_spent * 1e-9, 
"GB/s")
 
   94     BENCHMARK_OP(s = 
inner_prod(x, y), 
"DOT",  std::setprecision(3) << 
double(2*BLAS1_N*
sizeof(T))/time_spent * 1e-9, 
"GB/s")
 
  107     BENCHMARK_OP(y = 
prod(A, x),        
"GEMV-N", std::setprecision(3) << 
double((BLAS2_M + BLAS2_N + BLAS2_M*BLAS2_N)*
sizeof(T))/time_spent * 1e-9, 
"GB/s")
 
  108     BENCHMARK_OP(x = 
prod(
trans(A), y), 
"GEMV-T", std::setprecision(3) << 
double((BLAS2_M + BLAS2_N + BLAS2_M*BLAS2_N)*
sizeof(T))/time_spent * 1e-9, 
"GB/s")
 
  121     BENCHMARK_OP(C = 
prod(A, B),                 
"GEMM-NN",      
double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, 
"GFLOPs/s");
 
  122     BENCHMARK_OP(C = 
prod(A, 
trans(BT)),         
"GEMM-NT",      
double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, 
"GFLOPs/s");
 
  123     BENCHMARK_OP(C = 
prod(
trans(AT), B),         
"GEMM-TN",      
double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, 
"GFLOPs/s");
 
  133 #ifdef VIENNACL_WITH_OPENCL 
  134   std::cout << std::endl;
 
  135   std::cout << 
"----------------------------------------------" << std::endl;
 
  136   std::cout << 
"               Device Info" << std::endl;
 
  137   std::cout << 
"----------------------------------------------" << std::endl;
 
  138   std::cout << std::endl;
 
  140   std::cout << std::endl;
 
  143   std::size_t BLAS1_N = 10000000;
 
  145   std::size_t BLAS2_M = 3840;
 
  146   std::size_t BLAS2_N = 3840;
 
  148   std::size_t BLAS3_M = 1976;
 
  149   std::size_t BLAS3_N = 1976;
 
  150   std::size_t BLAS3_K = 1976;
 
  152   std::cout << 
"Benchmark : BLAS" << std::endl;
 
  153   std::cout << 
"----------------" << std::endl;
 
  154   bench<float>(BLAS1_N, BLAS2_M, BLAS2_N, BLAS3_M, BLAS3_N, BLAS3_K, 
"s");
 
  155   std::cout << 
"----" << std::endl;
 
  156 #ifdef VIENNACL_WITH_OPENCL 
  159   bench<double>(BLAS1_N, BLAS2_M, BLAS2_N, BLAS3_M, BLAS3_N, BLAS3_K, 
"d");
 
void init_random(viennacl::matrix< T, F > &M)
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type like float or double.
viennacl::enable_if< viennacl::is_any_sparse_matrix< M1 >::value, matrix_expression< const M1, const M1, op_trans > >::type trans(const M1 &mat)
Returns an expression template class representing a transposed matrix. 
void bench(size_t BLAS1_N, size_t BLAS2_M, size_t BLAS2_N, size_t BLAS3_M, size_t BLAS3_N, size_t BLAS3_K, std::string const &prefix)
size_type internal_size() const
Returns the total amount of allocated memory in multiples of sizeof(NumericT) 
std::vector< std::vector< NumericT > > trans(std::vector< std::vector< NumericT > > const &A)
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...
Implementation of the dense matrix class. 
viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value, typename VectorT1::value_type >::type inner_prod(VectorT1 const &v1, VectorT2 const &v2)
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context. 
Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations. 
std::string info(vcl_size_t indent=0, char indent_char= ' ') const 
Returns an info string with a few properties of the device. Use full_info() to get all details.
#define BENCHMARK_OP(OPERATION, NAME, PERF, INDEX)
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like) 
bool double_support() const 
ViennaCL convenience function: Returns true if the device supports double precision. 
size_type size2() const
Returns the number of columns. 
Implementations of LU factorization for row-major and column-major dense matrices. 
size_type size1() const
Returns the number of rows. 
Proxy classes for vectors. 
A simple, yet (mostly) sufficiently accurate timer for benchmarking and profiling. 
Proxy classes for matrices. 
void prod(std::vector< std::map< IndexT, NumericT > > const &stl_A, std::vector< std::map< IndexT, NumericT > > const &stl_B, std::vector< std::map< IndexT, NumericT > > &stl_C)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only.
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only.
size_type internal_size() const
Returns the internal length of the vector, which is given by size() plus the extra memory due to padding.
void lu_factorize(matrix< NumericT, viennacl::row_major > &A)
LU factorization of a row-major dense matrix. 
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)