1 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_ 
    2 #define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_ 
   27 #ifdef VIENNACL_WITH_OPENMP 
   49         template<
typename ScalarType>
 
   60         template<
typename ScalarType>
 
   72         template<
typename ScalarType>
 
   97               s=-s/A[
row][row+from];
 
   98               _axpy(&A[row][row+from],&A[j][row+from],to-from,s);
 
  111               ss[i]=-ss[i]/A[row][row+from];
 
  113               _axpy(ss,&A[row+i][row],width,A[row][row+i]);
 
  116             for (
vcl_size_t col=row+from;col<row+to;col++)
 
  126         template<
typename ScalarType>
 
  137           for (;k<n-belowDiagonal;k++)
 
  144               for (
vcl_size_t bulgeStart=k+1;bulgeStart<n-belowDiagonal;bulgeStart+=belowDiagonal)
 
  158         template<
typename ScalarType>
 
  164           for (
vcl_size_t k=0;k<n-block_size;k+=block_size)
 
  170               norms[bi]=
_nrm2(&A[k+bi][k+bi+block_size],n-k-bi-block_size);
 
  177                 if (std::abs(A[k+bi][k+bi+block_size]-
ScalarType(1))>std::abs(A[k+bi][k+bi+block_size]+
ScalarType(1)))
 
  178                     norms[bi]=-norms[bi];
 
  180                     A[k+bi][i]/=norms[bi];
 
  186                     ScalarType s=
_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
 
  187                     s=-s/A[k+bi][k+bi+block_size];
 
  188                     _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
 
  196   #ifdef VIENNACL_WITH_OPENMP 
  197   #pragma omp parallel for 
  198             for (
int j=k+block_size;j<(int)n;j++)
 
  207                   ScalarType s=
_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
 
  208                   s=-s/A[k+bi][k+bi+block_size];
 
  209                   _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
 
  220   #ifdef VIENNACL_WITH_OPENMP 
  221   #pragma omp parallel for 
  222             for (
int section=0;section<(int)num_threads;section++)
 
  224             for (
vcl_size_t section=0;section<num_threads;section++)
 
  228               vcl_size_t end  =((n-k)*(section+1))/num_threads+k;
 
  239                     ss[i]=-ss[i]/A[k+bi][k+bi+block_size];
 
  241                     _axpy(ss+start,&A[i][start],length,A[k+bi][i]);
 
  265       template<
typename ScalarType>
 
  269           std::cerr << 
"ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl;
 
  275         if (bandwidth*bandwidth*num_threads<n*4 || 2*block_size+1>bandwidth)
 
  294       template<
typename ScalarType>
 
  306                   block_size=
std::min(std::min(m-j,n-j),block_size);
 
  317                 if (std::abs(A[i][j+bi])>std::abs(A[p][j+bi]))
 
  345                     _axpy(&(A[j+bi_][j+block_size]),&(A[
row][j+block_size]),n-j-block_size,-A[
row][j+bi_]);
 
  351                 for (
vcl_size_t col=j+bi;col<j+block_size;col++)
 
  352                   A[
row][col]-=multiplier*A[j+bi][col];
 
  353                     A[
row][j+bi]=multiplier;
 
  372                 _axpy(&(A[j+bi][j+block_size]),&(A[
row][j+block_size]),n-j-block_size,-A[
row][j+bi]);
 
  386   #ifdef VIENNACL_OPENMP 
  387   #pragma omp parallel for 
  388           for (
int row=j+block_size;
row<(int)m;
row++)
 
  394                 _axpy(&(A[j+bi][j+block_size]),&(A[
row][j+block_size]),n-j-block_size,-A[
row][j+bi]);
 
  406       template<
typename ScalarType>
 
  409         std::vector<ScalarType> betas(
std::min(m,n));
 
  414           block_size=
std::min(std::min(m-k,n-k),block_size);
 
  420             norms[bi]=
_nrm2(&A[k+bi][k+bi],m-k-bi);
 
  429                 A[k+bi][i]/=norms[bi];
 
  433               for (
vcl_size_t j=k+bi+1; j<k+block_size; j++)
 
  436                 s = -s/A[k+bi][k+bi];
 
  437                 _axpy(&A[k+bi][k+bi],&A[j][k+bi],m-k-bi,s);
 
  441             betas[k+bi]=-norms[bi];
 
  445   #ifdef VIENNACL_OPENMP 
  446   #pragma omp parallel for 
  447           for (
int j=k+block_size; j<(int)n; j++)
 
  457                 s = -s/A[k+bi][k+bi];
 
  458                 _axpy(&A[k+bi][k+bi],A[j]+k+bi,m-k-bi,s);
 
  486       template<
typename ScalarType>
 
  489         std::vector<ScalarType> betas(
std::min(m,n));
 
  500           block_size=
std::min(std::min(m-k,n-k),block_size);
 
  505               block_cols[bi][i]=A[k+i][k+bi];
 
  511             norms[bi]=
_nrm2(&block_cols[bi][bi],m-k-bi);
 
  520                 block_cols[bi][i]/=norms[bi];
 
  527                 s = -s/block_cols[bi][bi];
 
  528                 _axpy(&block_cols[bi][bi],&block_cols[j][bi],m-k-bi,s);
 
  532             betas[k+bi]=-norms[bi];
 
  538               A[k+i][k+bi]=block_cols[bi][i];
 
  541   #ifdef VIENNACL_OPENMP 
  542   #pragma omp parallel for 
  543           for (
int section=0;section<(int)num_threads;section++)
 
  545           for (
vcl_size_t section=0;section<num_threads;section++)
 
  548             vcl_size_t start=((n-k-block_size)*(section+0))/num_threads+k+block_size;
 
  549             vcl_size_t end  =((n-k-block_size)*(section+1))/num_threads+k+block_size;
 
  558                   _axpy(&A[i][start],ss+start,length,A[i][k+bi]);
 
  560                   ss[i]=-ss[i]/A[k+bi][k+bi];
 
  562                   _axpy(ss+start,&A[i][start],length,A[i][k+bi]);
 
  580           delete [] block_cols[i];
 
  581         delete [] block_cols;
 
optimized BLAS functions using SSE2 and SSE3 intrinsic functions 
std::vector< ScalarType > inplace_qr_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8, vcl_size_t num_threads=1)
Inplace qr factorization of an m x n dense row-major matrix, returning the householder normalization ...
void _axpy(const T *, T *, vcl_size_t, T)
std::vector< ScalarType > inplace_qr_col_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8)
Inplace qr factorization of an m x n dense column-major matrix, returning the householder normalizati...
void eliminateHermitian(ScalarType **A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType *ss)
void reduceHermitianToBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads)
result_of::size_type< T >::type start(T const &obj)
bool lu_factorize_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t *piv=NULL, vcl_size_t block_size=8)
Inplace lu factorization of an m x n dense row-major matrix with optional partial pivoting...
T _nrm2(const T *, vcl_size_t)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
T _dotc(vcl_size_t, const T *, const T *)
bool isHermitian(ScalarType **const A, vcl_size_t n)
vcl_size_t getHermitianBandwidth(ScalarType **const A, vcl_size_t n)
T min(const T &lhs, const T &rhs)
Minimum. 
void tridiagonalizeHermitianBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t bandwidth)
void inplace_tred2(ScalarType **A, vcl_size_t n, vcl_size_t block_size=1, vcl_size_t num_threads=1)
Inplace reduction of a dense n x n row-major or column-major hermitian (or real symmetric) matrix to ...