1 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_ 
    2 #define VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_ 
   28 #if defined VIENNACL_WITH_COMPLEX 
   33 #if defined VIENNACL_WITH_SSE3 
   34 #include <pmmintrin.h> 
   35 #elif defined VIENNACL_WITH_SSE2 
   36 #include <emmintrin.h> 
  124           return std::abs(x[0]);
 
  126         T scaledSquareSum(1);
 
  129             T absXi=std::abs(x[i]);
 
  130             if (std::abs(x[i])>std::abs(scale)){
 
  132               scaledSquareSum=T(1)+scaledSquareSum*temp*temp;
 
  137               scaledSquareSum+=temp*temp;
 
  141         return scale*sqrt(scaledSquareSum);
 
  144   #if defined VIENNACL_WITH_COMPLEX 
  148         template<> 
inline std::complex<double> 
conjIfComplex(std::complex<double> x){
return conj(x);}
 
  149         template<> 
inline std::complex<float > 
conjIfComplex(std::complex<float > x){
return conj(x);}
 
  // _nrm2 for std::complex<double> input. The visible statements accumulate the
  // absolute values of the real and imaginary parts of each x[i] as separate
  // contributions to a scaled sum of squares; the final result is
  // scale*sqrt(scaledSquareSum), wrapped in a complex value with zero imaginary part.
  // NOTE(review): this excerpt omits lines (loop header, declaration of `scale`,
  // the conditions guarding the early returns and the two rescaling branches,
  // closing braces). The comments below describe only what is visible.
  153       inline std::complex<double> 
_nrm2(
const std::complex<double>* x, 
vcl_size_t n)
 
  // early return of zero (guarding condition not shown; presumably the empty case)
  158           return std::complex<double>(0);
 
  // early return of |x[0]| (guarding condition not shown; presumably n==1)
  160           return std::abs(x[0]);
 
  162         double scaledSquareSum=1.0;
 
  // real part: zero components are skipped entirely
  164           if (x[i].real()!=0.0){
 
  165             double absXi=std::abs(x[i].real());
 
  // rescaling branch: sum' = 1 + sum*(scale/absXi)^2
  // (presumably taken when absXi exceeds scale -- condition not shown)
  167               double temp=scale/absXi;
 
  168               scaledSquareSum=1.0+scaledSquareSum*temp*temp;
 
  // accumulation branch: sum += (absXi/scale)^2
  172               double temp=absXi/scale;
 
  173               scaledSquareSum+=temp*temp;
 
  // imaginary part: same treatment as the real part above
  176           if (x[i].imag()!=0.0){
 
  177             double absXi=std::abs(x[i].imag());
 
  179               double temp=scale/absXi;
 
  180               scaledSquareSum=1.0+scaledSquareSum*temp*temp;
 
  184               double temp=absXi/scale;
 
  185               scaledSquareSum+=temp*temp;
 
  // undo the scaling and return the result as a complex number
  189         return std::complex<double>(scale*sqrt(scaledSquareSum));
 
  // _nrm2 for std::complex<float> input. The visible statements accumulate the
  // absolute values of the real and imaginary parts of each x[i] as separate
  // contributions to a scaled sum of squares; the final result is
  // scale*sqrt(scaledSquareSum), wrapped in a complex value with zero imaginary part.
  // NOTE(review): this excerpt omits lines (loop header, declaration of `scale`,
  // the conditions guarding the early returns and the two rescaling branches,
  // closing braces). The comments below describe only what is visible.
  193       inline std::complex<float> 
_nrm2(
const std::complex<float>* x, 
vcl_size_t n)
 
  // early return of zero (guarding condition not shown; presumably the empty case)
  198           return std::complex<float>(0);
 
  // early return of |x[0]| (guarding condition not shown; presumably n==1)
  200           return std::abs(x[0]);
 
  202         float scaledSquareSum=1.0;
 
  // real part: zero components are skipped entirely
  // (note the float value is compared against a double literal)
  204           if (x[i].real()!=0.0){
 
  205             float absXi=std::abs(x[i].real());
 
  // rescaling branch: sum' = 1 + sum*(scale/absXi)^2
  // (presumably taken when absXi exceeds scale -- condition not shown)
  207               float temp=scale/absXi;
 
  208               scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
 
  // accumulation branch: sum += (absXi/scale)^2
  212               float temp=absXi/scale;
 
  213               scaledSquareSum+=temp*temp;
 
  // imaginary part: same treatment as the real part above
  216           if (x[i].imag()!=0.0){
 
  217             float absXi=std::abs(x[i].imag());
 
  219               float temp=scale/absXi;
 
  220               scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
 
  224               float temp=absXi/scale;
 
  225               scaledSquareSum+=temp*temp;
 
  // undo the scaling and return the result as a complex number
  229         return std::complex<float>(scale*sqrt(scaledSquareSum));
 
  232   #endif //defined VIENNACL_COMPLEX 
  234   #if defined VIENNACL_WITH_SSE2 
  // Single-precision SSE kernel for y = a*x + y.
  // The visible statements handle eight floats at a time: two 4-lane registers
  // from x, two from y, a multiply by the broadcast scalar and an add.
  // NOTE(review): this excerpt omits lines (braces, loop control and pointer
  // advance, declarations of prod/sum0/sum1, any scalar handling of leftover
  // elements). The comments below describe only what is visible.
  238       inline void _axpy<float>(
const float* x, 
float* y, 
vcl_size_t n, 
float a)
 
  260           __m128 reg0,reg1,reg2,reg3;
 
  // a replicated into all four lanes
  261           __m128 areg=_mm_set1_ps(a);
 
  // aligned loads (_mm_load_ps requires 16-byte-aligned addresses):
  // x[0..3], x[4..7], y[0..3], y[4..7]
  268             reg0=_mm_load_ps(x+0);
 
  269             reg1=_mm_load_ps(x+4);
 
  270             reg2=_mm_load_ps(y+0);
 
  271             reg3=_mm_load_ps(y+4);
 
  // sum0 = a*x[0..3] + y[0..3]
  274             prod=_mm_mul_ps(reg0,areg);
 
  275             sum0=_mm_add_ps(prod,reg2);
 
  // sum1 = a*x[4..7] + y[4..7]
  276             prod=_mm_mul_ps(reg1,areg);
 
  277             sum1=_mm_add_ps(prod,reg3);
 
  // aligned stores of the eight results back into y
  280             _mm_store_ps(y+0,sum0);
 
  281             _mm_store_ps(y+4,sum1);
 
  // Double-precision SSE2 kernel for y = a*x + y.
  // The visible statements handle four doubles at a time: two 2-lane registers
  // from x, two from y, a multiply by the broadcast scalar and an add.
  // NOTE(review): this excerpt omits lines (braces, loop control and pointer
  // advance, declarations of prod/sum0/sum1, any scalar handling of leftover
  // elements). The comments below describe only what is visible.
  296       inline void _axpy<double>(
const double* x, 
double* y, 
vcl_size_t n, 
double a)
 
  318           __m128d reg0,reg1,reg2,reg3;
 
  // a replicated into both lanes
  319           __m128d areg=_mm_set1_pd(a);
 
  // aligned loads (_mm_load_pd requires 16-byte-aligned addresses):
  // x[0..1], x[2..3], y[0..1], y[2..3]
  326             reg0=_mm_load_pd(x+0);
 
  327             reg1=_mm_load_pd(x+2);
 
  328             reg2=_mm_load_pd(y+0);
 
  329             reg3=_mm_load_pd(y+2);
 
  // sum0 = a*x[0..1] + y[0..1]
  332             prod=_mm_mul_pd(reg0,areg);
 
  333             sum0=_mm_add_pd(prod,reg2);
 
  // sum1 = a*x[2..3] + y[2..3]
  334             prod=_mm_mul_pd(reg1,areg);
 
  335             sum1=_mm_add_pd(prod,reg3);
 
  // aligned stores of the four results back into y
  338             _mm_store_pd(y+0,sum0);
 
  339             _mm_store_pd(y+2,sum1);
 
  // Single-precision SSE dot product of x and y.
  // The visible statements multiply eight floats of x with eight floats of y
  // lane-wise and add the products into one 4-lane accumulator, which is
  // reduced to a scalar at the end.
  // NOTE(review): this excerpt omits lines (braces, loop control, declarations
  // of `sum` and `sums`). The comments below describe only what is visible.
  354       inline float _dot<float>(
vcl_size_t n, 
const float* x, 
const float* y)
 
  // four running partial sums, one per lane
  380           __m128 sumReg=_mm_setzero_ps();
 
  381           __m128 reg0,reg1,reg2,reg3;
 
  // aligned loads (_mm_load_ps requires 16-byte-aligned addresses)
  387             reg0=_mm_load_ps(x+0);
 
  388             reg1=_mm_load_ps(x+4);
 
  389             reg2=_mm_load_ps(y+0);
 
  390             reg3=_mm_load_ps(y+4);
 
  // lane-wise products x[k]*y[k]
  393             reg0=_mm_mul_ps(reg0,reg2);
 
  394             reg1=_mm_mul_ps(reg1,reg3);
 
  // fold both product registers into the accumulator
  397             sumReg=_mm_add_ps(sumReg,reg0);
 
  398             sumReg=_mm_add_ps(sumReg,reg1);
 
  // Round the address of `sums` down to a multiple of 16 and step 16 bytes up,
  // which yields a 16-byte-aligned location for the aligned store below.
  // NOTE(review): `sums` is declared outside this excerpt; presumably it is
  // large enough to absorb the shift -- confirm.
  411           float* pSums=(
float*)((((
vcl_size_t)sums)&(~15))+16);
 
  412           _mm_store_ps(pSums,sumReg);
 
  // horizontal sum of the four lanes plus `sum` (declared outside this excerpt;
  // presumably the scalar part of the computation)
  414           return sum+pSums[0]+pSums[1]+pSums[2]+pSums[3];
 
  // Double-precision SSE2 dot product of x and y.
  // The visible statements multiply four doubles of x with four doubles of y
  // lane-wise and add the products into two independent 2-lane accumulators,
  // which are merged and reduced to a scalar at the end.
  // NOTE(review): this excerpt omits lines (braces, loop control, declarations
  // of `sum` and `sums`). The comments below describe only what is visible.
  420       inline double _dot(
vcl_size_t n, 
const double* x, 
const double* y)
 
  // two separate accumulators, two lanes each
  444           __m128d sum0=_mm_setzero_pd();
 
  445           __m128d sum1=_mm_setzero_pd();
 
  446           __m128d reg0,reg1,reg2,reg3;
 
  // aligned loads (_mm_load_pd requires 16-byte-aligned addresses)
  452             reg0=_mm_load_pd(x+0);
 
  453             reg1=_mm_load_pd(x+2);
 
  454             reg2=_mm_load_pd(y+0);
 
  455             reg3=_mm_load_pd(y+2);
 
  // lane-wise products x[k]*y[k]
  458             reg0=_mm_mul_pd(reg0,reg2);
 
  459             reg1=_mm_mul_pd(reg1,reg3);
 
  // each product register feeds its own accumulator
  462             sum0=_mm_add_pd(sum0,reg0);
 
  463             sum1=_mm_add_pd(sum1,reg1);
 
  // Round the address of `sums` down to a multiple of 16 and step 16 bytes up,
  // which yields a 16-byte-aligned location for the aligned store below.
  // NOTE(review): `sums` is declared outside this excerpt; presumably it is
  // large enough to absorb the shift -- confirm.
  476           double* pSums=(
double*)((((
vcl_size_t)sums)&(~15))+16);
 
  // merge the two accumulators before spilling them to memory
  477           sum0=_mm_add_pd(sum0,sum1);
 
  478           _mm_store_pd(pSums,sum0);
 
  // horizontal sum of the two lanes plus `sum` (declared outside this excerpt;
  // presumably the scalar part of the computation)
  480           return sum+pSums[0]+pSums[1];
 
  485       template<> 
inline float  _dotc<float >(
vcl_size_t n, 
const float  *x, 
const float  *y){
return _dot(n,x,y);}
 
  486       template<> 
inline double _dotc<double>(
vcl_size_t n, 
const double *x, 
const double *y){
return _dot(n,x,y);}
 
  488   #if defined VIENNACL_WITH_COMPLEX 
  // SSE kernel for y = a*x + y with std::complex<float> operands.
  // Each 128-bit register holds two complex numbers laid out as (re, im, re, im);
  // the visible statements process four complex elements (x+0..x+3) per group.
  // Two implementations of the complex multiply are visible: one built from
  // shuffles and a sign mask (guarded by #ifndef VIENNACL_WITH_SSE3) and one
  // using the SSE3 moveldup/movehdup/addsub intrinsics.
  // NOTE(review): this excerpt omits lines (braces, loop control, #else/#endif,
  // scalar handling). The comments below describe only what is visible.
  492       inline void _axpy<std::complex<float> >(
const std::complex<float>* x, std::complex<float>* y, 
vcl_size_t n, std::complex<float> a)
 
  512           __m128 reg0,reg1,reg2,reg3,reg4;
 
  // _mm_set_ps lists lanes from highest to lowest, so in memory order:
  // areg0 = (a.re, a.im, a.re, a.im), areg1 = (a.im, a.re, a.im, a.re)
  513           __m128 areg0=_mm_set_ps(a.imag(),a.real(),a.imag(),a.real());
 
  514           __m128 areg1=_mm_set_ps(a.real(),a.imag(),a.real(),a.imag());
 
  515   #ifndef VIENNACL_WITH_SSE3 
  // sign mask, memory order (-1, +1, -1, +1); only needed without addsub
  516           __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
 
  // aligned loads of x[0..1], x[2..3], y[0..1], y[2..3] viewed as raw floats
  523             reg0=_mm_load_ps((
float*)(x+0));
 
  524             reg1=_mm_load_ps((
float*)(x+2));
 
  525             reg2=_mm_load_ps((
float*)(y+0));
 
  526             reg3=_mm_load_ps((
float*)(y+2));
 
  529   #ifndef VIENNACL_WITH_SSE3 
  // 0xA0 duplicates the even lanes (real parts), 0xF5 the odd lanes (imaginary parts)
  530             reg4=_mm_shuffle_ps(reg0,reg0,0xA0);
 
  531             reg0=_mm_shuffle_ps(reg0,reg0,0xF5);
 
  // reg4 = (x.re*a.re, x.re*a.im), reg0 = (x.im*a.im, x.im*a.re)
  532             reg4=_mm_mul_ps(reg4,areg0);
 
  533             reg0=_mm_mul_ps(reg0,areg1);
 
  // negate the x.im*a.im term, then sum: (x.re*a.re - x.im*a.im, x.re*a.im + x.im*a.re) = a*x
  534             reg0=_mm_mul_ps(reg0,nreg);
 
  535             reg0=_mm_add_ps(reg4,reg0);
 
  // add y
  536             reg0=_mm_add_ps(reg0,reg2);
 
  // same sequence for the second pair of complex elements
  537             reg4=_mm_shuffle_ps(reg1,reg1,0xA0);
 
  538             reg1=_mm_shuffle_ps(reg1,reg1,0xF5);
 
  539             reg4=_mm_mul_ps(reg4,areg0);
 
  540             reg1=_mm_mul_ps(reg1,areg1);
 
  541             reg1=_mm_mul_ps(reg1,nreg);
 
  542             reg1=_mm_add_ps(reg4,reg1);
 
  543             reg1=_mm_add_ps(reg1,reg3);
 
  // SSE3 variant of the same computation (presumably the #else branch; the
  // directive itself is not part of this excerpt): moveldup/movehdup duplicate
  // the real/imaginary parts, addsub subtracts in even lanes and adds in odd lanes.
  545             reg4=_mm_moveldup_ps(reg0);
 
  546             reg0=_mm_movehdup_ps(reg0);
 
  547             reg4=_mm_mul_ps(reg4,areg0);
 
  548             reg0=_mm_mul_ps(reg0,areg1);
 
  549             reg0=_mm_addsub_ps(reg4,reg0);
 
  550             reg0=_mm_add_ps(reg0,reg2);
 
  551             reg4=_mm_moveldup_ps(reg1);
 
  552             reg1=_mm_movehdup_ps(reg1);
 
  553             reg4=_mm_mul_ps(reg4,areg0);
 
  554             reg1=_mm_mul_ps(reg1,areg1);
 
  555             reg1=_mm_addsub_ps(reg4,reg1);
 
  556             reg1=_mm_add_ps(reg1,reg3);
 
  // aligned stores of the four complex results back into y
  559             _mm_store_ps((
float*)(y+0),reg0);
 
  560             _mm_store_ps((
float*)(y+2),reg1);
 
  // SSE2 kernel for y = a*x + y with std::complex<double> operands.
  // Each 128-bit register holds exactly one complex number as (re, im); the
  // visible statements process two complex elements (x+0 and x+1) per group.
  // Two implementations of the complex multiply are visible: one using a sign
  // mask (guarded by #ifndef VIENNACL_WITH_SSE3) and one using _mm_addsub_pd.
  // NOTE(review): this excerpt omits lines (braces, loop control, #else/#endif,
  // scalar handling). The comments below describe only what is visible.
  575       inline void _axpy<std::complex<double> >(
const std::complex<double>* x, std::complex<double>* y, 
vcl_size_t n, std::complex<double> a)
 
  584           __m128d reg0,reg1,reg2,reg3,reg4;
 
  // _mm_set_pd lists lanes from highest to lowest, so in memory order:
  // areg0 = (a.re, a.im), areg1 = (a.im, a.re)
  585           __m128d areg0=_mm_set_pd(a.imag(),a.real());
 
  586           __m128d areg1=_mm_set_pd(a.real(),a.imag());
 
  587   #ifndef VIENNACL_WITH_SSE3 
  // sign mask, memory order (-1, +1); only needed without addsub
  588           __m128d nreg=_mm_set_pd(1.0,-1.0);
 
  // aligned loads of x[0], x[1], y[0], y[1] viewed as raw doubles
  595             reg0=_mm_load_pd((
double*)(x+0));
 
  596             reg1=_mm_load_pd((
double*)(x+1));
 
  597             reg2=_mm_load_pd((
double*)(y+0));
 
  598             reg3=_mm_load_pd((
double*)(y+1));
 
  601   #ifndef VIENNACL_WITH_SSE3 
  // mask 0x0 duplicates the low lane (real part), 0x3 the high lane (imaginary part)
  602             reg4=_mm_shuffle_pd(reg0,reg0,0x0);
 
  603             reg0=_mm_shuffle_pd(reg0,reg0,0x3);
 
  // reg4 = (x.re*a.re, x.re*a.im), reg0 = (x.im*a.im, x.im*a.re)
  604             reg4=_mm_mul_pd(reg4,areg0);
 
  605             reg0=_mm_mul_pd(reg0,areg1);
 
  // negate the x.im*a.im term, then sum: (x.re*a.re - x.im*a.im, x.re*a.im + x.im*a.re) = a*x
  606             reg0=_mm_mul_pd(reg0,nreg);
 
  607             reg0=_mm_add_pd(reg4,reg0);
 
  // add y
  608             reg0=_mm_add_pd(reg0,reg2);
 
  // same sequence for the second complex element
  609             reg4=_mm_shuffle_pd(reg1,reg1,0x0);
 
  610             reg1=_mm_shuffle_pd(reg1,reg1,0x3);
 
  611             reg4=_mm_mul_pd(reg4,areg0);
 
  612             reg1=_mm_mul_pd(reg1,areg1);
 
  613             reg1=_mm_mul_pd(reg1,nreg);
 
  614             reg1=_mm_add_pd(reg4,reg1);
 
  615             reg1=_mm_add_pd(reg1,reg3);
 
  // SSE3 variant of the same computation (presumably the #else branch; the
  // directive itself is not part of this excerpt): addsub subtracts in the
  // low lane and adds in the high lane, replacing the sign-mask multiply.
  617             reg4=_mm_shuffle_pd(reg0,reg0,0x0);
 
  618             reg0=_mm_shuffle_pd(reg0,reg0,0x3);
 
  619             reg4=_mm_mul_pd(reg4,areg0);
 
  620             reg0=_mm_mul_pd(reg0,areg1);
 
  621             reg0=_mm_addsub_pd(reg4,reg0);
 
  622             reg0=_mm_add_pd(reg0,reg2);
 
  623             reg4=_mm_shuffle_pd(reg1,reg1,0x0);
 
  624             reg1=_mm_shuffle_pd(reg1,reg1,0x3);
 
  625             reg4=_mm_mul_pd(reg4,areg0);
 
  626             reg1=_mm_mul_pd(reg1,areg1);
 
  627             reg1=_mm_addsub_pd(reg4,reg1);
 
  628             reg1=_mm_add_pd(reg1,reg3);
 
  // aligned stores of the two complex results back into y
  631             _mm_store_pd((
double*)(y+0),reg0);
 
  632             _mm_store_pd((
double*)(y+1),reg1);
 
  // SSE (unconjugated) dot product of two std::complex<float> vectors.
  // Each 128-bit register holds two complex numbers as (re, im, re, im); the
  // visible statements form the complex products x[k]*y[k] for four elements
  // and add them into one accumulator holding two partial complex sums.
  // Two implementations of the complex multiply are visible: one using a sign
  // mask (guarded by #ifndef VIENNACL_WITH_SSE3) and one using SSE3 intrinsics.
  // NOTE(review): this excerpt omits lines (braces, loop control, #else/#endif,
  // the branches that own the two `sum` declarations). The comments below
  // describe only what is visible.
  647       inline std::complex<float> _dot<std::complex<float> >(
vcl_size_t n, 
const std::complex<float>* x, 
const std::complex<float>* y)
 
  // `sum` is declared twice in this excerpt; presumably in separate scopes
  // whose braces are not shown
  652           std::complex<float> 
sum(0);
 
  660           std::complex<float> 
sum(0);
 
  671           __m128 sumReg=_mm_setzero_ps();
 
  672           __m128 reg0,reg1,reg2,reg3,reg4;
 
  673   #ifndef VIENNACL_WITH_SSE3 
  // sign mask, memory order (-1, +1, -1, +1); only needed without addsub
  674           __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
 
  // aligned loads of x[0..1], x[2..3], y[0..1], y[2..3] viewed as raw floats
  681             reg0=_mm_load_ps((
float*)(x+0));
 
  682             reg1=_mm_load_ps((
float*)(x+2));
 
  683             reg2=_mm_load_ps((
float*)(y+0));
 
  684             reg3=_mm_load_ps((
float*)(y+2));
 
  687   #ifndef VIENNACL_WITH_SSE3 
  // 0xA0 duplicates the real parts of y, 0xF5 the imaginary parts
  688             reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
 
  689             reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
 
  // reg4 = (y.re*x.re, y.re*x.im), reg2 = (y.im*x.re, y.im*x.im)
  690             reg4=_mm_mul_ps(reg4,reg0);
 
  691             reg2=_mm_mul_ps(reg2,reg0);
 
  // 0xB1 swaps the lanes within each pair: reg2 = (y.im*x.im, y.im*x.re)
  692             reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
 
  // negate the x.im*y.im term, then sum:
  // (x.re*y.re - x.im*y.im, x.im*y.re + x.re*y.im) = x*y
  693             reg2=_mm_mul_ps(reg2,nreg);
 
  694             reg0=_mm_add_ps(reg4,reg2);
 
  // same sequence for the second pair of complex elements
  695             reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
 
  696             reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
 
  697             reg4=_mm_mul_ps(reg4,reg1);
 
  698             reg3=_mm_mul_ps(reg3,reg1);
 
  699             reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
 
  700             reg3=_mm_mul_ps(reg3,nreg);
 
  701             reg1=_mm_add_ps(reg4,reg3);
 
  // SSE3 variant of the same computation (presumably the #else branch; the
  // directive itself is not part of this excerpt): moveldup/movehdup duplicate
  // the real/imaginary parts, addsub subtracts in even lanes and adds in odd lanes.
  703             reg4=_mm_moveldup_ps(reg2);
 
  704             reg2=_mm_movehdup_ps(reg2);
 
  705             reg4=_mm_mul_ps(reg4,reg0);
 
  706             reg2=_mm_mul_ps(reg2,reg0);
 
  707             reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
 
  708             reg0=_mm_addsub_ps(reg4,reg2);
 
  709             reg4=_mm_moveldup_ps(reg3);
 
  710             reg3=_mm_movehdup_ps(reg3);
 
  711             reg4=_mm_mul_ps(reg4,reg1);
 
  712             reg3=_mm_mul_ps(reg3,reg1);
 
  713             reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
 
  714             reg1=_mm_addsub_ps(reg4,reg3);
 
  // fold both product registers into the accumulator
  718             sumReg=_mm_add_ps(sumReg,reg0);
 
  719             sumReg=_mm_add_ps(sumReg,reg1);
 
  // Scratch array of four complex<float>; only two are written by the store.
  // The pointer is the address of `sums` rounded down to a multiple of 16 and
  // then moved 16 bytes up, giving a 16-byte-aligned target for _mm_store_ps.
  731           std::complex<float> sums[4];
 
  732           std::complex<float>* pSums=(std::complex<float>*)((((
vcl_size_t)sums)&(~15))+16);
 
  // both slots are zeroed and then overwritten by the store that follows
  733           pSums[0]=std::complex<float>(0);
 
  734           pSums[1]=std::complex<float>(0);
 
  735           _mm_store_ps((
float*)pSums,sumReg);
 
  // combine the two partial complex sums with `sum`
  737           return sum+pSums[0]+pSums[1];
 
  // SSE2 (unconjugated) dot product of two std::complex<double> vectors.
  // Each 128-bit register holds one complex number as (re, im); the visible
  // statements form the complex products x[k]*y[k] for two elements and add
  // them into a single accumulator holding one partial complex sum.
  // Two implementations of the complex multiply are visible: one using a sign
  // mask (guarded by #ifndef VIENNACL_WITH_SSE3) and one using _mm_addsub_pd.
  // NOTE(review): this excerpt omits lines (braces, loop control, #else/#endif,
  // the final return, the branches that own the two `sum` declarations). The
  // comments below describe only what is visible.
  743       inline std::complex<double> _dot<std::complex<double> >(
vcl_size_t n, 
const std::complex<double>* x, 
const std::complex<double>* y)
 
  748           std::complex<double> 
sum(0);
 
  755           __m128d sumReg=_mm_setzero_pd();
 
  756           __m128d reg0,reg1,reg2,reg3,reg4;
 
  757   #ifndef VIENNACL_WITH_SSE3 
  // sign mask, memory order (-1, +1); only needed without addsub
  758           __m128d nreg=_mm_set_pd(1.0,-1.0);
 
  // aligned loads of x[0], x[1], y[0], y[1] viewed as raw doubles
  765             reg0=_mm_load_pd((
double*)(x+0));
 
  766             reg1=_mm_load_pd((
double*)(x+1));
 
  767             reg2=_mm_load_pd((
double*)(y+0));
 
  768             reg3=_mm_load_pd((
double*)(y+1));
 
  771   #ifndef VIENNACL_WITH_SSE3 
  // mask 0x0 duplicates the real part of y, 0x3 the imaginary part
  772             reg4=_mm_shuffle_pd(reg2,reg2,0x0);
 
  773             reg2=_mm_shuffle_pd(reg2,reg2,0x3);
 
  // reg4 = (y.re*x.re, y.re*x.im), reg2 = (y.im*x.re, y.im*x.im)
  774             reg4=_mm_mul_pd(reg4,reg0);
 
  775             reg2=_mm_mul_pd(reg2,reg0);
 
  // mask 0x1 swaps the two lanes: reg2 = (y.im*x.im, y.im*x.re)
  776             reg2=_mm_shuffle_pd(reg2,reg2,0x1);
 
  // negate the x.im*y.im term, then sum:
  // (x.re*y.re - x.im*y.im, x.im*y.re + x.re*y.im) = x*y
  777             reg2=_mm_mul_pd(reg2,nreg);
 
  778             reg0=_mm_add_pd(reg4,reg2);
 
  // same sequence for the second complex element
  779             reg4=_mm_shuffle_pd(reg3,reg3,0x0);
 
  780             reg3=_mm_shuffle_pd(reg3,reg3,0x3);
 
  781             reg4=_mm_mul_pd(reg4,reg1);
 
  782             reg3=_mm_mul_pd(reg3,reg1);
 
  783             reg3=_mm_shuffle_pd(reg3,reg3,0x1);
 
  784             reg3=_mm_mul_pd(reg3,nreg);
 
  785             reg1=_mm_add_pd(reg4,reg3);
 
  // SSE3 variant of the same computation (presumably the #else branch; the
  // directive itself is not part of this excerpt): addsub subtracts in the
  // low lane and adds in the high lane, replacing the sign-mask multiply.
  787             reg4=_mm_shuffle_pd(reg2,reg2,0x0);
 
  788             reg2=_mm_shuffle_pd(reg2,reg2,0x3);
 
  789             reg4=_mm_mul_pd(reg4,reg0);
 
  790             reg2=_mm_mul_pd(reg2,reg0);
 
  791             reg2=_mm_shuffle_pd(reg2,reg2,0x1);
 
  792             reg0=_mm_addsub_pd(reg4,reg2);
 
  793             reg4=_mm_shuffle_pd(reg3,reg3,0x0);
 
  794             reg3=_mm_shuffle_pd(reg3,reg3,0x3);
 
  795             reg4=_mm_mul_pd(reg4,reg1);
 
  796             reg3=_mm_mul_pd(reg3,reg1);
 
  797             reg3=_mm_shuffle_pd(reg3,reg3,0x1);
 
  798             reg1=_mm_addsub_pd(reg4,reg3);
 
  // fold both products into the accumulator
  802             sumReg=_mm_add_pd(sumReg,reg0);
 
  803             sumReg=_mm_add_pd(sumReg,reg1);
 
  // second `sum` declaration; presumably in a separate scope whose braces
  // are not shown
  811           std::complex<double> 
sum(0);
 
  // Scratch array of two complex<double>; only one is written by the store.
  // The pointer is the address of `sums` rounded down to a multiple of 16 and
  // then moved 16 bytes up, giving a 16-byte-aligned target for _mm_store_pd.
  816           std::complex<double> sums[2];
 
  817           std::complex<double>* pSums=(std::complex<double>*)((((
vcl_size_t)sums)&(~15))+16);
 
  // the slot is zeroed and then overwritten by the store that follows
  818           pSums[0]=std::complex<double>(0);
 
  819           _mm_store_pd((
double*)pSums,sumReg);
 
  // SSE conjugated dot product of two std::complex<float> vectors: the scalar
  // statements accumulate conj(x[i])*y[i], and the vector statements compute
  // the same products for four elements at a time.
  // The vector code builds each product with its lanes in (imag, real) order;
  // a final shuffle restores (real, imag) before the partial sums are read back.
  // Two implementations are visible: one using a sign mask (guarded by
  // #ifndef VIENNACL_WITH_SSE3) and one using SSE3 intrinsics.
  // NOTE(review): this excerpt omits lines (braces, loop control, #else/#endif,
  // the branches that own the two `sum` declarations). The comments below
  // describe only what is visible.
  827       inline std::complex<float> _dotc<std::complex<float> >(
vcl_size_t n, 
const std::complex<float>* x, 
const std::complex<float>* y)
 
  832           std::complex<float> 
sum(0);
 
  // scalar accumulation (enclosing loop not shown)
  834             sum+=conj(x[i])*y[i];
 
  // second `sum` declaration; presumably in a separate scope whose braces
  // are not shown
  840           std::complex<float> 
sum(0);
 
  // single leading element handled in scalar code (guarding condition not
  // shown; presumably an alignment fix-up)
  845             sum+=conj(x[0])*y[0];
 
  851           __m128 sumReg=_mm_setzero_ps();
 
  852           __m128 reg0,reg1,reg2,reg3,reg4;
 
  853   #ifndef VIENNACL_WITH_SSE3 
  // sign mask, memory order (-1, +1, -1, +1); only needed without addsub
  854           __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
 
  // aligned loads of x[0..1], x[2..3], y[0..1], y[2..3] viewed as raw floats
  861             reg0=_mm_load_ps((
float*)(x+0));
 
  862             reg1=_mm_load_ps((
float*)(x+2));
 
  863             reg2=_mm_load_ps((
float*)(y+0));
 
  864             reg3=_mm_load_ps((
float*)(y+2));
 
  867   #ifndef VIENNACL_WITH_SSE3 
  // 0xA0 duplicates the real parts of y, 0xF5 the imaginary parts
  868             reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
 
  869             reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
 
  // reg4 = (y.re*x.re, y.re*x.im), reg2 = (y.im*x.re, y.im*x.im)
  870             reg4=_mm_mul_ps(reg4,reg0);
 
  871             reg2=_mm_mul_ps(reg2,reg0);
 
  // 0xB1 swaps the lanes within each pair: reg4 = (y.re*x.im, y.re*x.re)
  872             reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
 
  // negate the y.re*x.im term, then sum:
  // (x.re*y.im - x.im*y.re, x.re*y.re + x.im*y.im) = (Im, Re) of conj(x)*y
  873             reg4=_mm_mul_ps(reg4,nreg);
 
  874             reg0=_mm_add_ps(reg4,reg2);
 
  // same sequence for the second pair of complex elements
  875             reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
 
  876             reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
 
  877             reg4=_mm_mul_ps(reg4,reg1);
 
  878             reg3=_mm_mul_ps(reg3,reg1);
 
  879             reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
 
  880             reg4=_mm_mul_ps(reg4,nreg);
 
  881             reg1=_mm_add_ps(reg4,reg3);
 
  // SSE3 variant of the same computation (presumably the #else branch; the
  // directive itself is not part of this excerpt): addsub(reg2, reg4)
  // subtracts reg4 in even lanes and adds it in odd lanes, again yielding
  // (Im, Re) of conj(x)*y.
  883             reg4=_mm_moveldup_ps(reg2);
 
  884             reg2=_mm_movehdup_ps(reg2);
 
  885             reg4=_mm_mul_ps(reg4,reg0);
 
  886             reg2=_mm_mul_ps(reg2,reg0);
 
  887             reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
 
  888             reg0=_mm_addsub_ps(reg2,reg4);
 
  889             reg4=_mm_moveldup_ps(reg3);
 
  890             reg3=_mm_movehdup_ps(reg3);
 
  891             reg4=_mm_mul_ps(reg4,reg1);
 
  892             reg3=_mm_mul_ps(reg3,reg1);
 
  893             reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
 
  894             reg1=_mm_addsub_ps(reg3,reg4);
 
  // fold both product registers into the accumulator
  898             sumReg=_mm_add_ps(sumReg,reg0);
 
  899             sumReg=_mm_add_ps(sumReg,reg1);
 
  // scalar accumulation (enclosing loop not shown; presumably the leftover
  // elements after the vector loop)
  908             sum+=conj(x[i])*y[i];
 
  // Scratch array of four complex<float>; only two are written by the store.
  // The pointer is the address of `sums` rounded down to a multiple of 16 and
  // then moved 16 bytes up, giving a 16-byte-aligned target for _mm_store_ps.
  911           std::complex<float> sums[4];
 
  912           std::complex<float>* pSums=(std::complex<float>*)((((
vcl_size_t)sums)&(~15))+16);
 
  // swap lanes within each pair so the stored layout is (real, imag)
  913           sumReg=_mm_shuffle_ps(sumReg,sumReg,0xB1);
 
  914           _mm_store_ps((
float*)pSums,sumReg);
 
  // combine the two partial complex sums with `sum`
  916           return sum+pSums[0]+pSums[1];
 
  // SSE2 conjugated dot product of two std::complex<double> vectors: the scalar
  // statement accumulates conj(x[i])*y[i], and the vector statements compute
  // the same products for two elements at a time.
  // The vector code builds each product with its lanes in (imag, real) order;
  // a final shuffle restores (real, imag) before the partial sum is read back.
  // Two implementations are visible: one using a sign mask (guarded by
  // #ifndef VIENNACL_WITH_SSE3) and one using _mm_addsub_pd.
  // NOTE(review): this excerpt omits lines (braces, loop control, #else/#endif,
  // the branches that own the two `sum` declarations). The comments below
  // describe only what is visible.
  922       inline std::complex<double> _dotc<std::complex<double> >(
vcl_size_t n, 
const std::complex<double>* x, 
const std::complex<double>* y)
 
  927           std::complex<double> 
sum(0);
 
  // scalar accumulation (enclosing loop not shown)
  929             sum+=conj(x[i])*y[i];
 
  934           __m128d sumReg=_mm_setzero_pd();
 
  935           __m128d reg0,reg1,reg2,reg3,reg4;
 
  936   #ifndef VIENNACL_WITH_SSE3 
  // sign mask, memory order (-1, +1); only needed without addsub
  937           __m128d nreg=_mm_set_pd(1.0,-1.0);
 
  // aligned loads of x[0], x[1], y[0], y[1] viewed as raw doubles
  944             reg0=_mm_load_pd((
double*)(x+0));
 
  945             reg1=_mm_load_pd((
double*)(x+1));
 
  946             reg2=_mm_load_pd((
double*)(y+0));
 
  947             reg3=_mm_load_pd((
double*)(y+1));
 
  950   #ifndef VIENNACL_WITH_SSE3 
  // mask 0x0 duplicates the real part of y, 0x3 the imaginary part
  951             reg4=_mm_shuffle_pd(reg2,reg2,0x0);
 
  952             reg2=_mm_shuffle_pd(reg2,reg2,0x3);
 
  // reg4 = (y.re*x.re, y.re*x.im), reg2 = (y.im*x.re, y.im*x.im)
  953             reg4=_mm_mul_pd(reg4,reg0);
 
  954             reg2=_mm_mul_pd(reg2,reg0);
 
  // mask 0x1 swaps the two lanes: reg4 = (y.re*x.im, y.re*x.re)
  955             reg4=_mm_shuffle_pd(reg4,reg4,0x1);
 
  // negate the y.re*x.im term, then sum:
  // (x.re*y.im - x.im*y.re, x.re*y.re + x.im*y.im) = (Im, Re) of conj(x)*y
  956             reg4=_mm_mul_pd(reg4,nreg);
 
  957             reg0=_mm_add_pd(reg4,reg2);
 
  // same sequence for the second complex element
  958             reg4=_mm_shuffle_pd(reg3,reg3,0x0);
 
  959             reg3=_mm_shuffle_pd(reg3,reg3,0x3);
 
  960             reg4=_mm_mul_pd(reg4,reg1);
 
  961             reg3=_mm_mul_pd(reg3,reg1);
 
  962             reg4=_mm_shuffle_pd(reg4,reg4,0x1);
 
  963             reg4=_mm_mul_pd(reg4,nreg);
 
  964             reg1=_mm_add_pd(reg4,reg3);
 
  // SSE3 variant of the same computation (presumably the #else branch; the
  // directive itself is not part of this excerpt): addsub(reg2, reg4)
  // subtracts reg4 in the low lane and adds it in the high lane, again
  // yielding (Im, Re) of conj(x)*y.
  966             reg4=_mm_shuffle_pd(reg2,reg2,0x0);
 
  967             reg2=_mm_shuffle_pd(reg2,reg2,0x3);
 
  968             reg4=_mm_mul_pd(reg4,reg0);
 
  969             reg2=_mm_mul_pd(reg2,reg0);
 
  970             reg4=_mm_shuffle_pd(reg4,reg4,0x1);
 
  971             reg0=_mm_addsub_pd(reg2,reg4);
 
  972             reg4=_mm_shuffle_pd(reg3,reg3,0x0);
 
  973             reg3=_mm_shuffle_pd(reg3,reg3,0x3);
 
  974             reg4=_mm_mul_pd(reg4,reg1);
 
  975             reg3=_mm_mul_pd(reg3,reg1);
 
  976             reg4=_mm_shuffle_pd(reg4,reg4,0x1);
 
  977             reg1=_mm_addsub_pd(reg3,reg4);
 
  // fold both products into the accumulator
  982             sumReg=_mm_add_pd(sumReg,reg0);
 
  983             sumReg=_mm_add_pd(sumReg,reg1);
 
  // second `sum` declaration; presumably in a separate scope whose braces
  // are not shown
  991           std::complex<double> 
sum(0);
 
  // Scratch array of two complex<double>; only one is written by the store.
  // The pointer is the address of `sums` rounded down to a multiple of 16 and
  // then moved 16 bytes up, giving a 16-byte-aligned target for _mm_store_pd.
  996           std::complex<double> sums[2];
 
  997           std::complex<double>* pSums=(std::complex<double>*)((((
vcl_size_t)sums)&(~15))+16);
 
  // swap the two lanes so the stored layout is (real, imag)
  998           sumReg=_mm_shuffle_pd(sumReg,sumReg,0x1);
 
  999           _mm_store_pd((
double*)pSums,sumReg);
 
  // combine the partial complex sum with `sum`
 1001           return sum+pSums[0];
 
 1005   #endif //defined VIENNACL_WITH_COMPLEX 
 1007   #endif //defined VIENNACL_WITH_SSE2 
void _axpy(const T *, T *, vcl_size_t, T)
T _dot(vcl_size_t, const T *, const T *)
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
void _swap(vcl_size_t, T *, T *)
T _nrm2(const T *, vcl_size_t)
T _dotc(vcl_size_t, const T *, const T *)
void _copy(vcl_size_t, T *, T *)