#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
/** @brief Write data to global memory */
template<typename NumericT>
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
                 const unsigned int num_threads_active,
                 const unsigned int num_blocks_mult,
                 NumericT *g_left_one, NumericT *g_right_one,
                 unsigned int *g_pos_one,
                 NumericT *g_left_mult, NumericT *g_right_mult,
                 unsigned int *g_left_count_mult,
                 unsigned int *g_right_count_mult,
                 NumericT *s_left, NumericT *s_right,
                 unsigned short *s_left_count, unsigned short *s_right_count,
                 unsigned int *g_blocks_mult,
                 unsigned int *g_blocks_mult_sum,
                 unsigned short *s_compaction_list,
                 unsigned short *s_cl_helper,
                 unsigned int offset_mult_lambda)
{
  if (tid < offset_mult_lambda)
  {
    // interval containing exactly one eigenvalue
    g_left_one[tid]  = s_left[tid];
    g_right_one[tid] = s_right[tid];
    // the right count can be used because it equals the left count once an
    // interval has converged to a single eigenvalue
    g_pos_one[tid] = s_right_count[tid];
  }
  else
  {
    // interval containing multiple eigenvalues
    g_left_mult[tid - offset_mult_lambda]        = s_left[tid];
    g_right_mult[tid - offset_mult_lambda]       = s_right[tid];
    g_left_count_mult[tid - offset_mult_lambda]  = s_left_count[tid];
    g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
  }

  // each thread also processes a second interval, offset by blockDim.x
  if (tid_2 < num_threads_active)
  {
    if (tid_2 < offset_mult_lambda)
    {
      g_left_one[tid_2]  = s_left[tid_2];
      g_right_one[tid_2] = s_right[tid_2];
      g_pos_one[tid_2]   = s_right_count[tid_2];
    }
    else
    {
      g_left_mult[tid_2 - offset_mult_lambda]        = s_left[tid_2];
      g_right_mult[tid_2 - offset_mult_lambda]       = s_right[tid_2];
      g_left_count_mult[tid_2 - offset_mult_lambda]  = s_left_count[tid_2];
      g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
    }
  }

  // write out the block list for the multi-eigenvalue intervals
  if (tid <= num_blocks_mult)
  {
    g_blocks_mult[tid]     = s_compaction_list[tid];
    g_blocks_mult_sum[tid] = s_cl_helper[tid];
  }

  if (tid_2 <= num_blocks_mult)
  {
    g_blocks_mult[tid_2]     = s_compaction_list[tid_2];
    g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
  }
}
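// Worked example (values are illustrative only): with offset_mult_lambda == 3
// and num_threads_active == 5, threads 0..2 hold isolated eigenvalues and copy
// their data to g_left_one / g_right_one / g_pos_one at indices 0..2, while
// threads 3..4 hold multi-eigenvalue intervals and copy their data to the
// *_mult arrays at indices 0..1. The same mapping is applied a second time for
// tid_2 = tid + blockDim.x, so one thread block moves up to 2 * blockDim.x
// intervals to global memory.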
 
/** @brief Perform final stream compaction before writing data to global memory */
template<typename NumericT>
__device__
void compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
                         const unsigned int num_threads_active,
                         unsigned int &offset_mult_lambda,
                         NumericT *s_left, NumericT *s_right,
                         unsigned short *s_left_count, unsigned short *s_right_count,
                         unsigned short *s_cl_one, unsigned short *s_cl_mult,
                         unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
                         unsigned int is_one_lambda, unsigned int is_one_lambda_2,
                         NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
                         unsigned int &left_count, unsigned int &right_count,
                         unsigned int &left_count_2, unsigned int &right_count_2,
                         unsigned int c_block_iend, unsigned int c_sum_block,
                         unsigned int c_block_iend_2, unsigned int c_sum_block_2)
{
  // cache the interval data before the shared arrays are overwritten
  left  = s_left[tid];
  right = s_right[tid];

  if (tid_2 < num_threads_active)
  {
    left_2  = s_left[tid_2];
    right_2 = s_right[tid_2];
  }

  // determine the compact write addresses: intervals with one eigenvalue go
  // to the first offset_mult_lambda slots, intervals with multiple
  // eigenvalues are placed behind them
  unsigned int ptr_w = 0;
  unsigned int ptr_w_2 = 0;
  unsigned int ptr_blocking_w = 0;
  unsigned int ptr_blocking_w_2 = 0;

  ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
                               : s_cl_mult[tid] + offset_mult_lambda;

  if (0 != c_block_iend)
    ptr_blocking_w = s_cl_blocking[tid];

  if (tid_2 < num_threads_active)
  {
    ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
                                     : s_cl_mult[tid_2] + offset_mult_lambda;

    if (0 != c_block_iend_2)
      ptr_blocking_w_2 = s_cl_blocking[tid_2];
  }

  // make sure all threads have read their data before it is overwritten
  __syncthreads();

  // store compactly in shared memory
  if (tid < num_threads_active)
  {
    s_left[ptr_w]        = left;
    s_right[ptr_w]       = right;
    s_left_count[ptr_w]  = left_count;
    s_right_count[ptr_w] = right_count;
  }

  if (0 != c_block_iend)
  {
    // record the end of a block of multi-eigenvalue intervals together with
    // the number of eigenvalues in that block
    s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
    s_cl_helper[ptr_blocking_w + 1]   = c_sum_block;
  }

  if (tid_2 < num_threads_active)
  {
    s_left[ptr_w_2]        = left_2;
    s_right[ptr_w_2]       = right_2;
    s_left_count[ptr_w_2]  = left_count_2;
    s_right_count[ptr_w_2] = right_count_2;

    if (0 != c_block_iend_2)
    {
      s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
      s_cl_helper[ptr_blocking_w_2 + 1]   = c_sum_block_2;
    }
  }
}
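// Routing note (illustrative): offset_mult_lambda equals the total number of
// intervals that contain exactly one eigenvalue. Intervals with multiplicity 1
// are therefore packed into the first offset_mult_lambda entries of the shared
// arrays (addresses taken from s_cl_one), and intervals with multiple
// eigenvalues are packed behind them (addresses taken from s_cl_mult plus the
// offset). A non-zero c_block_iend additionally records the end of a block of
// multi-eigenvalue intervals in s_cl_blocking / s_cl_helper.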
 
/** @brief Compute addresses to obtain compact list of block start addresses */
inline __device__
void scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
                                   const unsigned int num_threads_compaction,
                                   unsigned short *s_cl_blocking,
                                   unsigned short *s_cl_helper)
{
  // copy the block flags so that the scan can be performed in place
  s_cl_blocking[tid] = s_cl_helper[tid];

  if (tid_2 < num_threads_compaction)
    s_cl_blocking[tid_2] = s_cl_helper[tid_2];

  __syncthreads();

  // prefix sum over the flags: up-sweep (reduction) phase
  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int  ai = offset*(2*tid+1)-1;
      unsigned int  bi = offset*(2*tid+2)-1;
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }

    offset <<= 1;
  }

  // down-sweep: distribute the partial sums to the remaining positions
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int  ai = offset*(tid+1) - 1;
      unsigned int  bi = ai + (offset >> 1);
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }
  }
}
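// For reference: the two loops above form a work-efficient parallel prefix
// sum (an up-sweep reduction followed by a down-sweep that distributes the
// partial sums). A serial sketch computing the same inclusive prefix sum --
// illustrative only, not part of this header; 'a' and 'n' are hypothetical
// names:
//
//   void inclusive_prefix_sum(unsigned short *a, unsigned int n)
//   {
//     for (unsigned int i = 1; i < n; ++i)
//       a[i] = static_cast<unsigned short>(a[i] + a[i - 1]);  // a[i] = sum of a[0..i]
//   }
//
// After the scan, s_cl_blocking[i] holds the number of flagged entries among
// the first i+1 positions, which the kernel then uses as write addresses when
// assembling the compact list of block start addresses.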
 
/** @brief Perform scan to obtain number of eigenvalues before a specific block */
inline __device__
void scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
                   const unsigned int num_threads_active,
                   const unsigned int num_threads_compaction,
                   unsigned short *s_cl_blocking,
                   unsigned short *s_cl_helper)
{
  unsigned int offset = 1;

  // build scan tree
  for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }

    offset <<= 1;
  }

  // traverse down tree: first two elements
  for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }
  }

  __syncthreads();

  if (0 == tid)
  {
    // move the last elements to the position of the last active thread:
    // the scan uses a power-of-two number of threads, which need not equal
    // the number of active threads
    s_cl_helper[num_threads_active - 1]   = s_cl_helper[num_threads_compaction - 1];
    s_cl_blocking[num_threads_active - 1] = s_cl_blocking[num_threads_compaction - 1];
  }
}
 
/** @brief Initial scan that compacts the lists of intervals containing one
 *         and multiple eigenvalues and simultaneously builds the blocks of
 *         intervals processed together in the second step */
inline __device__
void scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size,
                 const unsigned int num_threads_active,
                 const unsigned int num_threads_compaction,
                 unsigned short *s_cl_one, unsigned short *s_cl_mult,
                 unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
{
  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int  ai = offset*(2*tid+1);
      unsigned int  bi = offset*(2*tid+2)-1;

      s_cl_one[bi]  = s_cl_one[bi] + s_cl_one[ai - 1];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

      // s_cl_helper flags entries whose block has been closed;
      // s_cl_blocking accumulates the eigenvalue count of the open block
      if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
      {
        if (s_cl_helper[ai - 1] == 1)
        {
          // left child already closed: close the right child as well
          s_cl_helper[bi] = 1;
        }
        else if (s_cl_helper[bi] == 1)
        {
          // right child already closed: close the left child as well
          s_cl_helper[ai - 1] = 1;
        }
        else
        {
          // both children still belong to the block under construction
          unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];

          if (temp > VIENNACL_BISECT_MAX_THREADS_BLOCK)
          {
            // too many eigenvalues for a single thread block: close both
            s_cl_helper[ai - 1] = 1;
            s_cl_helper[bi]     = 1;
          }
          else
          {
            // continue the scan: accumulate into the right child
            s_cl_blocking[bi]     = temp;
            s_cl_blocking[ai - 1] = 0;
          }
        }
      }
    }

    offset <<= 1;
  }

  // traverse down tree to complete the scan of s_cl_one and s_cl_mult
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int  ai = offset*(tid+1) - 1;
      unsigned int  bi = ai + (offset >> 1);

      s_cl_one[bi]  = s_cl_one[bi] + s_cl_one[ai];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
    }
  }
}
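// Worked example (illustrative numbers): suppose four active intervals whose
// "exactly one eigenvalue" flags are s_cl_one = [1, 0, 1, 0] and whose
// "multiple eigenvalues" flags are s_cl_mult = [0, 1, 0, 1]. The scans turn
// these into running counts, s_cl_one = [1, 1, 2, 2] and
// s_cl_mult = [0, 1, 1, 2]. compactStreamsFinal() later uses these counts as
// compact write addresses, and offset_mult_lambda = s_cl_one[last] = 2
// separates the group of isolated eigenvalues from the group of
// multi-eigenvalue intervals.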
 
/** @brief Store the child intervals of a subdivided interval and flag the
 *         second child for the subsequent compaction pass */
template<typename NumericT>
__device__
void storeNonEmptyIntervalsLarge(unsigned int addr,
                                 const unsigned int num_threads_active,
                                 NumericT *s_left, NumericT *s_right,
                                 unsigned short *s_left_count,
                                 unsigned short *s_right_count,
                                 NumericT left, NumericT mid, NumericT right,
                                 const unsigned short left_count,
                                 const unsigned short mid_count,
                                 const unsigned short right_count,
                                 NumericT epsilon,
                                 unsigned int &compact_second_chunk,
                                 unsigned short *s_compaction_list,
                                 unsigned int &is_active_second)
{
  // check if both child intervals contain eigenvalues
  if ((left_count != mid_count) && (mid_count != right_count))
  {
    // store the left child in place
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                  left, mid, left_count, mid_count, epsilon);

    // mark that a second child exists and has to be compacted later
    is_active_second = 1;
    s_compaction_list[threadIdx.x] = 1;
    compact_second_chunk = 1;
  }
  else
  {
    // only one child interval is non-empty
    is_active_second = 0;
    s_compaction_list[threadIdx.x] = 0;

    // store the non-empty child interval in place
    if (left_count != mid_count)
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    left, mid, left_count, mid_count, epsilon);
    }
    else
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    mid, right, mid_count, right_count, epsilon);
    }
  }
}
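// Worked example (illustrative numbers): for an interval with
// left_count = 2, mid_count = 4 and right_count = 7, the left child
// (left, mid] contains eigenvalues 3..4 and the right child (mid, right]
// contains eigenvalues 5..7, so the left child is stored in place and the
// right child is flagged for the compaction pass. If instead
// mid_count == right_count, the right child is empty and only the left child
// is kept (and vice versa).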
 
/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal
 *         matrix; g_d holds the diagonal and g_s the superdiagonal elements
 *         in global memory. This first step determines coarse intervals that
 *         contain either one isolated eigenvalue or multiple eigenvalues. */
template<typename NumericT>
__global__
void bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                       const NumericT lg, const NumericT ug,
                       const unsigned int lg_eig_count, const unsigned int ug_eig_count,
                       NumericT epsilon,
                       unsigned int *g_num_one,
                       unsigned int *g_num_blocks_mult,
                       NumericT *g_left_one, NumericT *g_right_one,
                       unsigned int *g_pos_one,
                       NumericT *g_left_mult, NumericT *g_right_mult,
                       unsigned int *g_left_count_mult,
                       unsigned int *g_right_count_mult,
                       unsigned int *g_blocks_mult,
                       unsigned int *g_blocks_mult_sum)
{
  const unsigned int tid = threadIdx.x;

  // intervals (left and right bounds are stored explicitly because the
  // subdivision tree is in general not dense)
  __shared__ NumericT       s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ NumericT       s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // number of eigenvalues that are smaller than s_left / s_right
  // (correspondence is realized via indices)
  __shared__ unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // helper array for stream compaction
  __shared__ unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // flag if a second chunk of child intervals has to be compacted
  __shared__ unsigned int compact_second_chunk;

  // flag if all threads are converged
  __shared__ unsigned int all_threads_converged;

  // number of currently active threads / intervals
  __shared__ unsigned int num_threads_active;

  // number of threads employed for the compaction scans
  __shared__ unsigned int num_threads_compaction;

  // helper for in-place exclusive scan
  unsigned short *s_compaction_list_exc = s_compaction_list + 1;

  // variables for the interval currently processed by this thread
  NumericT left  = 0;
  NumericT right = 0;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // midpoint of the active interval and number of eigenvalues smaller than mid
  NumericT mid = 0;
  unsigned int mid_count = 0;
  // helper for stream compaction of the second child interval
  unsigned int is_active_second = 0;

  s_compaction_list[tid] = 0;
  s_left[tid] = 0;
  s_right[tid] = 0;
  s_left_count[tid] = 0;
  s_right_count[tid] = 0;

  __syncthreads();

  // set up the initial configuration: one interval spanning (lg, ug]
  if (0 == tid)
  {
    s_left[0]  = lg;
    s_right[0] = ug;
    s_left_count[0]  = lg_eig_count;
    s_right_count[0] = ug_eig_count;

    compact_second_chunk = 0;
    num_threads_active = 1;
    num_threads_compaction = 1;

    all_threads_converged = 1;
  }

  __syncthreads();

  // subdivide the intervals level by level until either all threads have
  // converged or more intervals exist than threads in the block
  while (true)
  {
    s_compaction_list[tid] = 0;

    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active,
                            left, right, left_count, right_count,
                            mid, all_threads_converged);

    __syncthreads();

    // check if done
    if (1 == all_threads_converged)
      break;

    // compute the number of eigenvalues smaller than mid; s_left and s_right
    // are re-used as scratch space for the matrix data
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, threadIdx.x,
                                                num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();

    if (tid < num_threads_active)
    {
      // store the child intervals; the first child is always stored in place
      storeNonEmptyIntervalsLarge(tid, num_threads_active,
                                  s_left, s_right,
                                  s_left_count, s_right_count,
                                  left, mid, right,
                                  left_count, mid_count, right_count,
                                  epsilon, compact_second_chunk,
                                  s_compaction_list_exc,
                                  is_active_second);
    }
    else
    {
      // re-write the interval data of inactive threads because s_left and
      // s_right have been used as scratch space above
      s_left[tid]  = left;
      s_right[tid] = right;
      s_left_count[tid]  = left_count;
      s_right_count[tid] = right_count;

      is_active_second = 0;
    }

    __syncthreads();

    // perform compaction of the chunk where the second child intervals
    // are stored
    if (compact_second_chunk > 0)
    {
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
    }
    __syncthreads();

    if (compact_second_chunk > 0)
    {
      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update the state variables for the next level of the subdivision tree
    if (0 == tid)
    {
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    __syncthreads();

    // stop as soon as the block cannot process all intervals in parallel
    if (num_threads_compaction > blockDim.x)
      break;
  }

  __syncthreads();

  // generate two lists of intervals: one with the isolated eigenvalues and
  // one with the intervals containing multiple eigenvalues; the latter are
  // grouped into blocks for the second processing step

  unsigned int left_count_2;
  unsigned int right_count_2;
  NumericT left_2, right_2;

  // this thread also processes the interval at tid + blockDim.x
  unsigned int tid_2 = tid + blockDim.x;

  // cache the eigenvalue counts of the first chunk
  left_count  = s_left_count[tid];
  right_count = s_right_count[tid];

  // cache the eigenvalue counts of the second chunk
  if (tid_2 < num_threads_active)
  {
    left_count_2  = s_left_count[tid_2];
    right_count_2 = s_right_count[tid_2];
  }

  // helper structures for the scans; the pointers are shifted by one element
  // so that the scans can be performed in place
  unsigned short *s_cl_one  = s_left_count + 1;
  unsigned short *s_cl_mult = s_right_count + 1;

  unsigned short *s_cl_blocking = s_compaction_list_exc;
  __shared__ unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  if (0 == tid)
  {
    // set the leading elements to 0 so that the shifted scans are exclusive
    s_left_count[0]  = 0;
    s_right_count[0] = 0;
  }

  __syncthreads();

  // flag the intervals that contain exactly one eigenvalue

  unsigned int is_one_lambda = 0;
  unsigned int is_one_lambda_2 = 0;

  unsigned int multiplicity = right_count - left_count;
  is_one_lambda = (1 == multiplicity);

  s_cl_one[tid]  = is_one_lambda;
  s_cl_mult[tid] = (! is_one_lambda);

  // s_cl_blocking is non-zero only for intervals with multiple eigenvalues
  s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
  s_cl_helper[tid] = 0;

  if (tid_2 < num_threads_active)
  {
    unsigned int multiplicity = right_count_2 - left_count_2;
    is_one_lambda_2 = (1 == multiplicity);

    s_cl_one[tid_2]  = is_one_lambda_2;
    s_cl_mult[tid_2] = (! is_one_lambda_2);

    s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
    s_cl_helper[tid_2] = 0;
  }
  else
  {
    // clear the remaining entries used by the scans
    s_cl_blocking[tid_2] = 0;
    s_cl_helper[tid_2] = 0;
  }

  // scan over the flags and simultaneously build the blocks of intervals
  // with multiple eigenvalues
  scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
              s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);

  __syncthreads();

  // obtain the number of eigenvalues before each block
  scanSumBlocks(tid, tid_2, num_threads_active,
                num_threads_compaction, s_cl_blocking, s_cl_helper);

  // end of a block of intervals with multiple eigenvalues and number of
  // eigenvalues in that block
  unsigned int  c_block_iend = 0;
  unsigned int  c_block_iend_2 = 0;
  unsigned int  c_sum_block = 0;
  unsigned int  c_sum_block_2 = 0;

  __syncthreads();

  if (1 == s_cl_helper[tid])
  {
    c_block_iend = s_cl_mult[tid] + 1;
    c_sum_block  = s_cl_blocking[tid];
  }

  if (1 == s_cl_helper[tid_2])
  {
    c_block_iend_2 = s_cl_mult[tid_2] + 1;
    c_sum_block_2  = s_cl_blocking[tid_2];
  }

  // compute the start addresses of the blocks
  scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
                                s_cl_blocking, s_cl_helper);

  // total number of blocks, of multi-eigenvalue intervals, and the offset
  // that separates isolated eigenvalues from multi-eigenvalue intervals
  __shared__ unsigned int num_blocks_mult;
  __shared__ unsigned int num_mult;
  __shared__ unsigned int offset_mult_lambda;

  if (0 == tid)
  {
    num_blocks_mult    = s_cl_blocking[num_threads_active - 1];
    offset_mult_lambda = s_cl_one[num_threads_active - 1];
    num_mult           = s_cl_mult[num_threads_active - 1];

    *g_num_one         = offset_mult_lambda;
    *g_num_blocks_mult = num_blocks_mult;
  }

  __syncthreads();

  // perform the final compaction in shared memory
  compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
                      s_left, s_right, s_left_count, s_right_count,
                      s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
                      is_one_lambda, is_one_lambda_2,
                      left, right, left_2, right_2,
                      left_count, right_count, left_count_2, right_count_2,
                      c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2);

  __syncthreads();

  if (0 == tid)
  {
    s_cl_blocking[num_blocks_mult] = num_mult;
  }

  __syncthreads();

  // write the result to global memory
  writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
              g_left_one, g_right_one, g_pos_one,
              g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
              s_left, s_right, s_left_count, s_right_count,
              g_blocks_mult, g_blocks_mult_sum,
              s_compaction_list, s_cl_helper, offset_mult_lambda);
}
 
// ---------------------------------------------------------------------------
// Helper functions referenced above; their definitions are provided by
// ViennaCL's bisection utility code, not by this header:
//
//   VIENNACL_BISECT_MAX_THREADS_BLOCK
//       Global configuration parameter.
//
//   __device__ int ceilPow2(int n)
//
//   __device__ void storeInterval(unsigned int addr,
//                     NumericT *s_left, NumericT *s_right,
//                     T *s_left_count, T *s_right_count,
//                     NumericT left, NumericT right,
//                     S left_count, S right_count, NumericT precision)
//
//   __device__ unsigned int computeNumSmallerEigenvalsLarge(
//                     const NumericT *g_d, const NumericT *g_s, const unsigned int n,
//                     const NumericT x, const unsigned int tid,
//                     const unsigned int num_intervals_active,
//                     NumericT *s_d, NumericT *s_s, unsigned int converged)
//
//   __device__ void subdivideActiveInterval(const unsigned int tid,
//                     NumericT *s_left, NumericT *s_right,
//                     T *s_left_count, T *s_right_count,
//                     const unsigned int num_threads_active,
//                     NumericT &left, NumericT &right,
//                     unsigned int &left_count, unsigned int &right_count,
//                     NumericT &mid, unsigned int &all_threads_converged)
//       Subdivide interval if active and not already converged.
//
//   __device__ void compactIntervals(NumericT *s_left, NumericT *s_right,
//                     T *s_left_count, T *s_right_count,
//                     NumericT mid, NumericT right,
//                     unsigned int mid_count, unsigned int right_count,
//                     T *s_compaction_list, unsigned int num_threads_active,
//                     unsigned int is_active_second)
//       Perform stream compaction for second child intervals.
//
//   __device__ void createIndicesCompaction(T *s_compaction_list_exc,
//                     unsigned int num_threads_compaction)
// ---------------------------------------------------------------------------

#endif // VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
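// ---------------------------------------------------------------------------
// Usage sketch (illustrative only): a minimal host-side launch of
// bisectKernelLarge for a matrix of size n. Everything except the kernel name
// and its parameter list is an assumption -- the buffer names, the launch
// configuration and the choice lg_eig_count = 0 / ug_eig_count = n (valid when
// (lg, ug] encloses all eigenvalues, e.g. via a Gerschgorin bound) are
// hypothetical and do not reflect ViennaCL's actual driver code.
//
//   // device buffers, assumed to be allocated and filled beforehand:
//   //   d_diag      : n diagonal elements
//   //   d_superdiag : n superdiagonal elements
//   //   remaining buffers: outputs of this first bisection phase
//   float eps = 1e-6f;
//   bisectKernelLarge<float><<<1, VIENNACL_BISECT_MAX_THREADS_BLOCK>>>(
//       d_diag, d_superdiag, n,
//       lg, ug,              // interval enclosing all eigenvalues
//       0u, n,               // eigenvalue counts at lg and ug
//       eps,
//       d_num_one, d_num_blocks_mult,
//       d_left_one, d_right_one, d_pos_one,
//       d_left_mult, d_right_mult,
//       d_left_count_mult, d_right_count_mult,
//       d_blocks_mult, d_blocks_mult_sum);
//   cudaDeviceSynchronize();
//
// The outputs then drive the second phase: intervals holding a single
// eigenvalue are refined independently, while each block listed in
// d_blocks_mult is handed to one thread block of the follow-up kernel.
// ---------------------------------------------------------------------------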