#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_

/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
    @brief Second step of the bisection algorithm for the computation of eigenvalues of large symmetric, tridiagonal matrices:
           processing of intervals that still contain more than one eigenvalue.

    Implementation based on the bisection sample provided with the NVIDIA CUDA SDK.
*/

#include "viennacl/linalg/cuda/bisect_util.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

/** @brief Perform the second step of the bisection algorithm for large matrices for intervals that contained more than one eigenvalue after the first step.
*
* @param g_d              diagonal elements of the symmetric, tridiagonal matrix
* @param g_s              superdiagonal elements of the symmetric, tridiagonal matrix
* @param n                matrix size
* @param blocks_mult      start addresses of the blocks of intervals processed by one thread block
* @param blocks_mult_sum  output offset (accumulated number of eigenvalues) for each block of intervals
* @param g_left           left limits of the intervals
* @param g_right          right limits of the intervals
* @param g_left_count     number of eigenvalues smaller than the left limits
* @param g_right_count    number of eigenvalues smaller than the right limits
* @param g_lambda         resulting eigenvalues
* @param g_pos            index of each eigenvalue (in ascending order)
* @param precision        desired precision of the eigenvalues
*/
template<typename NumericT>
__global__
void
bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                                unsigned int *blocks_mult,
                                unsigned int *blocks_mult_sum,
                                NumericT *g_left, NumericT *g_right,
                                unsigned int *g_left_count,
                                unsigned int *g_right_count,
                                NumericT *g_lambda, unsigned int *g_pos,
                                NumericT precision)
{
  const unsigned int tid = threadIdx.x;

  // left and right limits of the intervals
  __shared__  NumericT  s_left [2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // number of eigenvalues smaller than the interval limits
  __shared__  unsigned int  s_left_count [2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__  unsigned int  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // helper array for stream compaction of the second chunk of intervals;
  // s_compaction_list_exc is the exclusive variant (shifted by one entry)
  __shared__  unsigned int  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  unsigned int *s_compaction_list_exc = s_compaction_list + 1;
  // flag: have all threads converged?
  __shared__  unsigned int  all_threads_converged;
  // number of currently active threads / intervals
  __shared__  unsigned int  num_threads_active;
  // number of threads to employ for the compaction step
  __shared__  unsigned int  num_threads_compaction;
  // flag: does the second chunk of intervals have to be compacted?
  __shared__  unsigned int  compact_second_chunk;

  // parameters of the block of intervals processed by this block of threads
  __shared__  unsigned int  c_block_start;
  __shared__  unsigned int  c_block_end;
  __shared__  unsigned int  c_block_offset_output;

  // midpoint of the interval currently processed by this thread
  NumericT      mid = 0.0f;
  // number of eigenvalues smaller than mid
  unsigned int  mid_count = 0;
  // parameters of the current interval
  NumericT      left  = 0.0f;
  NumericT      right = 0.0f;
  unsigned int  left_count  = 0;
  unsigned int  right_count = 0;
  // helper for compaction: tracks whether this thread created a second child interval
  unsigned int  is_active_second = 0;
 
  // initialize common start conditions
  if (0 == tid)
  {
    c_block_start         = blocks_mult[blockIdx.x];
    c_block_end           = blocks_mult[blockIdx.x + 1];
    c_block_offset_output = blocks_mult_sum[blockIdx.x];

    num_threads_active = c_block_end - c_block_start;
    s_compaction_list[0] = 0;
    num_threads_compaction = ceilPow2(num_threads_active);

    all_threads_converged = 1;
    compact_second_chunk  = 0;
  }
  // sentinel values to catch accesses to uninitialized entries
  s_left_count [tid] = 42;
  s_right_count[tid] = 42;
  s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
  s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;

  __syncthreads();

  // read interval data into shared memory
  if (tid < num_threads_active)
  {
    s_left[tid]  = g_left[c_block_start + tid];
    s_right[tid] = g_right[c_block_start + tid];
    s_left_count[tid]  = g_left_count[c_block_start + tid];
    s_right_count[tid] = g_right_count[c_block_start + tid];
  }
  __syncthreads();

  unsigned int iter = 0;  // iteration counter (not otherwise used)
  // bisect until all threads have converged
  while (true)
  {
    iter++;

    // clear the compaction list
    s_compaction_list[threadIdx.x] = 0;
    s_compaction_list[threadIdx.x + blockDim.x] = 0;
    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;

    // subdivide the interval if it is active and not already converged
    subdivideActiveIntervalMulti(tid, s_left, s_right,
                                 s_left_count, s_right_count,
                                 num_threads_active,
                                 left, right, left_count, right_count,
                                 mid, all_threads_converged);
    __syncthreads();
    // stop if all eigenvalues have been found
    if (1 == all_threads_converged)
    {
      break;
    }

    // compute the number of eigenvalues smaller than mid for all active,
    // not yet converged intervals; s_left and s_right serve as scratch space
    // for the matrix data since they are not needed until after this step
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, tid, num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();
    // store the resulting intervals
    if (tid < num_threads_active)
    {
      if (left != right)
      {
        // interval not yet converged: store the non-empty child intervals
        storeNonEmptyIntervals(tid, num_threads_active,
                               s_left, s_right, s_left_count, s_right_count,
                               left, mid, right,
                               left_count, mid_count, right_count,
                               precision, compact_second_chunk,
                               s_compaction_list_exc,
                               is_active_second);
      }
      else
      {
        // interval converged to a single eigenvalue
        storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
                               left, mid, right,
                               left_count, mid_count, right_count,
                               s_compaction_list_exc, compact_second_chunk,
                               num_threads_active,
                               is_active_second);
      }
    }

    __syncthreads();
    // compact the second chunk of intervals if any thread generated two child intervals
    if (1 == compact_second_chunk)
    {
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);

      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update the state variables for the next iteration
    if (0 == tid)
    {
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk  = 0;
      all_threads_converged = 1;
    }

    __syncthreads();
 
    // clear the exclusive compaction list for the next iteration
    s_compaction_list_exc[threadIdx.x] = 0;
    s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;

    // more active intervals than threads in this block cannot be processed
    if (num_threads_compaction > blockDim.x)
    {
      break;
    }
  }  // end while: all threads converged
  // write the converged eigenvalues and their indices back to global memory
  if (tid < num_threads_active)
  {
    unsigned int addr = c_block_offset_output + tid;

    g_lambda[addr] = s_left[tid];
    g_pos[addr]    = s_right_count[tid];
  }
}

} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
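
/* Illustrative usage sketch (not part of the header above): a minimal, hypothetical
   host-side wrapper showing how this kernel could be launched, assuming one CUDA
   block per block of multi-eigenvalue intervals and VIENNACL_BISECT_MAX_THREADS_BLOCK
   threads per block, with all pointers already residing in device memory as prepared
   by the first bisection step.  The wrapper name, its parameter list, and the final
   cudaDeviceSynchronize() are assumptions for illustration only; the actual host-side
   driver code in ViennaCL lives elsewhere.  Such a snippet would belong in a separate
   host-side .cu file, not in this header. */

#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"

template<typename NumericT>
void launch_bisect_mult_intervals(const NumericT *d_diag, const NumericT *d_superdiag,
                                  unsigned int n,
                                  unsigned int *d_blocks_mult,      // interval-block boundaries (num_blocks_mult + 1 entries)
                                  unsigned int *d_blocks_mult_sum,  // output offset of each interval block
                                  NumericT *d_left, NumericT *d_right,
                                  unsigned int *d_left_count, unsigned int *d_right_count,
                                  NumericT *d_lambda, unsigned int *d_pos,
                                  NumericT precision,
                                  unsigned int num_blocks_mult)     // number of interval blocks produced by step one
{
  // one CUDA block per block of intervals; blockIdx.x indexes d_blocks_mult
  dim3 grid(num_blocks_mult, 1, 1);
  dim3 threads(VIENNACL_BISECT_MAX_THREADS_BLOCK, 1, 1);

  viennacl::linalg::cuda::bisectKernelLarge_MultIntervals<<<grid, threads>>>(
      d_diag, d_superdiag, n,
      d_blocks_mult, d_blocks_mult_sum,
      d_left, d_right, d_left_count, d_right_count,
      d_lambda, d_pos, precision);

  cudaDeviceSynchronize();  // wait for the kernel; error checking omitted in this sketch
}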