1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_ 
    2 #define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_ 
   49   template <
typename StringType>
 
   53     source.append(
"     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK                256\n");
 
   54     source.append(
"     #define  VIENNACL_BISECT_MAX_SMALL_MATRIX                 256\n");
 
   55     source.append(
"     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   256\n");
 
   56     source.append(
"     #define  VIENNACL_BISECT_MIN_ABS_INTERVAL                 5.0e-37\n");
 
   65   template <
typename StringType>
 
   69   source.append(
"     inline int  \n");
 
   70   source.append(
"     floorPow2(int n)  \n");
 
   71   source.append(
"     {  \n");
 
   72   source.append(
"         uint glb_id = get_global_id(0); \n");
 
   73   source.append(
"         uint grp_id = get_group_id(0);  \n");
 
   74   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
   75   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
   76   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
   80   source.append(
"         if (0 == (n & (n-1)))  \n");
 
   81   source.append(
"         {  \n");
 
   82   source.append(
"             return n;  \n");
 
   83   source.append(
"         }  \n");
 
   85   source.append(
"         int exp;  \n");
 
   86   source.append(
"         frexp(( "); source.append(numeric_string); source.append(
" )n, &exp);  \n");
 
   87   source.append(
"         return (1 << (exp - 1));  \n");
 
   88   source.append(
"     }  \n");
 
   98   template <
typename StringType>
 
  101   source.append(
"       \n");
 
  102   source.append(
"     inline int  \n");
 
  103   source.append(
"     ceilPow2(int n)  \n");
 
  104   source.append(
"     {  \n");
 
  105   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  106   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  107   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  108   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  109   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  113   source.append(
"         if (0 == (n & (n-1)))  \n");
 
  114   source.append(
"         {  \n");
 
  115   source.append(
"             return n;  \n");
 
  116   source.append(
"         }  \n");
 
  118   source.append(
"         int exp;  \n");
 
  119   source.append(
"         frexp(( "); source.append(numeric_string); source.append(
" )n, &exp);  \n");
 
  120   source.append(
"         return (1 << exp);  \n");
 
  121   source.append(
"     }  \n");
 
  132   template <
typename StringType>
 
  135   source.append(
"       \n");
 
  136   source.append(
"     inline "); source.append(numeric_string); source.append(
" \n");
 
  137   source.append(
"     computeMidpoint(const "); source.append(numeric_string); source.append(
" left,\n");
 
  138   source.append(
"       const "); source.append(numeric_string); source.append(
"  right)  \n");
 
  139   source.append(
"     {  \n");
 
  140   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  141   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  142   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  143   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  144   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  145   source.append(
"          "); source.append(numeric_string); source.append(
"  mid;  \n");
 
  147   source.append(
"         if (sign(left) == sign(right))  \n");
 
  148   source.append(
"         {  \n");
 
  149   source.append(
"             mid = left + (right - left) * 0.5f;  \n");
 
  150   source.append(
"         }  \n");
 
  151   source.append(
"         else  \n");
 
  152   source.append(
"         {  \n");
 
  153   source.append(
"             mid = (left + right) * 0.5f;  \n");
 
  154   source.append(
"         }  \n");
 
  156   source.append(
"         return mid;  \n");
 
  157   source.append(
"     }  \n");
 
  177   template<
typename StringType>
 
  180   source.append(
"     \n");
 
  181   source.append(
"     void  \n");
 
  182   source.append(
"     storeInterval(unsigned int addr,  \n");
 
  183   source.append(
"                   __local "); source.append(numeric_string); source.append(
" * s_left,   \n");
 
  184   source.append(
"                   __local "); source.append(numeric_string); source.append(
" * s_right,  \n");
 
  185   source.append(
"                   __local unsigned int * s_left_count,  \n");
 
  186   source.append(
"                   __local unsigned int * s_right_count,  \n");
 
  187   source.append(
"                    "); source.append(numeric_string); source.append(
" left,   \n");
 
  188   source.append(
"                    "); source.append(numeric_string); source.append(
" right,  \n");
 
  189   source.append(
"                   unsigned int left_count, \n");
 
  190   source.append(
"                   unsigned int right_count,  \n");
 
  191   source.append(
"                    "); source.append(numeric_string); source.append(
"  precision)  \n");
 
  192   source.append(
"     {  \n");
 
  193   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  194   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  195   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  196   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  197   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  199   source.append(
"         s_left_count[addr] = left_count;  \n");
 
  200   source.append(
"         s_right_count[addr] = right_count;  \n");
 
  203   source.append(
"          "); source.append(numeric_string); source.append(
" t0 = fabs(right - left);  \n");
 
  204   source.append(
"          "); source.append(numeric_string); source.append(
" t1 = max(fabs(left), fabs(right)) * precision;  \n");
 
  206   source.append(
"         if (t0 <= max(( "); source.append(numeric_string); source.append(
" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
 
  207   source.append(
"         {  \n");
 
  209   source.append(
"              "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right);  \n");
 
  212   source.append(
"             s_left[addr] = lambda;  \n");
 
  213   source.append(
"             s_right[addr] = lambda;  \n");
 
  214   source.append(
"         }  \n");
 
  215   source.append(
"         else  \n");
 
  216   source.append(
"         {  \n");
 
  219   source.append(
"             s_left[addr] = left;  \n");
 
  220   source.append(
"             s_right[addr] = right;  \n");
 
  221   source.append(
"         }  \n");
 
  223   source.append(
"     }  \n");
 
  227   template<
typename StringType>
 
  230   source.append(
"     \n");
 
  231   source.append(
"     void  \n");
 
  232   source.append(
"     storeIntervalShort(unsigned int addr,  \n");
 
  233   source.append(
"                   __local "); source.append(numeric_string); source.append(
" * s_left,   \n");
 
  234   source.append(
"                   __local "); source.append(numeric_string); source.append(
" * s_right,  \n");
 
  235   source.append(
"                   __local unsigned short * s_left_count,  \n");
 
  236   source.append(
"                   __local unsigned short * s_right_count,  \n");
 
  237   source.append(
"                    "); source.append(numeric_string); source.append(
" left,   \n");
 
  238   source.append(
"                    "); source.append(numeric_string); source.append(
" right,  \n");
 
  239   source.append(
"                   unsigned int left_count, \n");
 
  240   source.append(
"                   unsigned int right_count,  \n");
 
  241   source.append(
"                    "); source.append(numeric_string); source.append(
"  precision)  \n");
 
  242   source.append(
"     {  \n");
 
  243   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  244   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  245   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  246   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  247   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  249   source.append(
"         s_left_count[addr] = left_count;  \n");
 
  250   source.append(
"         s_right_count[addr] = right_count;  \n");
 
  253   source.append(
"          "); source.append(numeric_string); source.append(
" t0 = fabs(right - left);  \n");
 
  254   source.append(
"          "); source.append(numeric_string); source.append(
" t1 = max(fabs(left), fabs(right)) * precision;  \n");
 
  256   source.append(
"         if (t0 <= max(( "); source.append(numeric_string); source.append(
" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
 
  257   source.append(
"         {  \n");
 
  259   source.append(
"              "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right);  \n");
 
  262   source.append(
"             s_left[addr] = lambda;  \n");
 
  263   source.append(
"             s_right[addr] = lambda;  \n");
 
  264   source.append(
"         }  \n");
 
  265   source.append(
"         else  \n");
 
  266   source.append(
"         {  \n");
 
  269   source.append(
"             s_left[addr] = left;  \n");
 
  270   source.append(
"             s_right[addr] = right;  \n");
 
  271   source.append(
"         }  \n");
 
  273   source.append(
"     }  \n");
 
  294   template <
typename StringType>
 
  297   source.append(
"       \n");
 
  298   source.append(
"     inline unsigned int  \n");
 
  299   source.append(
"     computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(
" *g_d,   \n");
 
  300   source.append(
"                                __global "); source.append(numeric_string); source.append(
" *g_s,   \n");
 
  301   source.append(
"                                const unsigned int n,  \n");
 
  302   source.append(
"                                const "); source.append(numeric_string); source.append(
" x,         \n");
 
  303   source.append(
"                                const unsigned int tid,  \n");
 
  304   source.append(
"                                const unsigned int num_intervals_active,  \n");
 
  305   source.append(
"                                __local "); source.append(numeric_string); source.append(
" *s_d,  \n");
 
  306   source.append(
"                                __local "); source.append(numeric_string); source.append(
" *s_s,  \n");
 
  307   source.append(
"                                unsigned int converged  \n");
 
  308   source.append(
"                               )  \n");
 
  309   source.append(
"     {  \n");
 
  310   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  311   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  312   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  313   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  314   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  317   source.append(
"          "); source.append(numeric_string); source.append(
" delta = 1.0f;  \n");
 
  318   source.append(
"         unsigned int count = 0;  \n");
 
  320   source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  323   source.append(
"         if (lcl_id < n)  \n");
 
  324   source.append(
"         {  \n");
 
  325   source.append(
"             s_d[lcl_id] = *(g_d + lcl_id);  \n");
 
  326   source.append(
"             s_s[lcl_id] = *(g_s + lcl_id - 1);  \n");
 
  327   source.append(
"         }  \n");
 
  329   source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  332   source.append(
"         if ((tid < num_intervals_active) && (0 == converged))  \n");
 
  333   source.append(
"         {  \n");
 
  337   source.append(
"             for (unsigned int k = 0; k < n; ++k)  \n");
 
  338   source.append(
"             {  \n");
 
  339   source.append(
"                 delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;  \n");
 
  340   source.append(
"                 count += (delta < 0) ? 1 : 0;  \n");
 
  341   source.append(
"             }  \n");
 
  343   source.append(
"         } \n"); 
 
  345   source.append(
"         return count;  \n");
 
  346   source.append(
"     }  \n");
 
  366   template <
typename StringType>
 
  369   source.append(
"       \n");
 
  370   source.append(
"     inline unsigned int  \n");
 
  371   source.append(
"     computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(
" *g_d,   \n");
 
  372   source.append(
"                                __global "); source.append(numeric_string); source.append(
" *g_s,   \n");
 
  373   source.append(
"                                const unsigned int n,  \n");
 
  374   source.append(
"                                const "); source.append(numeric_string); source.append(
" x,         \n");
 
  375   source.append(
"                                const unsigned int tid,  \n");
 
  376   source.append(
"                                const unsigned int num_intervals_active,  \n");
 
  377   source.append(
"                                __local "); source.append(numeric_string); source.append(
" *s_d,  \n");
 
  378   source.append(
"                                __local "); source.append(numeric_string); source.append(
" *s_s,  \n");
 
  379   source.append(
"                                unsigned int converged  \n");
 
  380   source.append(
"                               )  \n");
 
  381   source.append(
"     {  \n");
 
  382   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  383   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  384   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  385   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  386   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  388   source.append(
"          "); source.append(numeric_string); source.append(
" delta = 1.0f;  \n");
 
  389   source.append(
"         unsigned int count = 0;  \n");
 
  391   source.append(
"         unsigned int rem = n;  \n");
 
  394   source.append(
"         for (unsigned int i = 0; i < n; i += lcl_sz)  \n");
 
  395   source.append(
"         {  \n");
 
  397   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  400   source.append(
"             if ((i + lcl_id) < n)  \n");
 
  401   source.append(
"             {  \n");
 
  403   source.append(
"                 s_d[lcl_id] = *(g_d + i + lcl_id);  \n");
 
  404   source.append(
"                 s_s[lcl_id] = *(g_s + i + lcl_id - 1);  \n");
 
  405   source.append(
"             }  \n");
 
  407   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  410   source.append(
"             if (tid < num_intervals_active)  \n");
 
  411   source.append(
"             {  \n");
 
  415   source.append(
"                 for (unsigned int k = 0; k < min(rem,lcl_sz); ++k)  \n");
 
  416   source.append(
"                 {  \n");
 
  417   source.append(
"                     delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;  \n");
 
  419   source.append(
"                     count += (delta < 0) ? 1 : 0;  \n");
 
  420   source.append(
"                 }  \n");
 
  422   source.append(
"             } \n"); 
 
  424   source.append(
"             rem -= lcl_sz;  \n");
 
  425   source.append(
"         }  \n");
 
  427   source.append(
"         return count;  \n");
 
  428   source.append(
"     }  \n");
 
  455   template<
typename StringType>
 
  458   source.append(
"       \n");
 
  459   source.append(
"     void  \n");
 
  460   source.append(
"     storeNonEmptyIntervals(unsigned int addr,  \n");
 
  461   source.append(
"                            const unsigned int num_threads_active,  \n");
 
  462   source.append(
"                            __local "); source.append(numeric_string); source.append(
" *s_left,   \n");
 
  463   source.append(
"                            __local "); source.append(numeric_string); source.append(
" *s_right,  \n");
 
  464   source.append(
"                            __local unsigned int *s_left_count,  \n");
 
  465   source.append(
"                            __local unsigned int *s_right_count,  \n");
 
  466   source.append(
"                             "); source.append(numeric_string); source.append(
" left, \n ");
 
  467   source.append(
"                             "); source.append(numeric_string); source.append(
" mid,  \n");
 
  468   source.append(
"                             "); source.append(numeric_string); source.append(
" right,\n");
 
  469   source.append(
"                            const unsigned int left_count,  \n");
 
  470   source.append(
"                            const unsigned int mid_count,  \n");
 
  471   source.append(
"                            const unsigned int right_count,  \n");
 
  472   source.append(
"                             "); source.append(numeric_string); source.append(
" precision,  \n");
 
  473   source.append(
"                            __local unsigned int *compact_second_chunk,  \n");
 
  474   source.append(
"                            __local unsigned int *s_compaction_list_exc,  \n");
 
  475   source.append(
"                            unsigned int *is_active_second)  \n");
 
  476   source.append(
"     {  \n");
 
  477   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  478   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  479   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  480   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  481   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  484   source.append(
"          \n");
 
  485   source.append(
"         if ((left_count != mid_count) && (mid_count != right_count))  \n");
 
  486   source.append(
"         {  \n");
 
  489   source.append(
"             storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
 
  490   source.append(
"                           left, mid, left_count, mid_count, precision);  \n");
 
  494   source.append(
"             *is_active_second = 1;  \n");
 
  495   source.append(
"             s_compaction_list_exc[lcl_id] = 1;  \n");
 
  496   source.append(
"             *compact_second_chunk = 1;  \n");
 
  497   source.append(
"         }  \n");
 
  498   source.append(
"         else  \n");
 
  499   source.append(
"         {  \n");
 
  504   source.append(
"             *is_active_second = 0;  \n");
 
  505   source.append(
"             s_compaction_list_exc[lcl_id] = 0;  \n");
 
  508   source.append(
"             if (left_count != mid_count)  \n");
 
  509   source.append(
"             {  \n");
 
  510   source.append(
"                 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
 
  511   source.append(
"                               left, mid, left_count, mid_count, precision);  \n");
 
  512   source.append(
"             }  \n");
 
  513   source.append(
"             else  \n");
 
  514   source.append(
"             {  \n");
 
  515   source.append(
"                 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
 
  516   source.append(
"                               mid, right, mid_count, right_count, precision);  \n");
 
  517   source.append(
"             }  \n");
 
  519   source.append(
"         }  \n");
 
  520   source.append(
"     }  \n");
 
  530   template <
typename StringType>
 
  533       source.append(
"       \n");
 
  534       source.append(
"     void  \n");
 
  535       source.append(
"     storeNonEmptyIntervalsLarge(unsigned int addr,  \n");
 
  536       source.append(
"                            const unsigned int num_threads_active,  \n");
 
  537       source.append(
"                            __local "); source.append(numeric_string); source.append(
" *s_left,   \n");
 
  538       source.append(
"                            __local "); source.append(numeric_string); source.append(
" *s_right,  \n");
 
  539       source.append(
"                            __local unsigned short *s_left_count,  \n");
 
  540       source.append(
"                            __local unsigned short *s_right_count,  \n");
 
  541       source.append(
"                             "); source.append(numeric_string); source.append(
" left, \n ");
 
  542       source.append(
"                             "); source.append(numeric_string); source.append(
" mid,  \n");
 
  543       source.append(
"                             "); source.append(numeric_string); source.append(
" right,\n");
 
  544       source.append(
"                            const unsigned int left_count,  \n");
 
  545       source.append(
"                            const unsigned int mid_count,  \n");
 
  546       source.append(
"                            const unsigned int right_count,  \n");
 
  547       source.append(
"                             "); source.append(numeric_string); source.append(
" epsilon,  \n");
 
  548       source.append(
"                            __local unsigned int *compact_second_chunk,  \n");
 
  549       source.append(
"                            __local unsigned short *s_compaction_list,  \n");
 
  550       source.append(
"                            unsigned int *is_active_second)  \n");
 
  551       source.append(
"     {  \n");
 
  552       source.append(
"         uint glb_id = get_global_id(0); \n");
 
  553       source.append(
"         uint grp_id = get_group_id(0); \n");
 
  554       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  555       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  556       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  559       source.append(
"         if ((left_count != mid_count) && (mid_count != right_count))  \n");
 
  560       source.append(
"         {  \n");
 
  562       source.append(
"             storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
 
  563       source.append(
"                           left, mid, left_count, mid_count, epsilon);  \n");
 
  565       source.append(
"             *is_active_second = 1;  \n");
 
  566       source.append(
"             s_compaction_list[lcl_id] = 1;  \n");
 
  567       source.append(
"             *compact_second_chunk = 1;  \n");
 
  568       source.append(
"         }  \n");
 
  569       source.append(
"         else  \n");
 
  570       source.append(
"         {  \n");
 
  575       source.append(
"             *is_active_second = 0;  \n");
 
  576       source.append(
"             s_compaction_list[lcl_id] = 0;  \n");
 
  579       source.append(
"             if (left_count != mid_count)  \n");
 
  580       source.append(
"             {  \n");
 
  581       source.append(
"                 storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
 
  582       source.append(
"                               left, mid, left_count, mid_count, epsilon);  \n");
 
  583       source.append(
"             }  \n");
 
  584       source.append(
"             else  \n");
 
  585       source.append(
"             {  \n");
 
  586       source.append(
"                 storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
 
  587       source.append(
"                               mid, right, mid_count, right_count, epsilon);  \n");
 
  588       source.append(
"             }  \n");
 
  589       source.append(
"         }  \n");
 
  590       source.append(
"     }  \n");
 
  603   template<
typename StringType>
 
  606   source.append(
"       \n");
 
  607   source.append(
"     void  \n");
 
  608   source.append(
"     createIndicesCompaction(__local unsigned int *s_compaction_list_exc,  \n");
 
  609   source.append(
"                             unsigned int num_threads_compaction)  \n");
 
  610   source.append(
"     {  \n");
 
  611   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  612   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  613   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  614   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  615   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  618   source.append(
"         unsigned int offset = 1;  \n");
 
  619   source.append(
"         const unsigned int tid = lcl_id;  \n");
 
  624   source.append(
"         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
 
  625   source.append(
"         {  \n");
 
  627   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  629   source.append(
"             if (tid < d)  \n");
 
  630   source.append(
"             {  \n");
 
  632   source.append(
"                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
 
  633   source.append(
"                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
 
  634   source.append(
"              \n");
 
  635   source.append(
"                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
 
  636   source.append(
"                                               + s_compaction_list_exc[ai];  \n");
 
  637   source.append(
"             }  \n");
 
  639   source.append(
"             offset <<= 1;  \n");
 
  640   source.append(
"         }  \n");
 
  643   source.append(
"         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
 
  644   source.append(
"         {  \n");
 
  646   source.append(
"             offset >>= 1;  \n");
 
  647   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  649   source.append(
"             if (tid < (d-1))  \n");
 
  650   source.append(
"             {  \n");
 
  652   source.append(
"                 unsigned int  ai = offset*(tid+1) - 1;  \n");
 
  653   source.append(
"                 unsigned int  bi = ai + (offset >> 1);  \n");
 
  655   source.append(
"                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
 
  656   source.append(
"                                               + s_compaction_list_exc[ai];  \n");
 
  657   source.append(
"             }  \n");
 
  658   source.append(
"         }  \n");
 
  660   source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  662   source.append(
"     }  \n");
 
  666   template<
typename StringType>
 
  669   source.append(
"       \n");
 
  670   source.append(
"     void  \n");
 
  671   source.append(
"     createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc,  \n");
 
  672   source.append(
"                             unsigned int num_threads_compaction)  \n");
 
  673   source.append(
"     {  \n");
 
  674   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  675   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  676   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  677   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  678   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  681   source.append(
"         unsigned int offset = 1;  \n");
 
  682   source.append(
"         const unsigned int tid = lcl_id;  \n");
 
  685   source.append(
"         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
 
  686   source.append(
"         {  \n");
 
  688   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  690   source.append(
"             if (tid < d)  \n");
 
  691   source.append(
"             {  \n");
 
  693   source.append(
"                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
 
  694   source.append(
"                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
 
  695   source.append(
"              \n");
 
  696   source.append(
"                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
 
  697   source.append(
"                                               + s_compaction_list_exc[ai];  \n");
 
  698   source.append(
"             }  \n");
 
  700   source.append(
"             offset <<= 1;  \n");
 
  701   source.append(
"         }  \n");
 
  704   source.append(
"         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
 
  705   source.append(
"         {  \n");
 
  707   source.append(
"             offset >>= 1;  \n");
 
  708   source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  710   source.append(
"             if (tid < (d-1))  \n");
 
  711   source.append(
"             {  \n");
 
  713   source.append(
"                 unsigned int  ai = offset*(tid+1) - 1;  \n");
 
  714   source.append(
"                 unsigned int  bi = ai + (offset >> 1);  \n");
 
  716   source.append(
"                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
 
  717   source.append(
"                                               + s_compaction_list_exc[ai];  \n");
 
  718   source.append(
"             }  \n");
 
  719   source.append(
"         }  \n");
 
  721   source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
  723   source.append(
"     }  \n");
 
  742   template<
typename StringType>
 
  745   source.append(
"       \n");
 
  746   source.append(
"     void  \n");
 
  747   source.append(
"     compactIntervals(__local "); source.append(numeric_string); source.append(
" *s_left,  \n");
 
  748   source.append(
"                      __local "); source.append(numeric_string); source.append(
" *s_right, \n");
 
  749   source.append(
"                      __local unsigned int *s_left_count, \n");
 
  750   source.append(
"                      __local unsigned int *s_right_count,  \n");
 
  751   source.append(
"                       "); source.append(numeric_string); source.append(
" mid,  \n");
 
  752   source.append(
"                       "); source.append(numeric_string); source.append(
" right, \n");
 
  753   source.append(
"                      unsigned int mid_count, unsigned int right_count,  \n");
 
  754   source.append(
"                      __local unsigned int *s_compaction_list,  \n");
 
  755   source.append(
"                      unsigned int num_threads_active,  \n");
 
  756   source.append(
"                      unsigned int is_active_second)  \n");
 
  757   source.append(
"     {  \n");
 
  758   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  759   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  760   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  761   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  762   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  764   source.append(
"         const unsigned int tid = lcl_id;  \n");
 
  768   source.append(
"         if ((tid < num_threads_active) && (1 == is_active_second))  \n");
 
  769   source.append(
"         {  \n");
 
  770   source.append(
"             unsigned int addr_w = num_threads_active + s_compaction_list[tid];  \n");
 
  771   source.append(
"             s_left[addr_w] = mid;  \n");
 
  772   source.append(
"             s_right[addr_w] = right;  \n");
 
  773   source.append(
"             s_left_count[addr_w] = mid_count;  \n");
 
  774   source.append(
"             s_right_count[addr_w] = right_count;  \n");
 
  775   source.append(
"         }  \n");
 
  776   source.append(
"     }  \n");
 
  782   template<
typename StringType>
 
  785   source.append(
"       \n");
 
  786   source.append(
"     void  \n");
 
  787   source.append(
"     compactIntervalsShort(__local "); source.append(numeric_string); source.append(
" *s_left,  \n");
 
  788   source.append(
"                      __local "); source.append(numeric_string); source.append(
" *s_right,  \n");
 
  789   source.append(
"                      __local unsigned short *s_left_count, \n");
 
  790   source.append(
"                      __local unsigned short *s_right_count,  \n");
 
  791   source.append(
"                      "); source.append(numeric_string); source.append(
" mid,   \n");
 
  792   source.append(
"                      "); source.append(numeric_string); source.append(
" right, \n");
 
  793   source.append(
"                      unsigned int mid_count, unsigned int right_count,  \n");
 
  794   source.append(
"                      __local unsigned short *s_compaction_list,  \n");
 
  795   source.append(
"                      unsigned int num_threads_active,  \n");
 
  796   source.append(
"                      unsigned int is_active_second)  \n");
 
  797   source.append(
"     {  \n");
 
  798   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  799   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  800   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  801   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  802   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  804   source.append(
"         const unsigned int tid = lcl_id;  \n");
 
  808   source.append(
"         if ((tid < num_threads_active) && (1 == is_active_second))  \n");
 
  809   source.append(
"         {  \n");
 
  810   source.append(
"             unsigned int addr_w = num_threads_active + s_compaction_list[tid];  \n");
 
  811   source.append(
"             s_left[addr_w] = mid;  \n");
 
  812   source.append(
"             s_right[addr_w] = right;  \n");
 
  813   source.append(
"             s_left_count[addr_w] = mid_count;  \n");
 
  814   source.append(
"             s_right_count[addr_w] = right_count;  \n");
 
  815   source.append(
"         }  \n");
 
  816   source.append(
"     }  \n");
 
  821   template<
typename StringType>
 
  824   source.append(
"       \n");
 
  825   source.append(
"     void  \n");
 
  826   source.append(
"     storeIntervalConverged( __local "); source.append(numeric_string); source.append(
" *s_left,   \n");
 
  827   source.append(
"                             __local "); source.append(numeric_string); source.append(
" *s_right,   \n");
 
  828   source.append(
"                            __local unsigned int *s_left_count, \n");
 
  829   source.append(
"                            __local unsigned int *s_right_count,  \n");
 
  830   source.append(
"                            "); source.append(numeric_string); source.append(
" *left,   \n");
 
  831   source.append(
"                            "); source.append(numeric_string); source.append(
" *mid,   \n");
 
  832   source.append(
"                            "); source.append(numeric_string); source.append(
" *right,   \n");
 
  833   source.append(
"                            unsigned int *left_count,     \n");
 
  834   source.append(
"                            unsigned int *mid_count,      \n");
 
  835   source.append(
"                            unsigned int *right_count,     \n");
 
  836   source.append(
"                            __local unsigned int *s_compaction_list_exc,  \n");
 
  837   source.append(
"                            __local unsigned int *compact_second_chunk,  \n");
 
  838   source.append(
"                            const unsigned int num_threads_active,  \n");
 
  839   source.append(
"                            unsigned int *is_active_second)  \n");
 
  840   source.append(
"     {  \n");
 
  841   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  842   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  843   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  844   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  845   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  847   source.append(
"         const unsigned int tid = lcl_id;  \n");
 
  848   source.append(
"         const unsigned int multiplicity = *right_count - *left_count;  \n");
 
  850   source.append(
"         if (1 == multiplicity)  \n");
 
  851   source.append(
"         {  \n");
 
  854   source.append(
"             s_left[tid] = *left;  \n");
 
  855   source.append(
"             s_right[tid] = *right;  \n");
 
  856   source.append(
"             s_left_count[tid] = *left_count;  \n");
 
  857   source.append(
"             s_right_count[tid] = *right_count;  \n");
 
  858   source.append(
"             \n");
 
  861   source.append(
"             *is_active_second = 0;  \n");
 
  862   source.append(
"             s_compaction_list_exc[tid] = 0;  \n");
 
  863   source.append(
"         }  \n");
 
  864   source.append(
"         else  \n");
 
  865   source.append(
"         {  \n");
 
  868   source.append(
"             *mid_count = *left_count + (multiplicity >> 1);  \n");
 
  871   source.append(
"             s_left[tid] = *left;  \n");
 
  872   source.append(
"             s_right[tid] = *right;  \n");
 
  873   source.append(
"             s_left_count[tid] = *left_count;  \n");
 
  874   source.append(
"             s_right_count[tid] = *mid_count;  \n");
 
  875   source.append(
"             *mid = *left;  \n");
 
  878   source.append(
"             *is_active_second = 1;  \n");
 
  879   source.append(
"             s_compaction_list_exc[tid] = 1;  \n");
 
  880   source.append(
"             *compact_second_chunk = 1;  \n");
 
  881   source.append(
"         }  \n");
 
  882   source.append(
"     }  \n");
 
  889   template<
typename StringType>
 
  892   source.append(
"       \n");
 
  893   source.append(
"     void  \n");
 
  894   source.append(
"     storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(
" *s_left,   \n");
 
  895   source.append(
"                             __local "); source.append(numeric_string); source.append(
" *s_right,   \n");
 
  896   source.append(
"                            __local unsigned short *s_left_count, \n");
 
  897   source.append(
"                            __local unsigned short *s_right_count,  \n");
 
  898   source.append(
"                            "); source.append(numeric_string); source.append(
" *left,   \n");
 
  899   source.append(
"                            "); source.append(numeric_string); source.append(
" *mid,   \n");
 
  900   source.append(
"                            "); source.append(numeric_string); source.append(
" *right,   \n");
 
  901   source.append(
"                            unsigned int *left_count,     \n");
 
  902   source.append(
"                            unsigned int *mid_count,      \n");
 
  903   source.append(
"                            unsigned int *right_count,     \n");
 
  904   source.append(
"                            __local unsigned short *s_compaction_list_exc,  \n");
 
  905   source.append(
"                            __local unsigned int *compact_second_chunk,  \n");
 
  906   source.append(
"                            const unsigned int num_threads_active,  \n");
 
  907   source.append(
"                            unsigned int *is_active_second)  \n");
 
  908   source.append(
"     {  \n");
 
  909   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  910   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  911   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  912   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  913   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  915   source.append(
"         const unsigned int tid = lcl_id;  \n");
 
  916   source.append(
"         const unsigned int multiplicity = *right_count - *left_count;  \n");
 
  918   source.append(
"         if (1 == multiplicity)  \n");
 
  919   source.append(
"         {  \n");
 
  922   source.append(
"             s_left[tid] = *left;  \n");
 
  923   source.append(
"             s_right[tid] = *right;  \n");
 
  924   source.append(
"             s_left_count[tid] = *left_count;  \n");
 
  925   source.append(
"             s_right_count[tid] = *right_count;  \n");
 
  926   source.append(
"             \n");
 
  929   source.append(
"             *is_active_second = 0;  \n");
 
  930   source.append(
"             s_compaction_list_exc[tid] = 0;  \n");
 
  931   source.append(
"         }  \n");
 
  932   source.append(
"         else  \n");
 
  933   source.append(
"         {  \n");
 
  936   source.append(
"             *mid_count = *left_count + (multiplicity >> 1);  \n");
 
  939   source.append(
"             s_left[tid] = *left;  \n");
 
  940   source.append(
"             s_right[tid] = *right;  \n");
 
  941   source.append(
"             s_left_count[tid] = *left_count;  \n");
 
  942   source.append(
"             s_right_count[tid] = *mid_count;  \n");
 
  943   source.append(
"             *mid = *left;  \n");
 
  946   source.append(
"             *is_active_second = 1;  \n");
 
  947   source.append(
"             s_compaction_list_exc[tid] = 1;  \n");
 
  948   source.append(
"             *compact_second_chunk = 1;  \n");
 
  949   source.append(
"         }  \n");
 
  950   source.append(
"     }  \n");
 
  970   template<
typename StringType>
 
  973   source.append(
"       \n");
 
  974   source.append(
"     void  \n");
 
  975   source.append(
"     subdivideActiveIntervalMulti(const unsigned int tid,  \n");
 
  976   source.append(
"                             __local "); source.append(numeric_string); source.append(
" *s_left,    \n");
 
  977   source.append(
"                             __local "); source.append(numeric_string); source.append(
" *s_right,   \n");
 
  978   source.append(
"                             __local unsigned int *s_left_count,   \n");
 
  979   source.append(
"                             __local unsigned int *s_right_count,  \n");
 
  980   source.append(
"                             const unsigned int num_threads_active,  \n");
 
  981   source.append(
"                              "); source.append(numeric_string); source.append(
" *left,   \n");
 
  982   source.append(
"                              "); source.append(numeric_string); source.append(
" *right,   \n");
 
  983   source.append(
"                             unsigned int *left_count, unsigned int *right_count,  \n");
 
  984   source.append(
"                              "); source.append(numeric_string); source.append(
" *mid,    \n");
 
  985   source.append(
"                              __local unsigned int *all_threads_converged)  \n");
 
  986   source.append(
"     {  \n");
 
  987   source.append(
"         uint glb_id = get_global_id(0); \n");
 
  988   source.append(
"         uint grp_id = get_group_id(0); \n");
 
  989   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
  990   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
  991   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
  994   source.append(
"         if (tid < num_threads_active)  \n");
 
  995   source.append(
"         {  \n");
 
  997   source.append(
"             *left = s_left[tid];  \n");
 
  998   source.append(
"             *right = s_right[tid];  \n");
 
  999   source.append(
"             *left_count = s_left_count[tid];  \n");
 
 1000   source.append(
"             *right_count = s_right_count[tid];  \n");
 
 1003   source.append(
"             if (*left != *right)  \n");
 
 1004   source.append(
"             {  \n");
 
 1006   source.append(
"                 *mid = computeMidpoint(*left, *right);  \n");
 
 1007   source.append(
"                 *all_threads_converged = 0;  \n");
 
 1008   source.append(
"             }  \n");
 
 1009   source.append(
"             else if ((*right_count - *left_count) > 1)  \n");
 
 1010   source.append(
"             {  \n");
 
 1013   source.append(
"                 *all_threads_converged = 0;  \n");
 
 1014   source.append(
"             }  \n");
 
 1016   source.append(
"         }    \n");
 
 1018   source.append(
"     }  \n");
 
 1022   template<
typename StringType>
 
 1025   source.append(
"       \n");
 
 1026   source.append(
"     void  \n");
 
 1027   source.append(
"     subdivideActiveIntervalShort(const unsigned int tid,  \n");
 
 1028   source.append(
"                             __local "); source.append(numeric_string); source.append(
" *s_left,    \n");
 
 1029   source.append(
"                             __local "); source.append(numeric_string); source.append(
" *s_right,   \n");
 
 1030   source.append(
"                             __local unsigned short *s_left_count,   \n");
 
 1031   source.append(
"                             __local unsigned short *s_right_count,  \n");
 
 1032   source.append(
"                             const unsigned int num_threads_active,  \n");
 
 1033   source.append(
"                             "); source.append(numeric_string); source.append(
" *left,   \n");
 
 1034   source.append(
"                             "); source.append(numeric_string); source.append(
" *right,   \n");
 
 1035   source.append(
"                             unsigned int *left_count, unsigned int *right_count,  \n");
 
 1036   source.append(
"                             "); source.append(numeric_string); source.append(
" *mid,    \n");
 
 1037   source.append(
"                             __local unsigned int *all_threads_converged)  \n");
 
 1038   source.append(
"     {  \n");
 
 1039   source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1040   source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1041   source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1042   source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1043   source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1046   source.append(
"         if (tid < num_threads_active)  \n");
 
 1047   source.append(
"         {  \n");
 
 1049   source.append(
"             *left = s_left[tid];  \n");
 
 1050   source.append(
"             *right = s_right[tid];  \n");
 
 1051   source.append(
"             *left_count = s_left_count[tid];  \n");
 
 1052   source.append(
"             *right_count = s_right_count[tid];  \n");
 
 1055   source.append(
"             if (*left != *right)  \n");
 
 1056   source.append(
"             {  \n");
 
 1058   source.append(
"                 *mid = computeMidpoint(*left, *right);  \n");
 
 1059   source.append(
"                 *all_threads_converged = 0;  \n");
 
 1060   source.append(
"             }  \n");
 
 1062   source.append(
"         }    \n");
 
 1064   source.append(
"     }  \n");
 
 1084   template <
typename StringType>
 
 1087       source.append(
"     __kernel  \n");
 
 1088       source.append(
"     void  \n");
 
 1089       source.append(
"     bisectKernelSmall(__global "); source.append(numeric_string); source.append(
" *g_d,   \n");
 
 1090       source.append(
"                  __global "); source.append(numeric_string); source.append(
" *g_s,   \n");
 
 1091       source.append(
"                  const unsigned int n,  \n");
 
 1092       source.append(
"                  __global "); source.append(numeric_string); source.append(
" *g_left,   \n");
 
 1093       source.append(
"                  __global "); source.append(numeric_string); source.append(
" *g_right,  \n");
 
 1094       source.append(
"                  __global unsigned int *g_left_count, __global unsigned int *g_right_count,  \n");
 
 1095       source.append(
"                  const "); source.append(numeric_string); source.append(
" lg,  \n");
 
 1096       source.append(
"                  const "); source.append(numeric_string); source.append(
" ug,  \n");
 
 1097       source.append(
"                  const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
 
 1098       source.append(
"                  "); source.append(numeric_string); source.append(
" epsilon  \n");
 
 1099       source.append(
"                 )  \n");
 
 1100       source.append(
"     {  \n");
 
 1101       source.append(
"         g_s = g_s + 1; \n");
 
 1102       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1103       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1104       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1105       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1106       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1110       source.append(
"         __local "); source.append(numeric_string); source.append(
" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
 
 1111       source.append(
"         __local "); source.append(numeric_string); source.append(
" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
 
 1115       source.append(
"         __local  unsigned int  s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
 
 1116       source.append(
"         __local  unsigned int  s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
 
 1119       source.append(
"         __local  unsigned int  \n");
 
 1120       source.append(
"           s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];  \n");
 
 1125       source.append(
"         __local  unsigned int compact_second_chunk;  \n");
 
 1126       source.append(
"         __local  unsigned int all_threads_converged;  \n");
 
 1129       source.append(
"         __local  unsigned int num_threads_active;  \n");
 
 1132       source.append(
"         __local  unsigned int num_threads_compaction;  \n");
 
 1135       source.append(
"         __local unsigned int *s_compaction_list_exc = s_compaction_list + 1;  \n");
 
 1140       source.append(
"          "); source.append(numeric_string); source.append(
" left = 0.0f;  \n");
 
 1141       source.append(
"          "); source.append(numeric_string); source.append(
" right = 0.0f;  \n");
 
 1142       source.append(
"         unsigned int left_count = 0;  \n");
 
 1143       source.append(
"         unsigned int right_count = 0;  \n");
 
 1145       source.append(
"          "); source.append(numeric_string); source.append(
" mid = 0.0f;  \n");
 
 1147       source.append(
"         unsigned int mid_count = 0;  \n");
 
 1149       source.append(
"         unsigned int  is_active_second = 0;  \n");
 
 1151       source.append(
"         s_compaction_list[lcl_id] = 0;  \n");
 
 1152       source.append(
"         s_left[lcl_id] = 0.0;  \n");
 
 1153       source.append(
"         s_right[lcl_id] = 0.0;  \n");
 
 1154       source.append(
"         s_left_count[lcl_id] = 0;  \n");
 
 1155       source.append(
"         s_right_count[lcl_id] = 0;  \n");
 
 1157       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1160       source.append(
"         if (0 == lcl_id)  \n");
 
 1161       source.append(
"         {  \n");
 
 1162       source.append(
"             s_left[0] = lg;  \n");
 
 1163       source.append(
"             s_right[0] = ug;  \n");
 
 1164       source.append(
"             s_left_count[0] = lg_eig_count;  \n");
 
 1165       source.append(
"             s_right_count[0] = ug_eig_count;  \n");
 
 1167       source.append(
"             compact_second_chunk = 0;  \n");
 
 1168       source.append(
"             num_threads_active = 1;  \n");
 
 1170       source.append(
"             num_threads_compaction = 1;  \n");
 
 1171       source.append(
"         }  \n");
 
 1176       source.append(
"         while (true)  \n");
 
 1177       source.append(
"         {  \n");
 
 1179       source.append(
"             all_threads_converged = 1;  \n");
 
 1180       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1182       source.append(
"             is_active_second = 0;  \n");
 
 1183       source.append(
"             subdivideActiveIntervalMulti(lcl_id,  \n");
 
 1184       source.append(
"                                     s_left, s_right, s_left_count, s_right_count,  \n");
 
 1185       source.append(
"                                     num_threads_active,  \n");
 
 1186       source.append(
"                                     &left, &right, &left_count, &right_count,  \n");
 
 1187       source.append(
"                                     &mid, &all_threads_converged);  \n");
 
 1189       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1192       source.append(
"             if (1 == all_threads_converged)  \n");
 
 1193       source.append(
"             {  \n");
 
 1194       source.append(
"                 break;  \n");
 
 1195       source.append(
"             }  \n");
 
 1197       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1204       source.append(
"             mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,  \n");
 
 1205       source.append(
"                                                    lcl_id, num_threads_active,  \n");
 
 1206       source.append(
"                                                    s_left, s_right,  \n");
 
 1207       source.append(
"                                                    (left == right));  \n");
 
 1209       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1221       source.append(
"             if (lcl_id < num_threads_active)  \n");
 
 1222       source.append(
"             {  \n");
 
 1224       source.append(
"                 if (left != right)  \n");
 
 1225       source.append(
"                 {  \n");
 
 1228       source.append(
"                     storeNonEmptyIntervals(lcl_id, num_threads_active,  \n");
 
 1229       source.append(
"                                            s_left, s_right, s_left_count, s_right_count,  \n");
 
 1230       source.append(
"                                            left, mid, right,  \n");
 
 1231       source.append(
"                                            left_count, mid_count, right_count,  \n");
 
 1232       source.append(
"                                            epsilon, &compact_second_chunk,  \n");
 
 1233       source.append(
"                                            s_compaction_list_exc,  \n");
 
 1234       source.append(
"                                            &is_active_second);  \n");
 
 1235       source.append(
"                 }  \n");
 
 1236       source.append(
"                 else  \n");
 
 1237       source.append(
"                 {  \n");
 
 1239       source.append(
"                     storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,  \n");
 
 1240       source.append(
"                                            &left, &mid, &right,  \n");
 
 1241       source.append(
"                                            &left_count, &mid_count, &right_count,  \n");
 
 1242       source.append(
"                                            s_compaction_list_exc, &compact_second_chunk,  \n");
 
 1243       source.append(
"                                            num_threads_active,  \n");
 
 1244       source.append(
"                                            &is_active_second);  \n");
 
 1245       source.append(
"                 }  \n");
 
 1246       source.append(
"             }  \n");
 
 1249       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1254       source.append(
"             if (compact_second_chunk > 0)  \n");
 
 1255       source.append(
"             {  \n");
 
 1257       source.append(
"                 createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
 
 1259       source.append(
"                 compactIntervals(s_left, s_right, s_left_count, s_right_count,  \n");
 
 1260       source.append(
"                                  mid, right, mid_count, right_count,  \n");
 
 1261       source.append(
"                                  s_compaction_list, num_threads_active,  \n");
 
 1262       source.append(
"                                  is_active_second);  \n");
 
 1263       source.append(
"             }  \n");
 
 1265       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1267       source.append(
"             if (0 == lcl_id)  \n");
 
 1268       source.append(
"             {  \n");
 
 1271       source.append(
"                 num_threads_active += s_compaction_list[num_threads_active];  \n");
 
 1273       source.append(
"                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
 
 1275       source.append(
"                 compact_second_chunk = 0;  \n");
 
 1276       source.append(
"             }  \n");
 
 1278       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1280       source.append(
"         }  \n");
 
 1282       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1289       source.append(
"         if (lcl_id < n)  \n");
 
 1290       source.append(
"         {  \n");
 
 1292       source.append(
"             g_left[lcl_id]  = s_left[lcl_id];  \n");
 
 1294       source.append(
"             g_left_count[lcl_id]  = s_left_count[lcl_id];  \n");
 
 1295       source.append(
"         }  \n");
 
 1296       source.append(
"     }  \n");
 
 1316   template <
typename StringType>
 
 1319       source.append(
"     __kernel  \n");
 
 1320       source.append(
"     void  \n");
 
 1321       source.append(
"     bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(
" *g_d,   \n");
 
 1322       source.append(
"                                     __global "); source.append(numeric_string); source.append(
" *g_s,   \n");
 
 1323       source.append(
"                                     const unsigned int n,  \n");
 
 1324       source.append(
"                                     __global unsigned int *blocks_mult,  \n");
 
 1325       source.append(
"                                     __global unsigned int *blocks_mult_sum,  \n");
 
 1326       source.append(
"                                     __global "); source.append(numeric_string); source.append(
" *g_left,   \n");
 
 1327       source.append(
"                                     __global "); source.append(numeric_string); source.append(
" *g_right,  \n");
 
 1328       source.append(
"                                     __global unsigned int *g_left_count,  \n");
 
 1329       source.append(
"                                     __global unsigned int *g_right_count,  \n");
 
 1330       source.append(
"                                     __global  "); source.append(numeric_string); source.append(
" *g_lambda, \n");
 
 1331       source.append(
"                                     __global unsigned int *g_pos,  \n");
 
 1332       source.append(
"                                     "); source.append(numeric_string); source.append(
" precision  \n");
 
 1333       source.append(
"                                    )  \n");
 
 1334       source.append(
"     {  \n");
 
 1335       source.append(
"         g_s = g_s + 1; \n");
 
 1336       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1337       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1338       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1339       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1340       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1342       source.append(
"       const unsigned int tid = lcl_id;  \n");
 
 1345       source.append(
"         __local "); source.append(numeric_string); source.append(
" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
 
 1346       source.append(
"         __local "); source.append(numeric_string); source.append(
" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
 
 1349       source.append(
"         __local  unsigned int  s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
 
 1350       source.append(
"         __local  unsigned int  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
 
 1353       source.append(
"         __local  unsigned int  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 1355       source.append(
"         __local unsigned int *s_compaction_list_exc = s_compaction_list + 1;  \n");
 
 1358       source.append(
"         __local  unsigned int  all_threads_converged;  \n");
 
 1360       source.append(
"         __local  unsigned int  num_threads_active;  \n");
 
 1362       source.append(
"         __local  unsigned int  num_threads_compaction;  \n");
 
 1364       source.append(
"         __local  unsigned int compact_second_chunk;  \n");
 
 1367       source.append(
"         __local  unsigned int  c_block_start;  \n");
 
 1368       source.append(
"         __local  unsigned int  c_block_end;  \n");
 
 1369       source.append(
"         __local  unsigned int  c_block_offset_output;  \n");
 
 1372       source.append(
"         "); source.append(numeric_string); source.append(
" mid = 0.0f;  \n");
 
 1374       source.append(
"         unsigned int  mid_count = 0;  \n");
 
 1376       source.append(
"         "); source.append(numeric_string); source.append(
" left = 0.0f;  \n");
 
 1377       source.append(
"         "); source.append(numeric_string); source.append(
" right = 0.0f;  \n");
 
 1378       source.append(
"         unsigned int  left_count = 0;  \n");
 
 1379       source.append(
"         unsigned int  right_count = 0;  \n");
 
 1381       source.append(
"         unsigned int  is_active_second = 0;  \n");
 
 1383       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE);            \n");
 
 1386       source.append(
"         if (0 == tid)  \n");
 
 1387       source.append(
"         {  \n");
 
 1389       source.append(
"             c_block_start = blocks_mult[grp_id];  \n");
 
 1390       source.append(
"             c_block_end = blocks_mult[grp_id + 1];  \n");
 
 1391       source.append(
"             c_block_offset_output = blocks_mult_sum[grp_id];  \n");
 
 1392       source.append(
"               \n");
 
 1394       source.append(
"             num_threads_active = c_block_end - c_block_start;  \n");
 
 1395       source.append(
"             s_compaction_list[0] = 0;  \n");
 
 1396       source.append(
"             num_threads_compaction = ceilPow2(num_threads_active);  \n");
 
 1398       source.append(
"             all_threads_converged = 1;  \n");
 
 1399       source.append(
"             compact_second_chunk = 0;  \n");
 
 1400       source.append(
"         }  \n");
 
 1401       source.append(
"          s_left_count [tid] = 42;  \n");
 
 1402       source.append(
"          s_right_count[tid] = 42;  \n");
 
 1403       source.append(
"          s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
 
 1404       source.append(
"          s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
 
 1405       source.append(
"           \n");
 
 1406       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1407       source.append(
"           \n");
 
 1410       source.append(
"         if (tid < num_threads_active)  \n");
 
 1411       source.append(
"         {  \n");
 
 1413       source.append(
"             s_left[tid]  = g_left[c_block_start + tid];  \n");
 
 1414       source.append(
"             s_right[tid] = g_right[c_block_start + tid];  \n");
 
 1415       source.append(
"             s_left_count[tid]  = g_left_count[c_block_start + tid];  \n");
 
 1416       source.append(
"             s_right_count[tid] = g_right_count[c_block_start + tid];  \n");
 
 1417       source.append(
"               \n");
 
 1418       source.append(
"         }  \n");
 
 1419       source.append(
"        \n");
 
 1420       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1421       source.append(
"         unsigned int iter = 0;  \n");
 
 1423       source.append(
"         while (true)  \n");
 
 1424       source.append(
"         {  \n");
 
 1425       source.append(
"             iter++;  \n");
 
 1427       source.append(
"             s_compaction_list[lcl_id] = 0;  \n");
 
 1428       source.append(
"             s_compaction_list[lcl_id + lcl_sz] = 0;  \n");
 
 1429       source.append(
"             s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
 
 1432       source.append(
"             subdivideActiveIntervalMulti(tid, s_left, s_right,  \n");
 
 1433       source.append(
"                                     s_left_count, s_right_count,  \n");
 
 1434       source.append(
"                                     num_threads_active,  \n");
 
 1435       source.append(
"                                     &left, &right, &left_count, &right_count,  \n");
 
 1436       source.append(
"                                     &mid, &all_threads_converged);  \n");
 
 1437       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1440       source.append(
"             if (1 == all_threads_converged)  \n");
 
 1441       source.append(
"             {  \n");
 
 1442       source.append(
"                  \n");
 
 1443       source.append(
"                 break;  \n");
 
 1444       source.append(
"             }  \n");
 
 1450       source.append(
"             mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,  \n");
 
 1451       source.append(
"                                                         mid, tid, num_threads_active,  \n");
 
 1452       source.append(
"                                                         s_left, s_right,  \n");
 
 1453       source.append(
"                                                         (left == right));  \n");
 
 1454       source.append(
"                                                \n");
 
 1455       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1457       source.append(
"             if (tid < num_threads_active)  \n");
 
 1458       source.append(
"             {  \n");
 
 1459       source.append(
"                   \n");
 
 1461       source.append(
"                 if (left != right)  \n");
 
 1462       source.append(
"                 {  \n");
 
 1464       source.append(
"                     storeNonEmptyIntervals(tid, num_threads_active,  \n");
 
 1465       source.append(
"                                            s_left, s_right, s_left_count, s_right_count,  \n");
 
 1466       source.append(
"                                            left, mid, right,  \n");
 
 1467       source.append(
"                                            left_count, mid_count, right_count,  \n");
 
 1468       source.append(
"                                            precision, &compact_second_chunk,  \n");
 
 1469       source.append(
"                                            s_compaction_list_exc,  \n");
 
 1470       source.append(
"                                            &is_active_second);  \n");
 
 1471       source.append(
"                      \n");
 
 1472       source.append(
"                 }  \n");
 
 1473       source.append(
"                 else  \n");
 
 1474       source.append(
"                 {  \n");
 
 1476       source.append(
"                     storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,  \n");
 
 1477       source.append(
"                                            &left, &mid, &right,  \n");
 
 1478       source.append(
"                                            &left_count, &mid_count, &right_count,  \n");
 
 1479       source.append(
"                                            s_compaction_list_exc, &compact_second_chunk,  \n");
 
 1480       source.append(
"                                            num_threads_active,  \n");
 
 1481       source.append(
"                                            &is_active_second);  \n");
 
 1482       source.append(
"                   \n");
 
 1483       source.append(
"                 }  \n");
 
 1484       source.append(
"             }  \n");
 
 1486       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1490       source.append(
"             if (1 == compact_second_chunk)  \n");
 
 1491       source.append(
"             {  \n");
 
 1493       source.append(
"                 createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
 
 1494       source.append(
"                 compactIntervals(s_left, s_right, s_left_count, s_right_count,  \n");
 
 1495       source.append(
"                                  mid, right, mid_count, right_count,  \n");
 
 1496       source.append(
"                                  s_compaction_list, num_threads_active,  \n");
 
 1497       source.append(
"                                  is_active_second);  \n");
 
 1498       source.append(
"             }  \n");
 
 1500       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1503       source.append(
"             if (0 == tid)  \n");
 
 1504       source.append(
"             {  \n");
 
 1505       source.append(
"                 num_threads_active += s_compaction_list[num_threads_active];  \n");
 
 1506       source.append(
"                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
 
 1508       source.append(
"                 compact_second_chunk = 0;  \n");
 
 1509       source.append(
"                 all_threads_converged = 1;  \n");
 
 1510       source.append(
"             }  \n");
 
 1512       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1515       source.append(
"             s_compaction_list_exc[lcl_id] = 0;  \n");
 
 1516       source.append(
"             s_compaction_list_exc[lcl_id + lcl_sz] = 0;   \n");
 
 1517       source.append(
"               \n");
 
 1518       source.append(
"             if (num_threads_compaction > lcl_sz)              \n");
 
 1519       source.append(
"             {  \n");
 
 1520       source.append(
"               break;  \n");
 
 1521       source.append(
"             }  \n");
 
 1524       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1526       source.append(
"    } \n"); 
 
 1529       source.append(
"         if (tid < num_threads_active)  \n");
 
 1530       source.append(
"         {  \n");
 
 1532       source.append(
"             unsigned int addr = c_block_offset_output + tid;  \n");
 
 1533       source.append(
"               \n");
 
 1534       source.append(
"             g_lambda[addr]  = s_left[tid];  \n");
 
 1535       source.append(
"             g_pos[addr]   = s_right_count[tid];  \n");
 
 1536       source.append(
"         }  \n");
 
 1537       source.append(
"     }  \n");
 
 1554   template <
typename StringType>
 
 1557       source.append(
"     __kernel  \n");
 
 1558       source.append(
"     void  \n");
 
 1559       source.append(
"     bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(
" *g_d,   \n");
 
 1560       source.append(
"                                    __global "); source.append(numeric_string); source.append(
" *g_s,    \n");
 
 1561       source.append(
"                                    const unsigned int n,  \n");
 
 1562       source.append(
"                                    unsigned int num_intervals,  \n");
 
 1563       source.append(
"                                    __global "); source.append(numeric_string); source.append(
" *g_left,  \n");
 
 1564       source.append(
"                                    __global "); source.append(numeric_string); source.append(
" *g_right,  \n");
 
 1565       source.append(
"                                    __global unsigned int *g_pos,  \n");
 
 1566       source.append(
"                                    "); source.append(numeric_string); source.append(
" precision)  \n");
 
 1567       source.append(
"     {  \n");
 
 1568       source.append(
"         g_s = g_s + 1; \n");
 
 1569       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1570       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1571       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1572       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1573       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1574       source.append(
"         const unsigned int gtid = (lcl_sz * grp_id) + lcl_id;  \n");
 
 1575       source.append(
"         __local "); source.append(numeric_string); source.append(
" s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
 
 1576       source.append(
"         __local "); source.append(numeric_string); source.append(
" s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
 
 1579       source.append(
"          "); source.append(numeric_string); source.append(
" left, right;  \n");
 
 1582       source.append(
"         unsigned int right_count;  \n");
 
 1584       source.append(
"         unsigned int converged = 0;  \n");
 
 1586       source.append(
"         "); source.append(numeric_string); source.append(
" mid = 0.0f;  \n");
 
 1588       source.append(
"         unsigned int mid_count = 0;  \n");
 
 1591       source.append(
"         if (gtid < num_intervals)  \n");
 
 1592       source.append(
"         {  \n");
 
 1593       source.append(
"             left = g_left[gtid];  \n");
 
 1594       source.append(
"             right = g_right[gtid];  \n");
 
 1595       source.append(
"             right_count = g_pos[gtid];  \n");
 
 1596       source.append(
"         }  \n");
 
 1598       source.append(
"         __local  unsigned int  converged_all_threads;  \n");
 
 1600       source.append(
"         if (0 == lcl_id)  \n");
 
 1601       source.append(
"         {  \n");
 
 1602       source.append(
"             converged_all_threads = 0;  \n");
 
 1603       source.append(
"         }  \n");
 
 1604       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1606       source.append(
"         while (true)  \n");
 
 1607       source.append(
"         {  \n");
 
 1608       source.append(
"             converged_all_threads = 1;  \n");
 
 1610       source.append(
"             if ((gtid < num_intervals) && (0 == converged))  \n");
 
 1611       source.append(
"             {  \n");
 
 1612       source.append(
"                 mid = computeMidpoint(left, right);  \n");
 
 1613       source.append(
"             }  \n");
 
 1615       source.append(
"             mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,  \n");
 
 1616       source.append(
"                                                         mid, gtid, num_intervals,  \n");
 
 1617       source.append(
"                                                         s_left_scratch,  \n");
 
 1618       source.append(
"                                                         s_right_scratch,  \n");
 
 1619       source.append(
"                                                         converged);  \n");
 
 1620       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1622       source.append(
"             if ((gtid < num_intervals) && (0 == converged))  \n");
 
 1623       source.append(
"             {  \n");
 
 1625       source.append(
"                 if (right_count == mid_count)  \n");
 
 1626       source.append(
"                 {  \n");
 
 1627       source.append(
"                     right = mid;  \n");
 
 1628       source.append(
"                 }  \n");
 
 1629       source.append(
"                 else  \n");
 
 1630       source.append(
"                 {  \n");
 
 1631       source.append(
"                     left = mid;  \n");
 
 1632       source.append(
"                 }  \n");
 
 1634       source.append(
"                 "); source.append(numeric_string); source.append(
" t0 = right - left;  \n");
 
 1635       source.append(
"                 "); source.append(numeric_string); source.append(
" t1 = max(fabs(right), fabs(left)) * precision;  \n");
 
 1637       source.append(
"                 if (t0 < min(precision, t1))  \n");
 
 1638       source.append(
"                 {  \n");
 
 1639       source.append(
"                     "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right);  \n");
 
 1640       source.append(
"                     left = lambda;  \n");
 
 1641       source.append(
"                     right = lambda;  \n");
 
 1643       source.append(
"                     converged = 1;  \n");
 
 1644       source.append(
"                 }  \n");
 
 1645       source.append(
"                 else  \n");
 
 1646       source.append(
"                 {  \n");
 
 1647       source.append(
"                     converged_all_threads = 0;  \n");
 
 1648       source.append(
"                 }  \n");
 
 1649       source.append(
"             }  \n");
 
 1650       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1651       source.append(
"             if (1 == converged_all_threads)  \n");
 
 1652       source.append(
"             {  \n");
 
 1653       source.append(
"                 break;  \n");
 
 1654       source.append(
"             }  \n");
 
 1655       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1656       source.append(
"         }  \n");
 
 1658       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1659       source.append(
"         if (gtid < num_intervals)  \n");
 
 1660       source.append(
"         {  \n");
 
 1663       source.append(
"             g_left[gtid] = left;  \n");
 
 1664       source.append(
"         }  \n");
 
 1665       source.append(
"     }  \n");
 
 1672   template <
typename StringType>
 
 1675       source.append(
"       \n");
 
 1676       source.append(
"     void writeToGmem(const unsigned int tid, const unsigned int tid_2,  \n");
 
 1677       source.append(
"                      const unsigned int num_threads_active,  \n");
 
 1678       source.append(
"                      const unsigned int num_blocks_mult,  \n");
 
 1679       source.append(
"                      __global "); source.append(numeric_string); source.append(
" *g_left_one,    \n");
 
 1680       source.append(
"                      __global "); source.append(numeric_string); source.append(
" *g_right_one,   \n");
 
 1681       source.append(
"                      __global unsigned int *g_pos_one,  \n");
 
 1682       source.append(
"                      __global "); source.append(numeric_string); source.append(
" *g_left_mult,   \n");
 
 1683       source.append(
"                      __global "); source.append(numeric_string); source.append(
" *g_right_mult,  \n");
 
 1684       source.append(
"                      __global unsigned int *g_left_count_mult,  \n");
 
 1685       source.append(
"                      __global unsigned int *g_right_count_mult,  \n");
 
 1686       source.append(
"                      __local "); source.append(numeric_string); source.append(
" *s_left,    \n");
 
 1687       source.append(
"                      __local "); source.append(numeric_string); source.append(
" *s_right,  \n");
 
 1688       source.append(
"                      __local unsigned short *s_left_count, __local unsigned short *s_right_count,  \n");
 
 1689       source.append(
"                      __global unsigned int *g_blocks_mult,  \n");
 
 1690       source.append(
"                      __global unsigned int *g_blocks_mult_sum,  \n");
 
 1691       source.append(
"                      __local unsigned short *s_compaction_list,  \n");
 
 1692       source.append(
"                      __local unsigned short *s_cl_helper,  \n");
 
 1693       source.append(
"                      unsigned int offset_mult_lambda  \n");
 
 1694       source.append(
"                     )  \n");
 
 1695       source.append(
"     {  \n");
 
 1696       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1697       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1698       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1699       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1700       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1703       source.append(
"         if (tid < offset_mult_lambda)  \n");
 
 1704       source.append(
"         {  \n");
 
 1706       source.append(
"             g_left_one[tid] = s_left[tid];  \n");
 
 1707       source.append(
"             g_right_one[tid] = s_right[tid];  \n");
 
 1709       source.append(
"             g_pos_one[tid] = s_right_count[tid];  \n");
 
 1710       source.append(
"         }  \n");
 
 1711       source.append(
"         else  \n");
 
 1712       source.append(
"         {  \n");
 
 1714       source.append(
"               \n");
 
 1715       source.append(
"             g_left_mult[tid - offset_mult_lambda] = s_left[tid];  \n");
 
 1716       source.append(
"             g_right_mult[tid - offset_mult_lambda] = s_right[tid];  \n");
 
 1717       source.append(
"             g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];  \n");
 
 1718       source.append(
"             g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];  \n");
 
 1719       source.append(
"         }  \n");
 
 1721       source.append(
"         if (tid_2 < num_threads_active)  \n");
 
 1722       source.append(
"         {  \n");
 
 1724       source.append(
"             if (tid_2 < offset_mult_lambda)  \n");
 
 1725       source.append(
"             {  \n");
 
 1727       source.append(
"                 g_left_one[tid_2] = s_left[tid_2];  \n");
 
 1728       source.append(
"                 g_right_one[tid_2] = s_right[tid_2];  \n");
 
 1730       source.append(
"                 g_pos_one[tid_2] = s_right_count[tid_2];  \n");
 
 1731       source.append(
"             }  \n");
 
 1732       source.append(
"             else  \n");
 
 1733       source.append(
"             {  \n");
 
 1735       source.append(
"                 g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];  \n");
 
 1736       source.append(
"                 g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];  \n");
 
 1737       source.append(
"                 g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];  \n");
 
 1738       source.append(
"                 g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];  \n");
 
 1739       source.append(
"             }  \n");
 
 1741       source.append(
"    } \n");      
 
 1743           source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1747       source.append(
"         if (tid <= num_blocks_mult)  \n");
 
 1748       source.append(
"         {  \n");
 
 1749       source.append(
"             g_blocks_mult[tid] = s_compaction_list[tid];  \n");
 
 1750       source.append(
"             g_blocks_mult_sum[tid] = s_cl_helper[tid];  \n");
 
 1751       source.append(
"         }  \n");
 
 1752       source.append(
"         if (tid_2 <= num_blocks_mult)  \n");
 
 1753       source.append(
"         {  \n");
 
 1754       source.append(
"             g_blocks_mult[tid_2] = s_compaction_list[tid_2];  \n");
 
 1755       source.append(
"             g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];  \n");
 
 1756       source.append(
"         }  \n");
 
 1757       source.append(
"     }  \n");
 
 1764   template <
typename StringType>
 
 1767       source.append(
"       \n");
 
 1768       source.append(
"     void  \n");
 
 1769       source.append(
"     compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,  \n");
 
 1770       source.append(
"                         const unsigned int num_threads_active,  \n");
 
 1771       source.append(
"                         __local unsigned int *offset_mult_lambda,  \n");
 
 1772       source.append(
"                         __local "); source.append(numeric_string); source.append(
" *s_left,    \n");
 
 1773       source.append(
"                         __local "); source.append(numeric_string); source.append(
" *s_right,   \n");
 
 1774       source.append(
"                         __local unsigned short *s_left_count,   __local unsigned short *s_right_count,  \n");
 
 1775       source.append(
"                         __local unsigned short *s_cl_one,       __local unsigned short *s_cl_mult,  \n");
 
 1776       source.append(
"                         __local unsigned short *s_cl_blocking,  __local unsigned short *s_cl_helper,  \n");
 
 1777       source.append(
"                         unsigned int is_one_lambda, unsigned int is_one_lambda_2,  \n");
 
 1778       source.append(
"                         "); source.append(numeric_string); source.append(
" *left,    \n");
 
 1779       source.append(
"                         "); source.append(numeric_string); source.append(
" *right,    \n");
 
 1780       source.append(
"                         "); source.append(numeric_string); source.append(
" *left_2,    \n");
 
 1781       source.append(
"                         "); source.append(numeric_string); source.append(
" *right_2,    \n");
 
 1782       source.append(
"                         unsigned int *left_count, unsigned int *right_count,  \n");
 
 1783       source.append(
"                         unsigned int *left_count_2, unsigned int *right_count_2,  \n");
 
 1784       source.append(
"                         unsigned int c_block_iend, unsigned int c_sum_block,  \n");
 
 1785       source.append(
"                         unsigned int c_block_iend_2, unsigned int c_sum_block_2  \n");
 
 1786       source.append(
"                        )  \n");
 
 1787       source.append(
"     {  \n");
 
 1788       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1789       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1790       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1791       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1792       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1795       source.append(
"         *left = s_left[tid];  \n");
 
 1796       source.append(
"         *right = s_right[tid];  \n");
 
 1798       source.append(
"         if (tid_2 < num_threads_active)  \n");
 
 1799       source.append(
"         {  \n");
 
 1800       source.append(
"             *left_2 = s_left[tid_2];  \n");
 
 1801       source.append(
"             *right_2 = s_right[tid_2];  \n");
 
 1802       source.append(
"         }  \n");
 
 1804       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1808       source.append(
"         unsigned int ptr_w = 0;  \n");
 
 1809       source.append(
"         unsigned int ptr_w_2 = 0;  \n");
 
 1810       source.append(
"         unsigned int ptr_blocking_w = 0;  \n");
 
 1811       source.append(
"         unsigned int ptr_blocking_w_2 = 0;  \n");
 
 1812       source.append(
"           \n");
 
 1813       source.append(
"          \n");
 
 1815       source.append(
"         ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]  \n");
 
 1816       source.append(
"                 : s_cl_mult[tid] + *offset_mult_lambda;  \n");
 
 1818       source.append(
"         if (0 != c_block_iend)  \n");
 
 1819       source.append(
"         {  \n");
 
 1820       source.append(
"             ptr_blocking_w = s_cl_blocking[tid];  \n");
 
 1821       source.append(
"         }  \n");
 
 1823       source.append(
"         if (tid_2 < num_threads_active)  \n");
 
 1824       source.append(
"         {  \n");
 
 1825       source.append(
"             ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]  \n");
 
 1826       source.append(
"                       : s_cl_mult[tid_2] + *offset_mult_lambda;  \n");
 
 1828       source.append(
"             if (0 != c_block_iend_2)  \n");
 
 1829       source.append(
"             {  \n");
 
 1830       source.append(
"                 ptr_blocking_w_2 = s_cl_blocking[tid_2];  \n");
 
 1831       source.append(
"             }  \n");
 
 1832       source.append(
"         }  \n");
 
 1833       source.append(
"           \n");
 
 1834       source.append(
"          \n");
 
 1835       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1836       source.append(
"           \n");
 
 1838       source.append(
"           if(tid < num_threads_active) \n");
 
 1839       source.append(
"           { \n ");
 
 1840       source.append(
"             s_left[ptr_w] = *left;  \n");
 
 1841       source.append(
"             s_right[ptr_w] = *right;  \n");
 
 1842       source.append(
"             s_left_count[ptr_w] = *left_count;  \n");
 
 1843       source.append(
"             s_right_count[ptr_w] = *right_count;  \n");
 
 1844       source.append(
"           } \n ");
 
 1845       source.append(
"           \n");
 
 1846       source.append(
"           \n");
 
 1847       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1848       source.append(
"         if(tid == 1)  \n");
 
 1849       source.append(
"         {  \n");
 
 1850       source.append(
"           s_left[ptr_w] = *left;  \n");
 
 1851       source.append(
"           s_right[ptr_w] = *right;  \n");
 
 1852       source.append(
"           s_left_count[ptr_w] = *left_count;  \n");
 
 1853       source.append(
"           s_right_count[ptr_w] = *right_count;  \n");
 
 1854       source.append(
"           \n");
 
 1855       source.append(
"         }  \n");
 
 1856       source.append(
"               if (0 != c_block_iend)  \n");
 
 1857       source.append(
"         {  \n");
 
 1858       source.append(
"             s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;  \n");
 
 1859       source.append(
"             s_cl_helper[ptr_blocking_w + 1] = c_sum_block;  \n");
 
 1860       source.append(
"         }  \n");
 
 1861       source.append(
"           \n");
 
 1862       source.append(
"         if (tid_2 < num_threads_active)  \n");
 
 1863       source.append(
"         {  \n");
 
 1865       source.append(
"             s_left[ptr_w_2] = *left_2;  \n");
 
 1866       source.append(
"             s_right[ptr_w_2] = *right_2;  \n");
 
 1867       source.append(
"             s_left_count[ptr_w_2] = *left_count_2;  \n");
 
 1868       source.append(
"             s_right_count[ptr_w_2] = *right_count_2;  \n");
 
 1870       source.append(
"             if (0 != c_block_iend_2)  \n");
 
 1871       source.append(
"             {  \n");
 
 1872       source.append(
"                 s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;  \n");
 
 1873       source.append(
"                 s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;  \n");
 
 1874       source.append(
"             }  \n");
 
 1875       source.append(
"         }  \n");
 
 1877       source.append(
"     }  \n");
 
 1886   template <
typename StringType>
 
 1889       source.append(
"       \n");
 
 1890       source.append(
"     void  \n");
 
 1891       source.append(
"     scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,  \n");
 
 1892       source.append(
"                                   const unsigned int num_threads_compaction,  \n");
 
 1893       source.append(
"                                   __local unsigned short *s_cl_blocking,  \n");
 
 1894       source.append(
"                                   __local unsigned short *s_cl_helper  \n");
 
 1895       source.append(
"                                  )  \n");
 
 1896       source.append(
"     {  \n");
 
 1897       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1898       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1899       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1900       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1901       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1905       source.append(
"         s_cl_blocking[tid] = s_cl_helper[tid];  \n");
 
 1907       source.append(
"         if (tid_2 < num_threads_compaction)  \n");
 
 1908       source.append(
"         {  \n");
 
 1909       source.append(
"             s_cl_blocking[tid_2] = s_cl_helper[tid_2];  \n");
 
 1910       source.append(
"         }  \n");
 
 1912       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1919       source.append(
"         unsigned int offset = 1;  \n");
 
 1922       source.append(
"         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
 
 1923       source.append(
"         {  \n");
 
 1925       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1927       source.append(
"             if (tid < d)  \n");
 
 1928       source.append(
"             {  \n");
 
 1930       source.append(
"                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
 
 1931       source.append(
"                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
 
 1932       source.append(
"                 s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];  \n");
 
 1933       source.append(
"             }  \n");
 
 1935       source.append(
"             offset <<= 1;  \n");
 
 1936       source.append(
"         }  \n");
 
 1939       source.append(
"         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
 
 1940       source.append(
"         {  \n");
 
 1942       source.append(
"             offset >>= 1;  \n");
 
 1943       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1946       source.append(
"             if (tid < (d-1))  \n");
 
 1947       source.append(
"             {  \n");
 
 1949       source.append(
"                 unsigned int  ai = offset*(tid+1) - 1;  \n");
 
 1950       source.append(
"                 unsigned int  bi = ai + (offset >> 1);  \n");
 
 1951       source.append(
"                 s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];  \n");
 
 1952       source.append(
"             }  \n");
 
 1953       source.append(
"         }  \n");
 
 1955       source.append(
"     }  \n");
 
 1965   template <
typename StringType>
 
 1968       source.append(
"       \n");
 
 1969       source.append(
"     void  \n");
 
 1970       source.append(
"     scanSumBlocks(const unsigned int tid, const unsigned int tid_2,  \n");
 
 1971       source.append(
"                   const unsigned int num_threads_active,  \n");
 
 1972       source.append(
"                   const unsigned int num_threads_compaction,  \n");
 
 1973       source.append(
"                   __local unsigned short *s_cl_blocking,  \n");
 
 1974       source.append(
"                   __local unsigned short *s_cl_helper)  \n");
 
 1975       source.append(
"     {  \n");
 
 1976       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 1977       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 1978       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 1979       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 1980       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 1982       source.append(
"         unsigned int offset = 1;  \n");
 
 1986       source.append(
"         for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)  \n");
 
 1987       source.append(
"         {  \n");
 
 1989       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 1991       source.append(
"             if (tid < d)  \n");
 
 1992       source.append(
"             {  \n");
 
 1994       source.append(
"                 unsigned int ai = offset*(2*tid+1)-1;  \n");
 
 1995       source.append(
"                 unsigned int bi = offset*(2*tid+2)-1;  \n");
 
 1997       source.append(
"                 s_cl_blocking[bi] += s_cl_blocking[ai];  \n");
 
 1998       source.append(
"             }  \n");
 
 2000       source.append(
"             offset *= 2;  \n");
 
 2001       source.append(
"         }  \n");
 
 2005       source.append(
"         for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)  \n");
 
 2006       source.append(
"         {  \n");
 
 2008       source.append(
"             offset >>= 1;  \n");
 
 2009       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2011       source.append(
"             if (tid < (d-1))  \n");
 
 2012       source.append(
"             {  \n");
 
 2013       source.append(
"                 unsigned int ai = offset*(tid+1) - 1;  \n");
 
 2014       source.append(
"                 unsigned int bi = ai + (offset >> 1);  \n");
 
 2015       source.append(
"                 s_cl_blocking[bi] += s_cl_blocking[ai];  \n");
 
 2016       source.append(
"             }  \n");
 
 2017       source.append(
"         }  \n");
 
 2018       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2020       source.append(
"         if (0 == tid)  \n");
 
 2021       source.append(
"         {  \n");
 
 2026       source.append(
"             s_cl_helper[num_threads_active - 1] =  \n");
 
 2027       source.append(
"                 s_cl_helper[num_threads_compaction - 1];  \n");
 
 2028       source.append(
"             s_cl_blocking[num_threads_active - 1] =  \n");
 
 2029       source.append(
"                 s_cl_blocking[num_threads_compaction - 1];  \n");
 
 2030       source.append(
"         }  \n");
 
 2031       source.append(
"     }  \n");
 
 2041   template <
typename StringType>
 
 2044       source.append(
"       \n");
 
 2045       source.append(
"     void  \n");
 
 2046       source.append(
"     scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int n,  \n");
 
 2047       source.append(
"                 const unsigned int num_threads_active,  \n");
 
 2048       source.append(
"                 const unsigned int num_threads_compaction,  \n");
 
 2049       source.append(
"                 __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult,  \n");
 
 2050       source.append(
"                 __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper  \n");
 
 2051       source.append(
"                )  \n");
 
 2052       source.append(
"     {  \n");
 
 2053       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 2054       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 2055       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 2056       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 2057       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 2065       source.append(
"         unsigned int offset = 1;  \n");
 
 2068       source.append(
"         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
 
 2069       source.append(
"         {  \n");
 
 2071       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2073       source.append(
"             if (tid < d)  \n");
 
 2074       source.append(
"             {  \n");
 
 2076       source.append(
"                 unsigned int  ai = offset*(2*tid+1);  \n");
 
 2077       source.append(
"                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
 
 2079       source.append(
"                 s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];  \n");
 
 2080       source.append(
"                 s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];  \n");
 
 2088       source.append(
"                 if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))  \n");
 
 2089       source.append(
"                 {  \n");
 
 2092       source.append(
"                     if (s_cl_helper[ai - 1] == 1)  \n");
 
 2093       source.append(
"                     {  \n");
 
 2095       source.append(
"                         s_cl_helper[bi] = 1;  \n");
 
 2096       source.append(
"                     }  \n");
 
 2097       source.append(
"                     else if (s_cl_helper[bi] == 1)  \n");
 
 2098       source.append(
"                     {  \n");
 
 2100       source.append(
"                         s_cl_helper[ai - 1] = 1;  \n");
 
 2101       source.append(
"                     }  \n");
 
 2102       source.append(
"               else      \n");   
 
 2103       source.append(
"                     {  \n");
 
 2105       source.append(
"                         unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];  \n");
 
 2107       source.append(
"                         if (temp > (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))  \n");
 
 2108       source.append(
"                         {  \n");
 
 2111       source.append(
"                             s_cl_helper[ai - 1] = 1;  \n");
 
 2112       source.append(
"                             s_cl_helper[bi] = 1;  \n");
 
 2113       source.append(
"                         }  \n");
 
 2114       source.append(
"                         else  \n");
 
 2115       source.append(
"                         {  \n");
 
 2117       source.append(
"                             s_cl_blocking[bi] = temp;  \n");
 
 2118       source.append(
"                             s_cl_blocking[ai - 1] = 0;  \n");
 
 2119       source.append(
"                         }  \n");
 
 2120       source.append(
"                     }  \n");
 
 2121       source.append(
"            } \n"); 
 
 2122       source.append(
"             }  \n");
 
 2123       source.append(
"             offset <<= 1;  \n");
 
 2124       source.append(
"         }  \n");
 
 2129       source.append(
"         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
 
 2130       source.append(
"         {  \n");
 
 2131       source.append(
"             offset >>= 1;  \n");
 
 2132       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2134       source.append(
"             if (tid < (d-1))  \n");
 
 2135       source.append(
"             {  \n");
 
 2136       source.append(
"                 unsigned int  ai = offset*(tid+1) - 1;  \n");
 
 2137       source.append(
"                 unsigned int  bi = ai + (offset >> 1);  \n");
 
 2138       source.append(
"                 s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];  \n");
 
 2139       source.append(
"                 s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];  \n");
 
 2140       source.append(
"             }  \n");
 
 2141       source.append(
"         }  \n");
 
 2142       source.append(
"     }  \n");
 
 2158   template <
typename StringType>
 
 2161       source.append(
"     __kernel  \n");
 
 2162       source.append(
"     void  \n");
 
 2163       source.append(
"     bisectKernelLarge(__global "); source.append(numeric_string); source.append(
" *g_d,    \n");
 
 2164       source.append(
"                       __global "); source.append(numeric_string); source.append(
" *g_s,    \n");
 
 2165       source.append(
"                       const unsigned int n,  \n");
 
 2166       source.append(
"                       const "); source.append(numeric_string); source.append(
" lg,     \n");
 
 2167       source.append(
"                       const "); source.append(numeric_string); source.append(
" ug,     \n");
 
 2168       source.append(
"                       const unsigned int lg_eig_count,  \n");
 
 2169       source.append(
"                       const unsigned int ug_eig_count,  \n");
 
 2170       source.append(
"                       "); source.append(numeric_string); source.append(
" epsilon,  \n");
 
 2171       source.append(
"                       __global unsigned int *g_num_one,  \n");
 
 2172       source.append(
"                       __global unsigned int *g_num_blocks_mult,  \n");
 
 2173       source.append(
"                       __global "); source.append(numeric_string); source.append(
" *g_left_one,    \n");
 
 2174       source.append(
"                       __global "); source.append(numeric_string); source.append(
" *g_right_one,   \n");
 
 2175       source.append(
"                       __global unsigned int *g_pos_one,  \n");
 
 2176       source.append(
"                       __global "); source.append(numeric_string); source.append(
" *g_left_mult,   \n");
 
 2177       source.append(
"                       __global "); source.append(numeric_string); source.append(
" *g_right_mult,  \n");
 
 2178       source.append(
"                       __global unsigned int *g_left_count_mult,  \n");
 
 2179       source.append(
"                       __global unsigned int *g_right_count_mult,  \n");
 
 2180       source.append(
"                       __global unsigned int *g_blocks_mult,  \n");
 
 2181       source.append(
"                       __global unsigned int *g_blocks_mult_sum  \n");
 
 2182       source.append(
"                      )  \n");
 
 2183       source.append(
"     {  \n");
 
 2184       source.append(
"         g_s = g_s + 1; \n");
 
 2185       source.append(
"         uint glb_id = get_global_id(0); \n");
 
 2186       source.append(
"         uint grp_id = get_group_id(0); \n");
 
 2187       source.append(
"         uint grp_nm = get_num_groups(0); \n");
 
 2188       source.append(
"         uint lcl_id = get_local_id(0); \n");
 
 2189       source.append(
"         uint lcl_sz = get_local_size(0); \n");
 
 2191       source.append(
"         const unsigned int tid = lcl_id;  \n");
 
 2195       source.append(
"         __local "); source.append(numeric_string); source.append(
"  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 2196       source.append(
"         __local "); source.append(numeric_string); source.append(
"  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 2200       source.append(
"         __local  unsigned short  s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 2201       source.append(
"         __local  unsigned short  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 2204       source.append(
"         __local  unsigned short  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 2209       source.append(
"         __local  unsigned int compact_second_chunk;  \n");
 
 2211       source.append(
"         __local  unsigned int all_threads_converged;  \n");
 
 2214       source.append(
"         __local  unsigned int num_threads_active;  \n");
 
 2217       source.append(
"         __local  unsigned int num_threads_compaction;  \n");
 
 2220       source.append(
"         __local unsigned short *s_compaction_list_exc = s_compaction_list + 1;  \n");
 
 2224       source.append(
"         "); source.append(numeric_string); source.append(
" left = 0.0f;  \n");
 
 2225       source.append(
"         "); source.append(numeric_string); source.append(
" right = 0.0f;  \n");
 
 2226       source.append(
"         unsigned int left_count = 0;  \n");
 
 2227       source.append(
"         unsigned int right_count = 0;  \n");
 
 2229       source.append(
"         "); source.append(numeric_string); source.append(
" mid = 0.0f;  \n");
 
 2231       source.append(
"         unsigned int mid_count = 0;  \n");
 
 2233       source.append(
"         unsigned int is_active_second = 0;  \n");
 
 2236       source.append(
"         s_compaction_list[tid] = 0;  \n");
 
 2237       source.append(
"         s_left[tid] = 0;  \n");
 
 2238       source.append(
"         s_right[tid] = 0;  \n");
 
 2239       source.append(
"         s_left_count[tid] = 0;  \n");
 
 2240       source.append(
"         s_right_count[tid] = 0;  \n");
 
 2242       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2245       source.append(
"         if (0 == tid)  \n");
 
 2246       source.append(
"         {  \n");
 
 2248       source.append(
"             s_left[0] = lg;  \n");
 
 2249       source.append(
"             s_right[0] = ug;  \n");
 
 2250       source.append(
"             s_left_count[0] = lg_eig_count;  \n");
 
 2251       source.append(
"             s_right_count[0] = ug_eig_count;  \n");
 
 2253       source.append(
"             compact_second_chunk = 0;  \n");
 
 2254       source.append(
"             num_threads_active = 1;  \n");
 
 2256       source.append(
"             num_threads_compaction = 1;  \n");
 
 2258       source.append(
"             all_threads_converged = 1;  \n");
 
 2259       source.append(
"         }  \n");
 
 2261       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2267       source.append(
"    while( true )    \n");
 
 2268       source.append(
"         {  \n");
 
 2269       source.append(
"             s_compaction_list[tid] = 0;  \n");
 
 2270       source.append(
"             s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
 
 2271       source.append(
"             s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
 
 2272       source.append(
"             subdivideActiveIntervalShort(tid, s_left, s_right, s_left_count, s_right_count,  \n");
 
 2273       source.append(
"                                     num_threads_active,  \n");
 
 2274       source.append(
"                                     &left, &right, &left_count, &right_count,  \n");
 
 2275       source.append(
"                                     &mid, &all_threads_converged);  \n");
 
 2277       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2280       source.append(
"             if (1 == all_threads_converged)  \n");
 
 2281       source.append(
"             {  \n");
 
 2282       source.append(
"                 break;  \n");
 
 2283       source.append(
"             }  \n");
 
 2290       source.append(
"             mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,  \n");
 
 2291       source.append(
"                                                         mid, lcl_id,  \n");
 
 2292       source.append(
"                                                         num_threads_active,  \n");
 
 2293       source.append(
"                                                         s_left, s_right,  \n");
 
 2294       source.append(
"                                                         (left == right));  \n");
 
 2296       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2308       source.append(
"             if (tid < num_threads_active)  \n");
 
 2309       source.append(
"             {  \n");
 
 2311       source.append(
"                 if (left != right)  \n");
 
 2312       source.append(
"                 {  \n");
 
 2315       source.append(
"                     storeNonEmptyIntervalsLarge(tid, num_threads_active,  \n");
 
 2316       source.append(
"                                                 s_left, s_right,  \n");
 
 2317       source.append(
"                                                 s_left_count, s_right_count,  \n");
 
 2318       source.append(
"                                                 left, mid, right,  \n");
 
 2319       source.append(
"                                                 left_count, mid_count, right_count,  \n");
 
 2320       source.append(
"                                                 epsilon, &compact_second_chunk,  \n");
 
 2321       source.append(
"                                                 s_compaction_list_exc,  \n");
 
 2322       source.append(
"                                                 &is_active_second);  \n");
 
 2323       source.append(
"                 }  \n");
 
 2324       source.append(
"                 else  \n");
 
 2325       source.append(
"                 {  \n");
 
 2330       source.append(
"                     s_left[tid] = left;  \n");
 
 2331       source.append(
"                     s_right[tid] = left;  \n");
 
 2332       source.append(
"                     s_left_count[tid] = left_count;  \n");
 
 2333       source.append(
"                     s_right_count[tid] = right_count;  \n");
 
 2335       source.append(
"                     is_active_second = 0;  \n");
 
 2336       source.append(
"                 }  \n");
 
 2337       source.append(
"             }  \n");
 
 2340       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2345       source.append(
"             if (compact_second_chunk > 0)  \n");
 
 2346       source.append(
"             {  \n");
 
 2349       source.append(
"                 createIndicesCompactionShort(s_compaction_list_exc, num_threads_compaction);  \n");
 
 2350       source.append(
"             }  \n");
 
 2351       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2352       source.append(
"               \n");
 
 2353       source.append(
"        if (compact_second_chunk > 0)               \n");
 
 2354       source.append(
"             {  \n");
 
 2355       source.append(
"                 compactIntervalsShort(s_left, s_right, s_left_count, s_right_count,  \n");
 
 2356       source.append(
"                                  mid, right, mid_count, right_count,  \n");
 
 2357       source.append(
"                                  s_compaction_list, num_threads_active,  \n");
 
 2358       source.append(
"                                  is_active_second);  \n");
 
 2359       source.append(
"             }  \n");
 
 2361       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2364       source.append(
"             if (0 == tid)  \n");
 
 2365       source.append(
"             {  \n");
 
 2368       source.append(
"                 num_threads_active += s_compaction_list[num_threads_active];  \n");
 
 2369       source.append(
"                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
 
 2371       source.append(
"                 compact_second_chunk = 0;  \n");
 
 2372       source.append(
"                 all_threads_converged = 1;  \n");
 
 2373       source.append(
"             }  \n");
 
 2374       source.append(
"             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2375       source.append(
"             if (num_threads_compaction > lcl_sz)  \n");
 
 2376       source.append(
"             {  \n");
 
 2377       source.append(
"                 break;  \n");
 
 2378       source.append(
"             }  \n");
 
 2379       source.append(
"         }  \n");
 
 2380       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2388       source.append(
"         unsigned int left_count_2;  \n");
 
 2389       source.append(
"         unsigned int right_count_2;  \n");
 
 2391       source.append(
"         unsigned int tid_2 = tid + lcl_sz;  \n");
 
 2395       source.append(
"         left_count = s_left_count[tid];  \n");
 
 2396       source.append(
"         right_count = s_right_count[tid];  \n");
 
 2399       source.append(
"         if (tid_2 < num_threads_active)  \n");
 
 2400       source.append(
"         {  \n");
 
 2401       source.append(
"             left_count_2 = s_left_count[tid_2];  \n");
 
 2402       source.append(
"             right_count_2 = s_right_count[tid_2];  \n");
 
 2403       source.append(
"         }  \n");
 
 2407       source.append(
"         __local unsigned short  *s_cl_one = s_left_count + 1;  \n");
 
 2408       source.append(
"         __local unsigned short  *s_cl_mult = s_right_count + 1;  \n");
 
 2412       source.append(
"         __local unsigned short  *s_cl_blocking = s_compaction_list_exc;  \n");
 
 2414       source.append(
"         __local unsigned short  s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
 
 2416       source.append(
"         if (0 == tid)  \n");
 
 2417       source.append(
"         {  \n");
 
 2419       source.append(
"             s_left_count[0] = 0;  \n");
 
 2420       source.append(
"             s_right_count[0] = 0;  \n");
 
 2421       source.append(
"              \n");
 
 2422       source.append(
"         }  \n");
 
 2424       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2427       source.append(
"         unsigned int is_one_lambda = 0;  \n");
 
 2428       source.append(
"         unsigned int is_one_lambda_2 = 0;  \n");
 
 2431       source.append(
"         unsigned int multiplicity = right_count - left_count;  \n");
 
 2432       source.append(
"         is_one_lambda = (1 == multiplicity);  \n");
 
 2434       source.append(
"         s_cl_one[tid] = is_one_lambda;  \n");
 
 2435       source.append(
"         s_cl_mult[tid] = (! is_one_lambda);  \n");
 
 2438       source.append(
"         s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;  \n");
 
 2439       source.append(
"         s_cl_helper[tid] = 0;  \n");
 
 2441       source.append(
"         if (tid_2 < num_threads_active)  \n");
 
 2442       source.append(
"         {  \n");
 
 2444       source.append(
"             unsigned int multiplicity = right_count_2 - left_count_2;  \n");
 
 2445       source.append(
"             is_one_lambda_2 = (1 == multiplicity);  \n");
 
 2447       source.append(
"             s_cl_one[tid_2] = is_one_lambda_2;  \n");
 
 2448       source.append(
"             s_cl_mult[tid_2] = (! is_one_lambda_2);  \n");
 
 2451       source.append(
"             s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;  \n");
 
 2452       source.append(
"             s_cl_helper[tid_2] = 0;  \n");
 
 2453       source.append(
"         }  \n");
 
 2454       source.append(
"         else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))  \n");
 
 2455       source.append(
"         {  \n");
 
 2458       source.append(
"             s_cl_blocking[tid_2] = 0;  \n");
 
 2459       source.append(
"             s_cl_helper[tid_2] = 0;  \n");
 
 2460       source.append(
"         }  \n");
 
 2463       source.append(
"         scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,  \n");
 
 2464       source.append(
"                     s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);  \n");
 
 2465       source.append(
"           \n");
 
 2466       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2468       source.append(
"         scanSumBlocks(tid, tid_2, num_threads_active,  \n");
 
 2469       source.append(
"                       num_threads_compaction, s_cl_blocking, s_cl_helper);  \n");
 
 2472       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2474       source.append(
"         unsigned int  c_block_iend = 0;  \n");
 
 2475       source.append(
"         unsigned int  c_block_iend_2 = 0;  \n");
 
 2476       source.append(
"         unsigned int  c_sum_block = 0;  \n");
 
 2477       source.append(
"         unsigned int  c_sum_block_2 = 0;  \n");
 
 2484       source.append(
"         if (1 == s_cl_helper[tid])  \n");
 
 2485       source.append(
"         {  \n");
 
 2487       source.append(
"             c_block_iend = s_cl_mult[tid] + 1;  \n");
 
 2488       source.append(
"             c_sum_block = s_cl_blocking[tid];  \n");
 
 2489       source.append(
"         }  \n");
 
 2491       source.append(
"         if (1 == s_cl_helper[tid_2])  \n");
 
 2492       source.append(
"         {  \n");
 
 2494       source.append(
"             c_block_iend_2 = s_cl_mult[tid_2] + 1;  \n");
 
 2495       source.append(
"             c_sum_block_2 = s_cl_blocking[tid_2];  \n");
 
 2496       source.append(
"         }  \n");
 
 2498       source.append(
"         scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,  \n");
 
 2499       source.append(
"                                       s_cl_blocking, s_cl_helper);  \n");
 
 2503       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2506       source.append(
"         __local  unsigned int num_blocks_mult;  \n");
 
 2507       source.append(
"         __local  unsigned int num_mult;  \n");
 
 2508       source.append(
"         __local  unsigned int offset_mult_lambda;  \n");
 
 2510       source.append(
"         if (0 == tid)  \n");
 
 2511       source.append(
"         {  \n");
 
 2513       source.append(
"             num_blocks_mult = s_cl_blocking[num_threads_active - 1];  \n");
 
 2514       source.append(
"             offset_mult_lambda = s_cl_one[num_threads_active - 1];  \n");
 
 2515       source.append(
"             num_mult = s_cl_mult[num_threads_active - 1];  \n");
 
 2517       source.append(
"             *g_num_one = offset_mult_lambda;  \n");
 
 2518       source.append(
"             *g_num_blocks_mult = num_blocks_mult;  \n");
 
 2519       source.append(
"         }  \n");
 
 2521       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2523       source.append(
"         "); source.append(numeric_string); source.append(
" left_2, right_2;  \n");
 
 2524       source.append(
"         --s_cl_one;  \n");
 
 2525       source.append(
"         --s_cl_mult;  \n");
 
 2526       source.append(
"         --s_cl_blocking;  \n");
 
 2527       source.append(
"           \n");
 
 2528       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2529       source.append(
"         compactStreamsFinal(tid, tid_2, num_threads_active, &offset_mult_lambda,  \n");
 
 2530       source.append(
"                             s_left, s_right, s_left_count, s_right_count,  \n");
 
 2531       source.append(
"                             s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,  \n");
 
 2532       source.append(
"                             is_one_lambda, is_one_lambda_2,  \n");
 
 2533       source.append(
"                             &left, &right, &left_2, &right_2,  \n");
 
 2534       source.append(
"                             &left_count, &right_count, &left_count_2, &right_count_2,  \n");
 
 2535       source.append(
"                             c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2  \n");
 
 2536       source.append(
"                            );  \n");
 
 2538       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2541       source.append(
"         if (0 == tid)  \n");
 
 2542       source.append(
"         {  \n");
 
 2543       source.append(
"             s_cl_blocking[num_blocks_mult] = num_mult;  \n");
 
 2544       source.append(
"             s_cl_helper[0] = 0;  \n");
 
 2545       source.append(
"         }  \n");
 
 2547       source.append(
"         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
 
 2550       source.append(
"         writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,  \n");
 
 2551       source.append(
"                     g_left_one, g_right_one, g_pos_one,  \n");
 
 2552       source.append(
"                     g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,  \n");
 
 2553       source.append(
"                     s_left, s_right, s_left_count, s_right_count,  \n");
 
 2554       source.append(
"                     g_blocks_mult, g_blocks_mult_sum,  \n");
 
 2555       source.append(
"                     s_compaction_list, s_cl_helper, offset_mult_lambda);  \n");
 
 2556       source.append(
"                       \n");
 
 2558       source.append(
"     }  \n");
 
 2565   template <
class NumericT>
 
 2578       static std::map<cl_context, bool> init_done;
 
 2582         source.reserve(8192);
 
 2584         viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
 
 2587         if (numeric_string == 
"float" || numeric_string == 
"double")
 
 2632         #ifdef VIENNACL_BUILD_INFO 
 2633         std::cout << 
"Creating program " << prog_name << std::endl;
 
 2635         ctx.add_program(source, prog_name);
 
 2636         init_done[ctx.handle().get()] = 
true;
 
 2645 #endif // #ifndef _BISECT_KERNEL_LARGE_H_ 
void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_writeToGmem(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_bisect_kernel_bisectKernel(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_subdivideActiveInterval(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_floorPow2(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_bisectKernelLarge(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_createIndicesCompactionShort(StringType &source)
void generate_bisect_kernel_compactStreamsFinal(StringType &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_bisect_kernel_compactIntervals(StringType &source, std::string const &numeric_string)
Provides OpenCL-related utilities. 
void generate_bisect_kernel_compactIntervalsShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalConvergedShort(StringType &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const 
Returns the context handle. 
Common implementations shared by OpenCL-based operations. 
static void apply(viennacl::ocl::context const &)
void generate_bisect_kernel_createIndicesCompaction(StringType &source)
static std::string program_name()
const OCL_TYPE & get() const 
void generate_bisect_kernel_ceilPow2(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_scanCompactBlocksStartAddress(StringType &source)
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeMidpoint(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeNonEmptyIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_config(StringType &source)
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalShort(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL. 
void generate_bisect_kernel_scanSumBlocks(StringType &source)
void generate_bisect_kernel_scanInitial(StringType &source)
void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation. 
void generate_bisect_kernel_storeInterval(StringType &source, std::string const &numeric_string)
Main kernel class for the generation of the bisection kernels and utilities. 
void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalConverged(StringType &source, std::string const &numeric_string)