#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_

/** @file viennacl/linalg/host_based/spgemm_vector.hpp
    @brief Row-merge kernels for sparse matrix-matrix multiplication on the CPU (symbolic and numeric stage).
*/

#include <algorithm>  // std::min, std::swap

#include "viennacl/forwards.h"
#include "viennacl/linalg/host_based/common.hpp"

#ifdef VIENNACL_WITH_AVX2
#include "immintrin.h"
#endif

namespace viennacl
{
namespace linalg
{
namespace host_based
{
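
// All routines below operate on CSR storage: a matrix B with B_size2 columns is described by
//  - B_row_buffer: row i of B occupies the entries [B_row_buffer[i], B_row_buffer[i+1]) of the two arrays below,
//  - B_col_buffer: column indices within each row, sorted in increasing order,
//  - B_elements:   nonzero values, in the same order as B_col_buffer.
// For illustration, the 3x3 matrix
//    [1 0 2]
//    [0 0 3]
//    [4 5 0]
// has row_buffer = {0, 2, 3, 5}, col_buffer = {0, 2, 2, 0, 1}, elements = {1, 2, 3, 4, 5}.
// A row of C = A * B is the union of the rows of B selected by the nonzeros of the respective
// row of A, which is why all kernels below are variations of merging sorted index lists.
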
#ifdef VIENNACL_WITH_AVX2
/** @brief Merges up to eight rows of B into the output buffer using AVX2 gathers (symbolic stage, i.e. column indices only). */
inline
unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
                                             int *row_C_vector_output)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  // sign bit set exactly in lanes i with i < (number of rows to merge), i.e. lanes holding a valid row:
  __m256i avx_load_mask  = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(static_cast<int>(row_indices_B_end - row_indices_B_begin)));
  __m256i avx_load_mask2 = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  // inactive lanes default to row_start == row_end == 1 and hence never load anything:
  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  __m256i avx_index_front = avx_all_bsize2; // B_size2 acts as sentinel for exhausted lanes
  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // lower four lanes compared against upper four lanes

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));  // 78 == 0b01001110: swap 64-bit pairs within each 128-bit half
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 177 == 0b10110001: swap neighboring 32-bit lanes
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);     // now all eight lanes hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for termination:
    if (min_index_in_front == B_size2)
      break;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance the front in all lanes sitting on the minimum index:
    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set lanes that hit the minimum to the sentinel B_size2:
    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start in those lanes:
    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask);
    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new column indices in lanes where entries are still available:
    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif
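
// Worked example of the three-step minimum reduction above: with the index front holding
// {9, 4, 7, 2, 8, 5, 3, 6} in lanes 0..7,
//   after swapping 128-bit halves and taking the minimum:      {8, 4, 3, 2, 8, 4, 3, 2}
//   after swapping 64-bit pairs (imm 78) and the minimum:      {3, 2, 3, 2, 3, 2, 3, 2}
//   after swapping 32-bit neighbors (imm 177) and the minimum: {2, 2, 2, 2, 2, 2, 2, 2}
// so every lane holds the global minimum after log2(8) = 3 steps and a single lane extraction suffices.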
 
/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Since the input buffer needs to be merged in as well, this routine operates on an index front of length (IndexNum + 1).
**/
template<unsigned int IndexNum>
unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
                                          unsigned int *row_C_vector_output)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];

  // set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;

  // load the first index of each row (B_size2 acts as sentinel for empty rows):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // all rows exhausted, we're done
      break;

    // advance the index front wherever it equals the minimum index:
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
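
// Usage sketch (illustrative only, with B_row_buffer/B_col_buffer/B_size2 as described at the top
// of this file; the other buffer names are made up): merging rows {0, 2} of B with a previously
// merged index list:
//
//   unsigned int rows_B[] = {0, 2};
//   unsigned int merged[] = {1, 4};   // already accumulated column indices (sorted)
//   unsigned int result[64];          // must be large enough to hold the union
//   unsigned int len = row_C_scan_symbolic_vector_N<2>(rows_B,
//                                                      B_row_buffer, B_col_buffer, B_size2,
//                                                      merged, merged + 2,
//                                                      result);
//   // result[0..len) is the sorted union of the indices of rows 0 and 2 of B and of 'merged'.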
 
/** @brief Output writer used in the symbolic stage: writes the merged column index. */
struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };

/** @brief Output writer for merges where only the resulting row length is of interest: the write is suppressed. */
struct spgemm_output_write_disabled { static void apply(unsigned int *   , unsigned int      ) {               } };

template<typename OutputWriterT>
unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
                                          unsigned int const *input2_begin, unsigned int const *input2_end,
                                          unsigned int termination_index,
                                          unsigned int *output_begin)
{
  unsigned int *output_ptr = output_begin;

  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
  while (1)
  {
    unsigned int min_index = std::min(val_1, val_2);

    if (min_index == termination_index)
      break;

    if (min_index == val_1)
    {
      ++input1_begin;
      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
    }

    if (min_index == val_2)
    {
      ++input2_begin;
      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
    }

    // write current entry (a no-op for spgemm_output_write_disabled):
    OutputWriterT::apply(output_ptr, min_index);
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - output_begin);
}
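
// Merge semantics in a nutshell: for input1 = {1, 3, 5} and input2 = {2, 3, 7} with
// termination_index = 8, the loop emits {1, 2, 3, 5, 7} and returns 5. The shared index 3 is
// consumed from both inputs within a single iteration, so every column index is emitted once.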
 
/** @brief Computes the row length (number of nonzeros) of one row of C = A * B by repeatedly merging rows of B (symbolic stage). */
inline
unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
{
  // trivial case: empty row in A:
  if (row_start_A == row_end_A)
    return 0;

  // trivial case: row length 1 in A, so the row of C equals the respective row of B:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
  }

  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // row length 2: a single merge of two rows of B, no write needed
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                      B_size2,
                                                                      row_C_vector_1);
  }
  else // more than two rows of B: merge an initial batch into row_C_vector_1
  {
#ifdef VIENNACL_WITH_AVX2
    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                (int*)row_C_vector_1);
    row_start_A += 8;
#else
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                          B_size2,
                                                                          row_C_vector_1);
    row_start_A += 2;
#endif
  }

  // merge the remaining rows of B into the intermediate result:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 2) // AVX2-based merge of eight rows at a time
    {
      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                                (int*)row_C_vector_3);
      if (row_start_A + 8 >= row_end_A) // last merge operation, only the length is needed:
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                               B_size2,
                                                                               row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A == row_end_A - 1) // last row of B to consider, no write needed:
    {
      unsigned int row_index_B = A_col_buffer[row_start_A];
      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                        B_size2,
                                                                        row_C_vector_2);
    }
    else if (row_start_A + 1 < row_end_A) // at least two more rows of B left: merge them first
    {
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
      unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                                          B_size2,
                                                                                          row_C_vector_3);
      if (row_start_A + 2 == row_end_A) // last merge operation, only the length is needed:
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 2;
    }
    else // a single row of B left to merge:
    {
      unsigned int row_index_B = A_col_buffer[row_start_A];
      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                            B_size2,
                                                                            row_C_vector_2);
      ++row_start_A;
    }

    std::swap(row_C_vector_1, row_C_vector_2);
  }

  return row_C_len;
}
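
// Usage sketch (illustrative only; A_size1, A_row_buffer and C_row_lengths are made-up names):
// the symbolic pass calls this routine once per row of A. Since a row of C cannot contain more
// than B_size2 entries, scratch buffers of length B_size2 are sufficient:
//
//   std::vector<unsigned int> scratch1(B_size2), scratch2(B_size2), scratch3(B_size2);
//   for (unsigned int i = 0; i < A_size1; ++i)
//     C_row_lengths[i] = row_C_scan_symbolic_vector(A_row_buffer[i], A_row_buffer[i+1], A_col_buffer,
//                                                   B_row_buffer, B_col_buffer, B_size2,
//                                                   &scratch1[0], &scratch2[0], &scratch3[0]);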
 
/** @brief Merges up to IndexNum rows from B into the result buffer, accumulating the values.
*
* Since the input buffer needs to be merged in as well, this routine operates on an index front of length (IndexNum + 1).
**/
template<unsigned int IndexNum, typename NumericT>
unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
                                         unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                                         unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
                                         unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];
  NumericT const *value_front_start[IndexNum+1];
  NumericT values_A[IndexNum+1];

  // set up pointers for loading the indices and values:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    unsigned int row_B = *row_indices_B;

    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
    value_front_start[i] = B_elements   + B_row_buffer[row_B];
    values_A[i]          = val_A[i];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;
  value_front_start[IndexNum] = row_C_vector_input_values;
  values_A[IndexNum]          = NumericT(1); // the partial-result buffer is merged in with weight 1

  // load the first index of each row (B_size2 acts as sentinel for empty rows):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // all rows exhausted, we're done
      break;

    // advance the index front wherever it equals the minimum index, accumulating the value:
    NumericT row_C_value = 0;
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

        row_C_value += values_A[i] * *value_front_start[i];
        value_front_start[i] += 1;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    *row_C_vector_output_values = row_C_value;
    ++row_C_vector_output_values;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
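
// Note on the accumulation above: whenever several front entries share the minimum column index j,
// their contributions are summed into a single output value, i.e. the emitted value is
// sum_i values_A[i] * B(row_i, j) over all fronts currently sitting on j. The partial-result buffer
// participates with weight values_A[IndexNum] = 1, so previously accumulated values pass through unscaled.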
 
#ifdef VIENNACL_WITH_AVX2
/** @brief Merges up to eight rows of B (indices and values) into the output buffers using AVX2 gathers (numeric stage, double precision). */
inline
unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
                                            int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
                                            int B_size2,
                                            int *row_C_vector_output, double *row_C_vector_output_values)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  // sign bit set exactly in lanes i with i < (number of rows to merge), i.e. lanes holding a valid row:
  __m256i avx_load_mask  = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(static_cast<int>(row_indices_B_end - row_indices_B_begin)));
  __m256i avx_load_mask2 = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);

  // load the values from A. The permutation spreads the 32-bit lane masks such that their sign
  // bits land in the four 64-bit lanes of the mask expected by the double-precision gather:
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                      values_A,
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 0),
                                                      _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                      values_A,
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 1),
                                                      _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);

          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
          avx_load_mask2  = avx_load_mask;
  __m256i avx_index_front = avx_all_bsize2; // B_size2 acts as sentinel for exhausted lanes
  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  // load the front values from B:
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                          B_elements,
                                                          _mm256_extractf128_si256(avx_row_start, 0),
                                                          _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                          B_elements,
                                                          _mm256_extractf128_si256(avx_row_start, 1),
                                                          _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front (same three-step reduction as in the symbolic kernel):
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // lower four lanes compared against upper four lanes

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));  // 78 == 0b01001110: swap 64-bit pairs within each 128-bit half
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 177 == 0b10110001: swap neighboring 32-bit lanes
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);     // now all eight lanes hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for termination:
    if (min_index_in_front == B_size2)
      break;

    // accumulate the products from all lanes currently sitting on the minimum index:
    double value = 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0]  * ((double*)&avx_value_A_low)[0]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1]  * ((double*)&avx_value_A_low)[1]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2]  * ((double*)&avx_value_A_low)[2]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3]  * ((double*)&avx_value_A_low)[3]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
    *row_C_vector_output_values = value;
    ++row_C_vector_output_values;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance the front in all lanes sitting on the minimum index:
    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set lanes that hit the minimum to the sentinel B_size2:
    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start in those lanes:
    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask);
    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new column indices in lanes where entries are still available:
    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_load_mask2  = avx_load_mask;
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

    // load the new values from B in those lanes:
    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_low  = _mm256_mask_i32gather_pd(avx_value_front_low,
                                                    B_elements,
                                                    _mm256_extractf128_si256(avx_row_start, 0),
                                                    _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high,
                                                    B_elements,
                                                    _mm256_extractf128_si256(avx_row_start, 1),
                                                    _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif
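
// Note: this AVX2 kernel is hard-wired to double precision because the value gathers use
// _mm256_mask_i32gather_pd. The generic caller below passes its NumericT buffers straight through,
// so the AVX2 fast path is effectively available for NumericT == double only.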
 
template<typename NumericT>
unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
                                         unsigned int termination_index,
                                         unsigned int *output_index_begin, NumericT *output_values_begin)
{
  unsigned int *output_ptr = output_index_begin;

  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
  while (1)
  {
    unsigned int min_index = std::min(index1, index2);
    NumericT value = 0;

    if (min_index == termination_index)
      break;

    if (min_index == index1)
    {
      ++input1_index_begin;
      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;

      value += factor1 * *input1_values_begin;
      ++input1_values_begin;
    }

    if (min_index == index2)
    {
      ++input2_index_begin;
      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

      value += factor2 * *input2_values_begin;
      ++input2_values_begin;
    }

    // write current entry:
    *output_ptr = min_index;
    ++output_ptr;
    *output_values_begin = value;
    ++output_values_begin;
  }

  return static_cast<unsigned int>(output_ptr - output_index_begin);
}
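
// Example of the merge-and-accumulate semantics: with
//   input1: indices {1, 3}, values {10, 20}, factor1 = 2,
//   input2: indices {3, 4}, values {1, 1},   factor2 = 5,
// and termination_index = 8, the routine emits indices {1, 3, 4} with values
// {2*10, 2*20 + 5*1, 5*1} = {20, 45, 5} and returns 3.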
 
/** @brief Computes one row of C = A * B (numeric stage): writes the column indices and values of the row, whose length was determined by the symbolic stage. */
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  // trivial case: empty row in A:
  if (row_start_A == row_end_A)
    return;

  // trivial case: row length 1 in A, so the row of C is a scaled copy of the respective row of B:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    unsigned int B_end = B_row_buffer[A_col + 1];
    NumericT A_value   = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements = A_value * B_elements[j];
    }
    return;
  }

  // merge an initial batch of rows of B:
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // merge the two rows directly into C:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // AVX2-based merge of eight rows into the temporary buffer:
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // merge two rows into the temporary buffer:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }

  // process the remaining rows of B:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9) // AVX2-based merge of eight more rows:
    {
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row of B: merge directly into C:
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                  row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                  B_size2,
                                  C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A) // at least three more rows left: merge two rows of B first
    {
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len,  row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else // exactly two rows left: merge a single row of B now, the final row is then merged directly into C above
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }

    std::swap(row_C_vector_1,        row_C_vector_2);
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}
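
// Usage sketch (illustrative only; names outside this file are made up): the numeric stage runs
// after the symbolic stage has fixed C's row buffer, reusing three index and three value scratch buffers:
//
//   for (unsigned int i = 0; i < A_size1; ++i)
//     row_C_scan_numeric_vector(A_row_buffer[i], A_row_buffer[i+1], A_col_buffer, A_elements,
//                               B_row_buffer, B_col_buffer, B_elements, B_size2,
//                               C_row_buffer[i], C_row_buffer[i+1], C_col_buffer, C_elements,
//                               &scratch1[0], &scratch1_values[0],
//                               &scratch2[0], &scratch2_values[0],
//                               &scratch3[0], &scratch3_values[0]);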
 
} // namespace host_based
} // namespace linalg
} // namespace viennacl

#endif