/home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/csr_matop.c: 380 - 560
--------------------------------------------------------------------------------

380:   return idx%dim1*dim2 + idx/dim1;
[...]
463: #pragma omp parallel
464: #endif
465:    {
466:    HYPRE_Int num_threads = hypre_NumActiveThreads();
467:    HYPRE_Int my_thread_num = hypre_GetThreadNum();
468: 
469:    HYPRE_Int iBegin = hypre_CSRMatrixGetLoadBalancedPartitionBegin(A);
470:    HYPRE_Int iEnd = hypre_CSRMatrixGetLoadBalancedPartitionEnd(A);
471:    hypre_assert(iBegin <= iEnd);
472:    hypre_assert(iBegin >= 0 && iBegin <= num_rowsA);
473:    hypre_assert(iEnd >= 0 && iEnd <= num_rowsA);
474: 
475:    HYPRE_Int i, j;
476:    memset(bucket + my_thread_num*num_colsA, 0, sizeof(HYPRE_Int)*num_colsA);
[...]
483:    for (j = A_i[iBegin]; j < A_i[iEnd]; ++j) {
484:      HYPRE_Int idx = A_j[j];
485:      bucket[my_thread_num*num_colsA + idx]++;
[...]
496:    for (i = my_thread_num*num_colsA + 1; i < (my_thread_num + 1)*num_colsA; ++i) {
497:      HYPRE_Int transpose_i = transpose_idx(i, num_threads, num_colsA);
498:      HYPRE_Int transpose_i_minus_1 = transpose_idx(i - 1, num_threads, num_colsA);
499: 
500:      bucket[transpose_i] += bucket[transpose_i_minus_1];
501:    }
502: 
503: #ifdef HYPRE_USING_OPENMP
504: #pragma omp barrier
505: #pragma omp master
506: #endif
507:    {
508:      for (i = 1; i < num_threads; ++i) {
509:        HYPRE_Int j0 = num_colsA*i - 1, j1 = num_colsA*(i + 1) - 1;
510:        HYPRE_Int transpose_j0 = transpose_idx(j0, num_threads, num_colsA);
511:        HYPRE_Int transpose_j1 = transpose_idx(j1, num_threads, num_colsA);
512: 
513:        bucket[transpose_j1] += bucket[transpose_j0];
[...]
520:    if (my_thread_num > 0) {
521:      HYPRE_Int transpose_i0 = transpose_idx(num_colsA*my_thread_num - 1, num_threads, num_colsA);
522:      HYPRE_Int offset = bucket[transpose_i0];
523: 
524:      for (i = my_thread_num*num_colsA; i < (my_thread_num + 1)*num_colsA - 1; ++i) {
525:        HYPRE_Int transpose_i = transpose_idx(i, num_threads, num_colsA);
526: 
527:        bucket[transpose_i] += offset;
[...]
539:    if (data) {
540:       for (i = iEnd - 1; i >= iBegin; --i) {
541:         for (j = A_i[i + 1] - 1; j >= A_i[i]; --j) {
542:           HYPRE_Int idx = A_j[j];
543:           --bucket[my_thread_num*num_colsA + idx];
544: 
545:           HYPRE_Int offset = bucket[my_thread_num*num_colsA + idx];
546: 
547:           AT_data[offset] = A_data[j];
548:           AT_j[offset] = i;
549:         }
550:       }
551:    }
552:    else {
553:       for (i = iEnd - 1; i >= iBegin; --i) {
554:         for (j = A_i[i + 1] - 1; j >= A_i[i]; --j) {
555:           HYPRE_Int idx = A_j[j];
556:           --bucket[my_thread_num*num_colsA + idx];
557: 
558:           HYPRE_Int offset = bucket[my_thread_num*num_colsA + idx];
559: 
560:           AT_j[offset] = i;
