/home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_interp.c: 2726 - 3155
--------------------------------------------------------------------------------

2726: #pragma omp parallel private(i,my_thread_num,num_threads,max_coef,j,start_j,row_sum,scale,num_lost,now_checking,next_open,num_lost_offd,now_checking_offd,next_open_offd,start,stop,cnt_diag,cnt_offd,num_elmts,cnt)
2727: #endif
2728:    { 
2729:        my_thread_num = hypre_GetThreadNum();
2730:        num_threads = hypre_NumActiveThreads();
[...]
2740:        start = (n_fine/num_threads)*my_thread_num;
2741:        if (my_thread_num == num_threads-1)
[...]
2750:        if (trunc_factor > 0)
2751:        {
2752:           num_lost = 0;
2753:           num_lost_offd = 0;
2754:           
                /* Read/write cursors for the in-place compaction of this
                 * thread's row range: entries are read at now_checking{,_offd}
                 * and kept entries are written back at next_open{,_offd};
                 * the gap between the two cursors equals num_lost{,_offd}. */
2755:           next_open = P_diag_i[start];
2756:           now_checking = P_diag_i[start];
2757:           next_open_offd = P_offd_i[start];   /* fixed: stray ';;' (empty statement) removed */
2758:           now_checking_offd = P_offd_i[start];
2759: 
2760:           for (i = start; i < stop; i++)
2761:           {
                /* Drop threshold for row i: trunc_factor times the largest
                 * absolute entry of the row (diag and offd parts together). */
2762:             max_coef = 0;
2763:             for (j = P_diag_i[i]; j < P_diag_i[i+1]; j++)
2764:                max_coef = (max_coef < fabs(P_diag_data[j])) ? 
2765:                       fabs(P_diag_data[j]) : max_coef;
2766:             for (j = P_offd_i[i]; j < P_offd_i[i+1]; j++)
2767:                max_coef = (max_coef < fabs(P_offd_data[j])) ? 
2768:                       fabs(P_offd_data[j]) : max_coef;
2769:             max_coef *= trunc_factor;
2770: 
                /* Compact the diag part of row i in place.  start_j saves the
                 * original row start before P_diag_i[i] is shifted left by the
                 * entries dropped in earlier rows (num_lost accumulates across
                 * this thread's rows).  row_sum sums every original entry,
                 * scale only the kept ones, enabling the rescale below. */
2771:             start_j = P_diag_i[i];
2772:             if (num_lost) P_diag_i[i] -= num_lost;  
2773:             row_sum = 0;
2774:             scale = 0;
2775:             for (j = start_j; j < P_diag_i[i+1]; j++)
2776:             {
2777:                row_sum += P_diag_data[now_checking];
                   /* strictly below threshold => dropped; ties are kept */
2778:                if (fabs(P_diag_data[now_checking]) < max_coef)
2779:                {
2780:                   num_lost++;
2781:                   now_checking++;
2782:                }
2783:                else
2784:                {
2785:                   scale += P_diag_data[now_checking];
2786:                   P_diag_data[next_open] = P_diag_data[now_checking];
2787:                   P_diag_j[next_open] = P_diag_j[now_checking];
2788:                   now_checking++;
2789:                   next_open++;
2790:                }
2791:             }
2792: 
                /* Same in-place compaction for the offd part of row i. */
2793:             start_j = P_offd_i[i];
2794:             if (num_lost_offd) P_offd_i[i] -= num_lost_offd;
2795:             for (j = start_j; j < P_offd_i[i+1]; j++)
2796:             {
2797:                row_sum += P_offd_data[now_checking_offd];
2798:                if (fabs(P_offd_data[now_checking_offd]) < max_coef)
2799:                {
2800:                   num_lost_offd++;
2801:                   now_checking_offd++;
2802:                }
2803:                else
2804:                {
2805:                   scale += P_offd_data[now_checking_offd];
2806:                   P_offd_data[next_open_offd] = P_offd_data[now_checking_offd];
2807:                   P_offd_j[next_open_offd] = P_offd_j[now_checking_offd];
2808:                   now_checking_offd++;
2809:                   next_open_offd++;
2810:                }
2811:             }
2812:             /* normalize row of P */
2813: 
                /* Rescale the surviving entries so the row sum is preserved:
                 * each kept entry is multiplied by row_sum/scale (skipped when
                 * nothing was dropped, i.e. scale == row_sum, or when all kept
                 * entries sum to zero). */
2814:             if (scale != 0.)
2815:             {
2816:                if (scale != row_sum)
2817:                {
2818:                    scale = row_sum/scale;
2819:                    for (j = P_diag_i[i]; j < (P_diag_i[i+1]-num_lost); j++)
2820:                           P_diag_data[j] *= scale;
2821:                    for (j = P_offd_i[i]; j < (P_offd_i[i+1]-num_lost_offd); j++)
2822:                           P_offd_data[j] *= scale;
[...]
2828:           if(my_thread_num == 0)
2829:           {   max_num_threads[0] = num_threads; }
2830:           num_lost_per_thread[my_thread_num] = num_lost;
2831:           num_lost_offd_per_thread[my_thread_num] = num_lost_offd;
[...]
2840:        if (max_elmts > 0)
[...]
            /* Scan this thread's rows for the widest row (diag + offd entries
             * combined); P_mxnum sizes the aux arrays used for sorting below. */
2848:            for (i=start; i<stop; i++)
2849:            {
2850:               /* Note P_diag_i[stop] is the starting point for the next thread 
2851:                * in j and data, not the stop point for this thread */
2852:               last_index = P_diag_i[i+1];
2853:               last_index_offd = P_offd_i[i+1];
2854:               if(i == stop-1)
2855:               {  
                      /* the last row's end must discount entries already
                       * dropped by the trunc_factor pass above */
2856:                   last_index -= num_lost_per_thread[my_thread_num];   
2857:                   last_index_offd -= num_lost_offd_per_thread[my_thread_num]; 
2858:               }
2859:               cnt1 = last_index-P_diag_i[i] + last_index_offd-P_offd_i[i];
2860:               if (cnt1 > P_mxnum) P_mxnum = cnt1;
2861:            }
2862: 
2863:            /* Some rows exceed max_elmts, and require truncation.  Essentially,
2864:             * each thread truncates and compresses its range of rows locally. */  
2865:            if (P_mxnum > max_elmts)
[...]
                /* Scratch arrays holding one full row (diag + offd merged);
                 * offd columns are disambiguated by offsetting them by num_cols. */
2872:                P_aux_j = hypre_CTAlloc(HYPRE_Int, P_mxnum);
2873:                P_aux_data = hypre_CTAlloc(HYPRE_Real, P_mxnum);
                /* Write cursors for compacting this thread's rows in place. */
2874:                cnt_diag = P_diag_i[start];
2875:                cnt_offd = P_offd_i[start];
2876:                
2877:                for (i = start; i < stop; i++)
2878:                {
2879:                 /* Note P_diag_i[stop] is the starting point for the next thread 
2880:                  * in j and data, not the stop point for this thread */
2881:                 last_index = P_diag_i[i+1];
2882:                 last_index_offd = P_offd_i[i+1];
2883:                 if(i == stop-1)
2884:                 {  
2885:                     last_index -= num_lost_per_thread[my_thread_num];   
2886:                     last_index_offd -= num_lost_offd_per_thread[my_thread_num]; 
2887:                 }  
2888: 
2889:                 row_sum = 0;
2890:                 num_elmts = last_index-P_diag_i[i] + last_index_offd-P_offd_i[i];
2891:                 if (max_elmts < num_elmts)
2892:                 {
2893:                   /* copy both diagonal and off-diag parts of row i to _aux_ arrays */
2894:                   cnt = 0;
2895:                   for (j = P_diag_i[i]; j < last_index; j++)
2896:                   {
2897:                      P_aux_j[cnt] = P_diag_j[j];
2898:                      P_aux_data[cnt++] = P_diag_data[j];
2899:                      row_sum += P_diag_data[j];
2900:                   }
                      /* provisionally count the whole row as lost; the kept
                       * entries are subtracted back at 2933/2934 below */
2901:                   num_lost += cnt;
2902:                   cnt1 = cnt;
2903:                   for (j = P_offd_i[i]; j < last_index_offd; j++)
2904:                   {
2905:                      P_aux_j[cnt] = P_offd_j[j]+num_cols;
2906:                      P_aux_data[cnt++] = P_offd_data[j];
2907:                      row_sum += P_offd_data[j];
2908:                   }
2909:                   num_lost_offd += cnt-cnt1;
2910:                   
2911:                   /* sort data */
                      /* hypre_qsort2abs: sorts by descending |value| so the
                       * first max_elmts entries are the largest-magnitude ones
                       * -- assumption from usage; confirm against its def. */
2912:                   hypre_qsort2abs(P_aux_j,P_aux_data,0,cnt-1);
2913:                   scale = 0;
2914:                   if (i > start)
2915:                   {
                         /* commit the compacted row start for row i */
2916:                      P_diag_i[i] = cnt_diag;
2917:                      P_offd_i[i] = cnt_offd;
2918:                   }
                      /* keep only the max_elmts largest entries, routing each
                       * back to diag or offd by the num_cols offset */
2919:                   for (j = 0; j < max_elmts; j++)
2920:                   {
2921:                      scale += P_aux_data[j];
2922:                      if (P_aux_j[j] < num_cols)
2923:                      {
2924:                         P_diag_j[cnt_diag] = P_aux_j[j];
2925:                         P_diag_data[cnt_diag++] = P_aux_data[j];
2926:                      }
2927:                      else
2928:                      {
2929:                         P_offd_j[cnt_offd] = P_aux_j[j]-num_cols;
2930:                         P_offd_data[cnt_offd++] = P_aux_data[j];
2931:                      }
2932:                   }
                      /* subtract the kept counts from the provisional "all
                       * lost" tallies added at 2901/2909 */
2933:                   num_lost -= cnt_diag-P_diag_i[i];
2934:                   num_lost_offd -= cnt_offd-P_offd_i[i];
2935: 
2936:                   /* normalize row of P */
                      /* rescale kept entries to preserve the original row sum */
2937:                   if (scale != 0.)
2938:                   {
2939:                      if (scale != row_sum)
2940:                      {
2941:                         scale = row_sum/scale;
2942:                         for (j = P_diag_i[i]; j < cnt_diag; j++)
2943:                                P_diag_data[j] *= scale;
2944:                         for (j = P_offd_i[i]; j < cnt_offd; j++)
2945:                                P_offd_data[j] *= scale;
[...]
                  /* Row i fits within max_elmts: no truncation, but earlier
                   * rows may have shrunk, so shift its diag entries left to
                   * stay contiguous with the compacted data. */
2955:                   if (P_diag_i[i] != cnt_diag)
2956:                   {
2957:                      start_j = P_diag_i[i];
2958:                      P_diag_i[i] = cnt_diag;
2959:                      for (j = start_j; j < last_index; j++)
2960:                      {
2961:                         P_diag_j[cnt_diag] = P_diag_j[j];
2962:                         P_diag_data[cnt_diag++] = P_diag_data[j];
[...]
                  /* Same left-shift compaction for the offd part of an
                   * untruncated row. */
2968:                   if (P_offd_i[i] != cnt_offd)
2969:                   {
2970:                      start_j = P_offd_i[i];
2971:                      P_offd_i[i] = cnt_offd;
2972:                      for (j = start_j; j < last_index_offd; j++)
2973:                      {
2974:                         P_offd_j[cnt_offd] = P_offd_j[j];
2975:                         P_offd_data[cnt_offd++] = P_offd_data[j];
[...]
2983:                num_lost_per_thread[my_thread_num] += num_lost;
2984:                num_lost_offd_per_thread[my_thread_num] += num_lost_offd;
2985:                hypre_TFree(P_aux_j);
2986:                hypre_TFree(P_aux_data);
[...]
        /* Single-thread reduction: total entries dropped across all threads,
         * separately for the diag and offd parts. */
2996:        if(my_thread_num == 0)
2997:        {
2998:            num_lost_global = 0;
2999:            num_lost_global_offd = 0;
3000:            for(i = 0; i < max_num_threads[0]; i++)
3001:            {
3002:                num_lost_global += num_lost_per_thread[i]; 
3003:                num_lost_global_offd += num_lost_offd_per_thread[i]; 
[...]
3013:        if (num_lost_global)
[...]
          /* Thread 0: compute the compressed diag nnz (old nnz minus every
           * thread's losses), build the inclusive prefix sum of per-thread
           * losses (cum_lost_per_thread), and allocate the new j/data arrays. */
3023:          if(my_thread_num == 0)
3024:          {
3025:              P_diag_size = P_diag_i[n_fine];
3026: 
3027:              for(i = 0; i < max_num_threads[0]; i++)
3028:              {   
3029:                  P_diag_size -= num_lost_per_thread[i]; 
3030:                  if(i > 0)
3031:                  {   cum_lost_per_thread[i] = num_lost_per_thread[i] + cum_lost_per_thread[i-1]; }
3032:                  else
3033:                  {   cum_lost_per_thread[i] = num_lost_per_thread[i]; }
3034:              }
3035:          
3036:              P_diag_j_new = hypre_CTAlloc(HYPRE_Int,P_diag_size);
3037:              P_diag_data_new = hypre_CTAlloc(HYPRE_Real,P_diag_size);
[...]
3044:           if(my_thread_num == 0)
[...]
              /* Non-zero threads: destination offset is this thread's old row
               * start shifted left by everything prior threads dropped. */
3050:              next_open = P_diag_i[start] - cum_lost_per_thread[my_thread_num-1]; 
3051:          }
3052:          /* copy the j and data arrays over */
              /* upper bound excludes this thread's own lost entries, which sit
               * as a gap at the end of its compacted range */
3053:          for(i = P_diag_i[start]; i < P_diag_i[stop] - num_lost_per_thread[my_thread_num]; i++)
3054:          {
3055:              P_diag_j_new[next_open] = P_diag_j[i];
3056:              P_diag_data_new[next_open] = P_diag_data[i];
[...]
          /* Shift this thread's row pointers left by the cumulative losses of
           * all prior threads so P_diag_i indexes the new compressed arrays. */
3065:          if(my_thread_num > 0)
3066:          {
3067:              for(i=start; i<stop; i++)
3068:              {
3069:                  P_diag_i[i] -= cum_lost_per_thread[my_thread_num-1];
3070:              }
3071:          }
3072:          
          /* Thread 0 installs the compressed arrays into the CSR matrix and
           * frees the old ones (single-threaded: structural mutation). */
3073:          if(my_thread_num == 0)
3074:          {
3075:              /* Set last entry */
3076:              P_diag_i[n_fine] = P_diag_size ;
3077:              
3078:              hypre_TFree(P_diag_j);
3079:              hypre_TFree(P_diag_data);
3080:              hypre_CSRMatrixJ(P_diag) = P_diag_j_new;
3081:              hypre_CSRMatrixData(P_diag) = P_diag_data_new;
3082:              hypre_CSRMatrixNumNonzeros(P_diag) = P_diag_size;
[...]
        /* Off-diagonal analogue of the diag compression: cum_lost_per_thread
         * is reused here, now holding the offd loss prefix sums. */
3093:        if (num_lost_global_offd)
3094:        {
3095:           /* Repeat process for off-diagonal */
3096:           if(my_thread_num == 0)
3097:           {
3098:               P_offd_size = P_offd_i[n_fine];
3099:               for(i = 0; i < max_num_threads[0]; i++)
3100:               {   
3101:                   P_offd_size -= num_lost_offd_per_thread[i]; 
3102:                   if(i > 0)
3103:                   {   cum_lost_per_thread[i] = num_lost_offd_per_thread[i] + cum_lost_per_thread[i-1]; }
3104:                   else
3105:                   {   cum_lost_per_thread[i] = num_lost_offd_per_thread[i]; }
3106:               }
3107:               
3108:               P_offd_j_new = hypre_CTAlloc(HYPRE_Int,P_offd_size);
3109:               P_offd_data_new = hypre_CTAlloc(HYPRE_Real,P_offd_size);
[...]
3116:           if(my_thread_num == 0)
[...]
              /* Non-zero threads: shift destination by prior threads' offd losses. */
3122:              next_open = P_offd_i[start] - cum_lost_per_thread[my_thread_num-1]; 
3123:          }
3124: 
3125:          /* copy the j and data arrays over */
3126:          for(i = P_offd_i[start]; i < P_offd_i[stop] - num_lost_offd_per_thread[my_thread_num]; i++)
3127:          {
3128:              P_offd_j_new[next_open] = P_offd_j[i];
3129:              P_offd_data_new[next_open] = P_offd_data[i];
[...]
          /* Shift offd row pointers by prior threads' cumulative offd losses. */
3138:          if(my_thread_num > 0)
3139:          {
3140:              for(i=start; i<stop; i++)
3141:              {
3142:                  P_offd_i[i] -= cum_lost_per_thread[my_thread_num-1];
3143:              }
3144:          }
3145: 
          /* Thread 0 installs the compressed offd arrays into the matrix. */
3146:          if(my_thread_num == 0)
3147:          {
3148:              /* Set last entry */
3149:              P_offd_i[n_fine] = P_offd_size ;
3150:              
3151:              hypre_TFree(P_offd_j);
3152:              hypre_TFree(P_offd_data);
3153:              hypre_CSRMatrixJ(P_offd) = P_offd_j_new;
3154:              hypre_CSRMatrixData(P_offd) = P_offd_data_new;
3155:              hypre_CSRMatrixNumNonzeros(P_offd) = P_offd_size;
