_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 8 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 8 optimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - VCVTQQ2PD (INT64 to FP64, SIMD): 8 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This loop is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 512 bytes.\nThe binary loop is storing 512 bytes.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 19\nnb uops            : 18\nloop length        : 142\nused x86 registers : 4\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 8\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 4.50 cycles\nfront end            : 4.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 4.00 | 1.00 | 5.33 | 5.33 | 8.00 | 4.00 | 1.00 | 5.33\ncycles | 4.00 | 1.00 | 5.33 | 5.33 | 8.00 | 4.00 | 1.00 | 5.33\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 8.17\nStall cycles    : 3.36\nRS full (events): 0.22\nSB full (events): 4.72\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 4.50\nDispatch  : 8.00\nData deps.: 1.00\nOverall L1: 8.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.00 cycles. At this rate:\n - 50% of peak load performance is reached (64.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 100% of peak store performance is reached (64.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 42d770\n\nInstruction                                             | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n---------------------------------------------------------------------------------------------------------------------------------------------------\nVCVTQQ2PD (%R8,%RCX,1),%ZMM7                            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x40(%R8,%RCX,1),%ZMM8                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x80(%R8,%RCX,1),%ZMM9                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0xc0(%R8,%RCX,1),%ZMM10                       | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x100(%R8,%RCX,1),%ZMM11                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x140(%R8,%RCX,1),%ZMM12                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x180(%R8,%RCX,1),%ZMM13                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x1c0(%R8,%RCX,1),%ZMM14                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVMOVUPD %ZMM7,(%RSI,%RCX,1)                             | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM8,0x40(%RSI,%RCX,1)                         | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM9,0x80(%RSI,%RCX,1)                         | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM10,0xc0(%RSI,%RCX,1)                        | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM11,0x100(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM12,0x140(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM13,0x180(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM14,0x1c0(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nADD $0x200,%RCX                                         | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nCMP %R12,%RCX                                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 42d770 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x140> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = " - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n  * use the 'restrict' C99 keyword\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by writing data to caches/RAM (the store unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.00 to 5.33 cycles (1.50x speedup).\n",
        },
      },
      potential = {
      },
    },
  },
  AVG = {
      hint = {
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 8 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 8 optimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - VCVTQQ2PD (INT64 to FP64, SIMD): 8 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This loop is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 512 bytes.\nThe binary loop is storing 512 bytes.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 19\nnb uops            : 18\nloop length        : 142\nused x86 registers : 4\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 8\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 4.50 cycles\nfront end            : 4.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 4.00 | 1.00 | 5.33 | 5.33 | 8.00 | 4.00 | 1.00 | 5.33\ncycles | 4.00 | 1.00 | 5.33 | 5.33 | 8.00 | 4.00 | 1.00 | 5.33\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 8.17\nStall cycles    : 3.36\nRS full (events): 0.22\nSB full (events): 4.72\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 4.50\nDispatch  : 8.00\nData deps.: 1.00\nOverall L1: 8.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.00 cycles. At this rate:\n - 50% of peak load performance is reached (64.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 100% of peak store performance is reached (64.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 42d770\n\nInstruction                                             | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n---------------------------------------------------------------------------------------------------------------------------------------------------\nVCVTQQ2PD (%R8,%RCX,1),%ZMM7                            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x40(%R8,%RCX,1),%ZMM8                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x80(%R8,%RCX,1),%ZMM9                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0xc0(%R8,%RCX,1),%ZMM10                       | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x100(%R8,%RCX,1),%ZMM11                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x140(%R8,%RCX,1),%ZMM12                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x180(%R8,%RCX,1),%ZMM13                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVCVTQQ2PD 0x1c0(%R8,%RCX,1),%ZMM14                      | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 7       | 0.50\nVMOVUPD %ZMM7,(%RSI,%RCX,1)                             | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM8,0x40(%RSI,%RCX,1)                         | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM9,0x80(%RSI,%RCX,1)                         | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM10,0xc0(%RSI,%RCX,1)                        | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM11,0x100(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM12,0x140(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM13,0x180(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD %ZMM14,0x1c0(%RSI,%RCX,1)                       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nADD $0x200,%RCX                                         | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nCMP %R12,%RCX                                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 42d770 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x140> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = " - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n  * use the 'restrict' C99 keyword\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by writing data to caches/RAM (the store unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.00 to 5.33 cycles (1.50x speedup).\n",
        },
      },
      potential = {
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_coarsen.c:2142.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
