_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "Calling (and then returning from) a function prevents many compiler optimizations (like vectorization), breaks control flow (which reduces pipeline performance) and executes extra instructions to save/restore the registers used inside it, which is very expensive (dozens of cycles). Consider to inline small functions.\n - omp_get_num_threads@plt: 1 occurrences\n - omp_get_thread_num@plt: 1 occurrences\n",
          title = "CALL instructions",
          txt = "Detected function call instructions.\n",
        },
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - IDIV: 1 occurrences\n - VZEROUPPER: 2 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 7 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 7 optimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - CQTO: 1 occurrences\n - VCVTQQ2PD (INT64 to FP64, SIMD): 8 occurrences\n - VCVTSI2SD (INT32/64 to FP64, scalar): 3 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This function is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your function (in the source code) and the binary function",
          txt = "The binary function does not contain any FP arithmetical operations.\nThe binary function is loading 528 bytes.\nThe binary function is storing 504 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 126\nnb uops            : 190\nloop length        : 514\nused x86 registers : 13\nused mmx registers : 0\nused xmm registers : 4\nused ymm registers : 1\nused zmm registers : 7\nnb stack references: 1\n",
        },
        {
          title = "Front-end",
          txt = "MACRO FUSION NOT POSSIBLE\nFIT IN UOP CACHE\nmicro-operation queue: 48.25 cycles\nfront end            : 48.25 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1    | P2    | P3    | P4    | P5    | P6    | P7\n----------------------------------------------------------------------\nuops   | 35.50 | 35.50 | 12.67 | 12.67 | 16.00 | 35.50 | 35.50 | 12.67\ncycles | 35.50 | 35.50 | 12.67 | 12.67 | 16.00 | 35.50 | 35.50 | 12.67\n\nCycles executing div or sqrt instructions: 24.00-90.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles: 33.22-90.41\nStall cycles: 0.00-57.11\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 48.25\nDispatch  : 35.50\nDIV/SQRT  : 24.00-90.00\nOverall L1: 48.25-90.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 27%\nload   : 72%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 0%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 27%\nFP\nall     : 75%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 72%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 38%\nload    : 72%\nstore   : 72%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 0%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 0%\nother   : 30%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 30%\nload   : 71%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 12%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 30%\nFP\nall     : 67%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 71%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 25%\nINT+FP\nall     : 39%\nload    : 71%\nstore   : 71%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 12%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 12%\nother   : 31%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each call to the function takes 90.00 cycles. At this rate:\n - 4% of peak load performance is reached (5.87 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 8% of peak store performance is reached (5.60 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the function is: 42d630\n\nInstruction                                             | Nb FU | P0    | P1    | P2   | P3   | P4 | P5    | P6    | P7   | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nPUSH %RBP                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RSP,%RBP                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %R12                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RDI,%R12                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %RBX                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nAND $-0x40,%RSP                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCALL 40f0b0 <omp_get_num_threads@plt>                   | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOV %EAX,%EBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCALL 40f1f0 <omp_get_thread_num@plt>                    | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOVSXD %EBX,%RSI                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOVSXD %EAX,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R12),%RAX                                         | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCQTO                                                    | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nIDIV %RSI                                               | 57    | 14.25 | 14.25 | 0    | 0    | 0  | 14.25 | 14.25 | 0    | 42-95   | 24-90\nCMP %RDX,%RCX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJL 42d8a0 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x270>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nIMUL %RAX,%RCX                                          | 1     | 0     | 1     | 0    | 0    | 0  | 0     | 0     | 0    | 3       | 1\nADD %RCX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA (%RAX,%RDX,1),%R10                                  | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP %R10,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJGE 42d885 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x255> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nLEA -0x1(%RAX),%RDI                                     | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nMOV 0x10(%R12),%R9                                      | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV 0x8(%R12),%R11                                      | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV %RDX,%RBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCMP $0x6,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 42d8b0 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x280> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV %RAX,%R12                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nLEA (,%RDX,8),%RSI                                      | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nXOR %ECX,%ECX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nSHR $0x3,%R12                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA (%R9,%RSI,1),%R8                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nADD %R11,%RSI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSAL $0x6,%R12                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA -0x40(%R12),%RDI                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nSHR $0x6,%RDI                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nINC %RDI                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nAND $0x7,%EDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d770 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x140>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x1,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d755 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x125>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x2,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d743 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x113>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x3,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d731 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x101>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x4,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d71f <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0xef>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x5,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d70d <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0xdd>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x6,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d6fb <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0xcb>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVCVTQQ2PD (%R8),%ZMM0                                   | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nMOV $0x40,%ECX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD %ZMM0,(%RSI)                                    | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nVCVTQQ2PD (%R8,%RCX,1),%ZMM1                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM1,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM2                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM2,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM3                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM3,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM4                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM4,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM5                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM5,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM6                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM6,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R12,%RCX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d7fe <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x1ce>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV %RAX,%R8                                            | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nAND $-0x8,%R8                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nADD %R8,%RDX                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R8,%RAX                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nSUB %R8,%RAX                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA -0x1(%RAX),%RSI                                     | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP $0x2,%RSI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 42d83c <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x20c> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nADD %RBX,%R8                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV %RAX,%RBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nAND $-0x4,%RBX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R9,%R8,8),%YMM15                            | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 7       | 0.50\nVMOVUPD %YMM15,(%R11,%R8,8)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD %RBX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nTEST $0x3,%AL                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVXORPS %XMM0,%XMM0,%XMM0                                | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nLEA 0x1(%RDX),%R8                                       | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA (,%RDX,8),%RAX                                      | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nVCVTSI2SDQ (%R9,%RDX,8),%XMM0,%XMM1                     | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 6       | 0.50\nVMOVSD %XMM1,(%R11,%RDX,8)                              | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nCMP %R8,%R10                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVCVTSI2SDQ 0x8(%R9,%RAX,1),%XMM0,%XMM2                  | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 6       | 0.50\nADD $0x2,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVSD %XMM2,0x8(%R11,%RAX,1)                           | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nCMP %RDX,%R10                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVCVTSI2SDQ 0x10(%R9,%RAX,1),%XMM0,%XMM3                 | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 6       | 0.50\nVMOVSD %XMM3,0x10(%R11,%RAX,1)                          | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nVZEROUPPER                                              | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nLEA -0x10(%RBP),%RSP                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nPOP %RBX                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                     | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nXCHG %AX,%AX                                            | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVZEROUPPER                                              | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nLEA -0x10(%RBP),%RSP                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nPOP %RBX                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                     | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nNOPL (%RAX)                                             | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nINC %RAX                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nXOR %EDX,%EDX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 42d662 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x32>  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW (%RAX,%RAX,1)                                      | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXOR %R8D,%R8D                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 42d811 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x1e1> | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPL (%RAX,%RAX,1)                                      | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\n",
        },
      },
      header = {
        "Warnings:\nDetected a function call instruction: ignoring called function instructions.\nRerun with --follow-calls=append to include them to analysis  or with --follow-calls=inline to simulate inlining.",
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          title = "Code clean check",
          txt = "Detected a slowdown caused by scalar integer instructions (typically used for address computation).\nBy removing them, you can lower the cost of an iteration from 90.00 to 14.00 cycles (6.43x speedup).",
        },
        {
          workaround = " - Try another compiler or update/tune your current one\n - Make array accesses unit-stride:\n  * If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "38% of SSE/AVX instructions are used in vector version (process two or more data elements in vector registers):\n - 72% of SSE/AVX loads are used in vector version.\n - 72% of SSE/AVX stores are used in vector version.\n - 0% of SSE/AVX addition or subtraction instructions are used in vector version.\n - 0% of SSE/AVX divide and square root instructions are used in vector version.\n - 30% of SSE/AVX instructions that are not load, store, addition, subtraction nor multiply instructions are used in vector version.\nSince your execution units are vector units, only a fully vectorized function can use their full power.\n",
          title = "Vectorization",
          txt = "Your function is poorly vectorized.\nOnly 39% of vector register length is used (average across all SSE/AVX instructions).\nBy fully vectorizing your function, you can lower the cost of an iteration from 90.00 to 45.00 cycles (2.00x speedup).",
        },
        {
          workaround = "Reduce the number of division or square root instructions:\n - If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 90.00 to 48.25 cycles (1.87x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 90.00 to 34.25 cycles (2.63x speedup).",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "Calling (and then returning from) a function prevents many compiler optimizations (like vectorization), breaks control flow (which reduces pipeline performance) and executes extra instructions to save/restore the registers used inside it, which is very expensive (dozens of cycles). Consider to inline small functions.\n - omp_get_num_threads@plt: 1 occurrences\n - omp_get_thread_num@plt: 1 occurrences\n",
          title = "CALL instructions",
          txt = "Detected function call instructions.\n",
        },
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - IDIV: 1 occurrences\n - VZEROUPPER: 2 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 7 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 7 optimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - CQTO: 1 occurrences\n - VCVTQQ2PD (INT64 to FP64, SIMD): 8 occurrences\n - VCVTSI2SD (INT32/64 to FP64, scalar): 3 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This function is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your function (in the source code) and the binary function",
          txt = "The binary function does not contain any FP arithmetical operations.\nThe binary function is loading 528 bytes.\nThe binary function is storing 504 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 126\nnb uops            : 190\nloop length        : 514\nused x86 registers : 13\nused mmx registers : 0\nused xmm registers : 4\nused ymm registers : 1\nused zmm registers : 7\nnb stack references: 1\n",
        },
        {
          title = "Front-end",
          txt = "MACRO FUSION NOT POSSIBLE\nFIT IN UOP CACHE\nmicro-operation queue: 48.25 cycles\nfront end            : 48.25 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1    | P2    | P3    | P4    | P5    | P6    | P7\n----------------------------------------------------------------------\nuops   | 35.50 | 35.50 | 12.67 | 12.67 | 16.00 | 35.50 | 35.50 | 12.67\ncycles | 35.50 | 35.50 | 12.67 | 12.67 | 16.00 | 35.50 | 35.50 | 12.67\n\nCycles executing div or sqrt instructions: 24.00-90.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles: 33.22-90.41\nStall cycles: 0.00-57.11\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 48.25\nDispatch  : 35.50\nDIV/SQRT  : 24.00-90.00\nOverall L1: 48.25-90.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 27%\nload   : 72%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 0%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 27%\nFP\nall     : 75%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 72%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 38%\nload    : 72%\nstore   : 72%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 0%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 0%\nother   : 30%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 30%\nload   : 71%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 12%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 30%\nFP\nall     : 67%\nload    : NA (no load vectorizable/vectorized instructions)\nstore   : 71%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 25%\nINT+FP\nall     : 39%\nload    : 71%\nstore   : 71%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 12%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 12%\nother   : 31%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each call to the function takes 90.00 cycles. At this rate:\n - 4% of peak load performance is reached (5.87 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 8% of peak store performance is reached (5.60 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the function is: 42d630\n\nInstruction                                             | Nb FU | P0    | P1    | P2   | P3   | P4 | P5    | P6    | P7   | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nPUSH %RBP                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RSP,%RBP                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %R12                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RDI,%R12                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %RBX                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nAND $-0x40,%RSP                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCALL 40f0b0 <omp_get_num_threads@plt>                   | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOV %EAX,%EBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCALL 40f1f0 <omp_get_thread_num@plt>                    | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOVSXD %EBX,%RSI                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOVSXD %EAX,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R12),%RAX                                         | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCQTO                                                    | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nIDIV %RSI                                               | 57    | 14.25 | 14.25 | 0    | 0    | 0  | 14.25 | 14.25 | 0    | 42-95   | 24-90\nCMP %RDX,%RCX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJL 42d8a0 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x270>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nIMUL %RAX,%RCX                                          | 1     | 0     | 1     | 0    | 0    | 0  | 0     | 0     | 0    | 3       | 1\nADD %RCX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA (%RAX,%RDX,1),%R10                                  | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP %R10,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJGE 42d885 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x255> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nLEA -0x1(%RAX),%RDI                                     | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nMOV 0x10(%R12),%R9                                      | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV 0x8(%R12),%R11                                      | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV %RDX,%RBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCMP $0x6,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 42d8b0 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x280> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV %RAX,%R12                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nLEA (,%RDX,8),%RSI                                      | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nXOR %ECX,%ECX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nSHR $0x3,%R12                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA (%R9,%RSI,1),%R8                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nADD %R11,%RSI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSAL $0x6,%R12                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA -0x40(%R12),%RDI                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nSHR $0x6,%RDI                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nINC %RDI                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nAND $0x7,%EDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d770 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x140>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x1,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d755 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x125>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x2,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d743 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x113>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x3,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d731 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x101>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x4,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d71f <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0xef>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x5,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d70d <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0xdd>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x6,%RDI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d6fb <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0xcb>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVCVTQQ2PD (%R8),%ZMM0                                   | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nMOV $0x40,%ECX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD %ZMM0,(%RSI)                                    | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nVCVTQQ2PD (%R8,%RCX,1),%ZMM1                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM1,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM2                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM2,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM3                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM3,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM4                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM4,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM5                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM5,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R8,%RCX,1),%ZMM6                            | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 7       | 0.50\nVMOVUPD %ZMM6,(%RSI,%RCX,1)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD $0x40,%RCX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R12,%RCX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d7fe <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x1ce>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV %RAX,%R8                                            | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nAND $-0x8,%R8                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nADD %R8,%RDX                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R8,%RAX                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nSUB %R8,%RAX                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA -0x1(%RAX),%RSI                                     | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP $0x2,%RSI                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 42d83c <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x20c> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nADD %RBX,%R8                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV %RAX,%RBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nAND $-0x4,%RBX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVCVTQQ2PD (%R9,%R8,8),%YMM15                            | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 7       | 0.50\nVMOVUPD %YMM15,(%R11,%R8,8)                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nADD %RBX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nTEST $0x3,%AL                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVXORPS %XMM0,%XMM0,%XMM0                                | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nLEA 0x1(%RDX),%R8                                       | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA (,%RDX,8),%RAX                                      | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nVCVTSI2SDQ (%R9,%RDX,8),%XMM0,%XMM1                     | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 6       | 0.50\nVMOVSD %XMM1,(%R11,%RDX,8)                              | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nCMP %R8,%R10                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVCVTSI2SDQ 0x8(%R9,%RAX,1),%XMM0,%XMM2                  | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 6       | 0.50\nADD $0x2,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVSD %XMM2,0x8(%R11,%RAX,1)                           | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nCMP %RDX,%R10                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 42d890 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x260> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVCVTSI2SDQ 0x10(%R9,%RAX,1),%XMM0,%XMM3                 | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 6       | 0.50\nVMOVSD %XMM3,0x10(%R11,%RAX,1)                          | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nVZEROUPPER                                              | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nLEA -0x10(%RBP),%RSP                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nPOP %RBX                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                     | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nXCHG %AX,%AX                                            | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVZEROUPPER                                              | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nLEA -0x10(%RBP),%RSP                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nPOP %RBX                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                     | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nNOPL (%RAX)                                             | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nINC %RAX                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nXOR %EDX,%EDX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 42d662 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x32>  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW (%RAX,%RAX,1)                                      | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXOR %R8D,%R8D                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 42d811 <hypre_BoomerAMGCoarsenPMIS._omp_fn.3+0x1e1> | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPL (%RAX,%RAX,1)                                      | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\n",
        },
      },
      header = {
        "Warnings:\nDetected a function call instruction: ignoring called function instructions.\nRerun with --follow-calls=append to include them to analysis  or with --follow-calls=inline to simulate inlining.",
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          title = "Code clean check",
          txt = "Detected a slowdown caused by scalar integer instructions (typically used for address computation).\nBy removing them, you can lower the cost of an iteration from 90.00 to 14.00 cycles (6.43x speedup).",
        },
        {
          workaround = " - Try another compiler or update/tune your current one\n - Make array accesses unit-stride:\n  * If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "38% of SSE/AVX instructions are used in vector version (process two or more data elements in vector registers):\n - 72% of SSE/AVX loads are used in vector version.\n - 72% of SSE/AVX stores are used in vector version.\n - 0% of SSE/AVX addition or subtraction instructions are used in vector version.\n - 0% of SSE/AVX divide and square root instructions are used in vector version.\n - 30% of SSE/AVX instructions that are not load, store, addition, subtraction nor multiply instructions are used in vector version.\nSince your execution units are vector units, only a fully vectorized function can use their full power.\n",
          title = "Vectorization",
          txt = "Your function is poorly vectorized.\nOnly 39% of vector register length is used (average across all SSE/AVX instructions).\nBy fully vectorizing your function, you can lower the cost of an iteration from 90.00 to 45.00 cycles (2.00x speedup).",
        },
        {
          workaround = "Reduce the number of division or square root instructions:\n - If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 90.00 to 48.25 cycles (1.87x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 90.00 to 34.25 cycles (2.63x speedup).",
        },
      },
    },
  common = {
    header = {
      "The function is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_coarsen.c:2139-2142.\n",
      "Warnings:\nIgnoring paths for analysis",
    },
  },
}
