_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "Calling (and then returning from) a function prevents many compiler optimizations (like vectorization), breaks control flow (which reduces pipeline performance) and executes extra instructions to save/restore the registers used inside it, which is very expensive (dozens of cycles). Consider to inline small functions.\n - omp_get_num_threads@plt: 1 occurrences\n - omp_get_thread_num@plt: 1 occurrences\n",
          title = "CALL instructions",
          txt = "Detected function call instructions.\n",
        },
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - IDIV: 1 occurrences\n - INC: 7 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - CQTO: 1 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This function is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your function (in the source code) and the binary function",
          txt = "The binary function does not contain any FP arithmetical operations.\nThe binary function is loading 152 bytes.\nThe binary function is storing 56 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 78\nnb uops            : 150\nloop length        : 272\nused x86 registers : 13\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "MACRO FUSION NOT POSSIBLE\nFIT IN UOP CACHE\nmicro-operation queue: 37.50 cycles\nfront end            : 37.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1    | P2    | P3    | P4    | P5    | P6    | P7\n----------------------------------------------------------------------\nuops   | 27.00 | 27.00 | 11.67 | 11.67 | 12.00 | 27.00 | 27.00 | 11.67\ncycles | 27.00 | 27.00 | 11.67 | 11.67 | 12.00 | 27.00 | 27.00 | 11.67\n\nCycles executing div or sqrt instructions: 24.00-90.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 24.51-90.30\nStall cycles     : 2.21-68.00\nROB full (events): 2.92-68.75\nRS full (events) : 0.03-0.00\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 37.50\nDispatch  : 27.00\nDIV/SQRT  : 24.00-90.00\nOverall L1: 37.50-90.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 0%\nload    : 0%\nstore   : 0%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 0%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 0%\nother   : 0%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 12%\nload    : 12%\nstore   : 12%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 12%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 12%\nother   : 12%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each call to the function takes 90.00 cycles. At this rate:\n - 1% of peak load performance is reached (1.69 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 0% of peak store performance is reached (0.62 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the function is: 42e720\n\nInstruction                                             | Nb FU | P0    | P1    | P2   | P3   | P4 | P5    | P6    | P7   | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nPUSH %RBP                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RSP,%RBP                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %R12                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RDI,%R12                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %RBX                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nCALL 40f0b0 <omp_get_num_threads@plt>                   | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOV %EAX,%EBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCALL 40f1f0 <omp_get_thread_num@plt>                    | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOV 0x10(%R12),%RDX                                     | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOVSXD %EBX,%RSI                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOVSXD %EAX,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R12),%RAX                                         | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV (%RAX,%RDX,8),%RAX                                  | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCQTO                                                    | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nIDIV %RSI                                               | 57    | 14.25 | 14.25 | 0    | 0    | 0  | 14.25 | 14.25 | 0    | 42-95   | 24-90\nCMP %RDX,%RCX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJL 42e868 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x148>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nIMUL %RAX,%RCX                                          | 1     | 0     | 1     | 0    | 0    | 0  | 0     | 0     | 0    | 3       | 1\nADD %RCX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nADD %RDX,%RAX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %RAX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJGE 42e860 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x140> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV 0x8(%R12),%R8                                       | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV 0x18(%R12),%RDI                                     | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLEA (%R8,%RAX,8),%R10                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA (%R8,%RDX,8),%R9                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nMOV %R10,%R11                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nSUB %R9,%R11                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSUB $0x8,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSHR $0x3,%R11                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nINC %R11                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nAND $0x7,%R11D                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e810 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xf0>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x1,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7ff <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xdf>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x2,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7f3 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xd3>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x3,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7e7 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xc7>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x4,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7db <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xbb>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x5,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7cf <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xaf>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x6,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7c3 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xa3>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV (%R9),%R12                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%R12,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RBX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RBX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RCX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RCX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RSI                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RSI,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RAX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RAX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RDX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RDX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%R8                                           | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%R8,8)                                  | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R9,%R10                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e860 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x140>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nPOP %RBX                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                     | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nNOPL (%RAX)                                             | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nINC %RAX                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nXOR %EDX,%EDX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 42e757 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x37>  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW %CS:(%RAX,%RAX,1)                                  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nNOPL (%RAX)                                             | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\n",
        },
      },
      header = {
        "Warnings:\nDetected a function call instruction: ignoring called function instructions.\nRerun with --follow-calls=append to include them to analysis  or with --follow-calls=inline to simulate inlining.",
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try another compiler or update/tune your current one\n - Make array accesses unit-stride:\n  * If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "All SSE/AVX instructions are used in scalar version (process only one data element in vector registers).\nSince your execution units are vector units, only a vectorized function can use their full power.\n",
          title = "Vectorization",
          txt = "Your function is not vectorized.\nOnly 12% of vector register length is used (average across all SSE/AVX instructions).\nBy vectorizing your function, you can lower the cost of an iteration from 90.00 to 45.00 cycles (2.00x speedup).",
        },
        {
          workaround = "Reduce the number of division or square root instructions:\n - If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 90.00 to 37.50 cycles (2.40x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 90.00 to 23.50 cycles (3.83x speedup).",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "Calling (and then returning from) a function prevents many compiler optimizations (like vectorization), breaks control flow (which reduces pipeline performance) and executes extra instructions to save/restore the registers used inside it, which is very expensive (dozens of cycles). Consider to inline small functions.\n - omp_get_num_threads@plt: 1 occurrences\n - omp_get_thread_num@plt: 1 occurrences\n",
          title = "CALL instructions",
          txt = "Detected function call instructions.\n",
        },
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - IDIV: 1 occurrences\n - INC: 7 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - CQTO: 1 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This function is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your function (in the source code) and the binary function",
          txt = "The binary function does not contain any FP arithmetical operations.\nThe binary function is loading 152 bytes.\nThe binary function is storing 56 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 78\nnb uops            : 150\nloop length        : 272\nused x86 registers : 13\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "MACRO FUSION NOT POSSIBLE\nFIT IN UOP CACHE\nmicro-operation queue: 37.50 cycles\nfront end            : 37.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1    | P2    | P3    | P4    | P5    | P6    | P7\n----------------------------------------------------------------------\nuops   | 27.00 | 27.00 | 11.67 | 11.67 | 12.00 | 27.00 | 27.00 | 11.67\ncycles | 27.00 | 27.00 | 11.67 | 11.67 | 12.00 | 27.00 | 27.00 | 11.67\n\nCycles executing div or sqrt instructions: 24.00-90.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 24.51-90.30\nStall cycles     : 2.21-68.00\nROB full (events): 2.92-68.75\nRS full (events) : 0.03-0.00\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 37.50\nDispatch  : 27.00\nDIV/SQRT  : 24.00-90.00\nOverall L1: 37.50-90.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 0%\nload    : 0%\nstore   : 0%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 0%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 0%\nother   : 0%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 12%\nload    : 12%\nstore   : 12%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 12%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 12%\nother   : 12%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each call to the function takes 90.00 cycles. At this rate:\n - 1% of peak load performance is reached (1.69 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 0% of peak store performance is reached (0.62 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the function is: 42e720\n\nInstruction                                             | Nb FU | P0    | P1    | P2   | P3   | P4 | P5    | P6    | P7   | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nPUSH %RBP                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RSP,%RBP                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %R12                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RDI,%R12                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %RBX                                               | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nCALL 40f0b0 <omp_get_num_threads@plt>                   | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOV %EAX,%EBX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCALL 40f1f0 <omp_get_thread_num@plt>                    | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOV 0x10(%R12),%RDX                                     | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOVSXD %EBX,%RSI                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOVSXD %EAX,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R12),%RAX                                         | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV (%RAX,%RDX,8),%RAX                                  | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCQTO                                                    | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nIDIV %RSI                                               | 57    | 14.25 | 14.25 | 0    | 0    | 0  | 14.25 | 14.25 | 0    | 42-95   | 24-90\nCMP %RDX,%RCX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJL 42e868 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x148>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nIMUL %RAX,%RCX                                          | 1     | 0     | 1     | 0    | 0    | 0  | 0     | 0     | 0    | 3       | 1\nADD %RCX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nADD %RDX,%RAX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %RAX,%RDX                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJGE 42e860 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x140> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV 0x8(%R12),%R8                                       | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV 0x18(%R12),%RDI                                     | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLEA (%R8,%RAX,8),%R10                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA (%R8,%RDX,8),%R9                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nMOV %R10,%R11                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nSUB %R9,%R11                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSUB $0x8,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSHR $0x3,%R11                                           | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nINC %R11                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nAND $0x7,%R11D                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e810 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xf0>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x1,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7ff <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xdf>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x2,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7f3 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xd3>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x3,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7e7 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xc7>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x4,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7db <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xbb>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x5,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7cf <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xaf>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x6,%R11                                           | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e7c3 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0xa3>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV (%R9),%R12                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%R12,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RBX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RBX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RCX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RCX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RSI                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RSI,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RAX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RAX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%RDX                                          | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%RDX,8)                                 | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV (%R9),%R8                                           | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLOCK INCQ (%RDI,%R8,8)                                  | 3     | 0.25  | 0.25  | 0.83 | 0.83 | 1  | 0.25  | 0.25  | 0.33 | 5       | 1\nADD $0x8,%R9                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R9,%R10                                            | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 42e860 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x140>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nPOP %RBX                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                                | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                     | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nNOPL (%RAX)                                             | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nINC %RAX                                                | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nXOR %EDX,%EDX                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 42e757 <hypre_BoomerAMGCoarsenPMIS._omp_fn.2+0x37>  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW %CS:(%RAX,%RAX,1)                                  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nNOPL (%RAX)                                             | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\n",
        },
      },
      header = {
        "Warnings:\nDetected a function call instruction: ignoring called function instructions.\nRerun with --follow-calls=append to include them to analysis  or with --follow-calls=inline to simulate inlining.",
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try another compiler or update/tune your current one\n - Make array accesses unit-stride:\n  * If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "All SSE/AVX instructions are used in scalar version (process only one data element in vector registers).\nSince your execution units are vector units, only a vectorized function can use their full power.\n",
          title = "Vectorization",
          txt = "Your function is not vectorized.\nOnly 12% of vector register length is used (average across all SSE/AVX instructions).\nBy vectorizing your function, you can lower the cost of an iteration from 90.00 to 45.00 cycles (2.00x speedup).",
        },
        {
          workaround = "Reduce the number of division or square root instructions:\n - If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 90.00 to 37.50 cycles (2.40x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 90.00 to 23.50 cycles (3.83x speedup).",
        },
      },
    },
  common = {
    header = {
      "The function is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_coarsen.c:2132-2136.\n",
      "Warnings:\nIgnoring paths for analysis",
    },
  },
}
