_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "Calling (and then returning from) a function prevents many compiler optimizations (like vectorization), breaks control flow (which reduces pipeline performance) and executes extra instructions to save/restore the registers used inside it, which is very expensive (dozens of cycles). Consider to inline small functions.\n - omp_get_num_threads@plt: 1 occurrences\n - omp_get_thread_num@plt: 1 occurrences\n",
          title = "CALL instructions",
          txt = "Detected function call instructions.\n",
        },
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - IDIV: 1 occurrences\n - VZEROUPPER: 2 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VEXTRACTF64X2: 2 occurrences\n - VEXTRACTF64X4: 1 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 3 suboptimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - CQTO: 1 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "3 SSE or AVX instructions are processing arithmetic or math operations on double precision FP elements in scalar mode (one at a time).\n4 SSE or AVX instructions are processing arithmetic or math operations on double precision FP elements in vector mode (two at a time).\n3 AVX instructions are processing arithmetic or math operations on double precision FP elements in vector mode (four at a time).\n7 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your function (in the source code) and the binary function",
          txt = "The binary function is composed of 142 FP arithmetical operations:\n - 79: addition or subtraction (63 inside FMA instructions)\n - 63: multiply (all inside FMA instructions)\nThe binary function is loading 1040 bytes (130 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.14 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 141\nnb uops            : 205\nloop length        : 587\nused x86 registers : 15\nused mmx registers : 0\nused xmm registers : 13\nused ymm registers : 4\nused zmm registers : 8\nnb stack references: 1\n",
        },
        {
          title = "Front-end",
          txt = "MACRO FUSION NOT POSSIBLE\nFIT IN UOP CACHE\nmicro-operation queue: 53.75 cycles\nfront end            : 53.75 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1    | P2    | P3    | P4   | P5    | P6    | P7\n--------------------------------------------------------------------\nuops   | 38.25 | 38.25 | 15.50 | 15.50 | 7.00 | 38.25 | 38.25 | 8.00\ncycles | 38.25 | 38.25 | 15.50 | 15.50 | 7.00 | 38.25 | 38.25 | 8.00\n\nCycles executing div or sqrt instructions: 24.00-90.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 40.37-90.47\nStall cycles     : 1.57-51.67\nROB full (events): 1.84-52.91\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 53.75\nDispatch  : 38.25\nDIV/SQRT  : 24.00-90.00\nOverall L1: 53.75-90.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 8%\nload   : NA (no load vectorizable/vectorized instructions)\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 0%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 8%\nFP\nall     : 77%\nload    : 72%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : 72%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 75%\nINT+FP\nall     : 49%\nload    : 72%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 85%\nfma     : 72%\ndiv/sqrt: 0%\nother   : 25%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 13%\nload   : NA (no load vectorizable/vectorized instructions)\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 12%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 13%\nFP\nall     : 54%\nload    : 71%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 33%\nfma     : 71%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 25%\nINT+FP\nall     : 37%\nload    : 71%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 30%\nfma     : 71%\ndiv/sqrt: 12%\nother   : 16%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each call to the function takes 90.00 cycles. At this rate:\n - 9% of peak load performance is reached (11.56 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the function is: 5a8860\n\nInstruction                                           | Nb FU | P0    | P1    | P2   | P3   | P4 | P5    | P6    | P7   | Latency | Recip. throughput\n-----------------------------------------------------------------------------------------------------------------------------------------------------\nPUSH %RBP                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RSP,%RBP                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %R14                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nPUSH %R13                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nPUSH %R12                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nPUSH %RBX                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RDI,%RBX                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nAND $-0x40,%RSP                                       | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV 0x8(%RDI),%R13                                    | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV (%RDI),%R14                                       | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCALL 40f0b0 <omp_get_num_threads@plt>                 | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOVSXD %EAX,%R12                                      | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCALL 40f1f0 <omp_get_thread_num@plt>                  | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOVSXD %EAX,%RCX                                      | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV 0x10(%RBX),%RAX                                   | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCQTO                                                  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nIDIV %R12                                             | 57    | 14.25 | 14.25 | 0    | 0    | 0  | 14.25 | 14.25 | 0    | 42-95   | 24-90\nCMP %RDX,%RCX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJL 5a8b28 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c8>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nIMUL %RAX,%RCX                                        | 1     | 0     | 1     | 0    | 0    | 0  | 0     | 0     | 0    | 3       | 1\nVXORPD %XMM15,%XMM15,%XMM15                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nADD %RCX,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA (%RAX,%RDX,1),%R9                                 | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP %R9,%RDX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJGE 5a8af2 <hypre_SeqVectorInnerProd._omp_fn.0+0x292> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nLEA -0x1(%RAX),%RSI                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nMOV %RDX,%R10                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCMP $0x6,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 5a8b38 <hypre_SeqVectorInnerProd._omp_fn.0+0x2d8> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV %RAX,%R11                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nLEA (,%RDX,8),%R12                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nVXORPD %XMM8,%XMM8,%XMM8                              | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXOR %ECX,%ECX                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nSHR $0x3,%R11                                         | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA (%R13,%R12,1),%R8                                 | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nADD %R14,%R12                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSAL $0x6,%R11                                         | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA -0x40(%R11),%RSI                                  | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nSHR $0x6,%RSI                                         | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nINC %RSI                                              | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nAND $0x7,%ESI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a89aa <hypre_SeqVectorInnerProd._omp_fn.0+0x14a>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x1,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a898f <hypre_SeqVectorInnerProd._omp_fn.0+0x12f>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x2,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a897d <hypre_SeqVectorInnerProd._omp_fn.0+0x11d>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x3,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a896b <hypre_SeqVectorInnerProd._omp_fn.0+0x10b>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x4,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8959 <hypre_SeqVectorInnerProd._omp_fn.0+0xf9>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x5,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8947 <hypre_SeqVectorInnerProd._omp_fn.0+0xe7>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x6,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8935 <hypre_SeqVectorInnerProd._omp_fn.0+0xd5>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVUPD (%R8),%ZMM2                                   | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nMOV $0x40,%ECX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVFMADD231PD (%R12),%ZMM2,%ZMM8                        | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nVMOVUPD (%R8,%RCX,1),%ZMM1                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM1,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM6                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM6,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM7                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM7,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM5                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM5,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM3                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM3,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM4                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM4,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R11,%RCX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8a38 <hypre_SeqVectorInnerProd._omp_fn.0+0x1d8>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVEXTRACTF64X4 $0x1,%ZMM8,%YMM6                        | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 3       | 1\nMOV %RAX,%RDI                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVADDPD %YMM8,%YMM6,%YMM1                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nAND $-0x8,%RDI                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVADDPD %YMM6,%YMM8,%YMM0                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nADD %RDI,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVEXTRACTF64X2 $0x1,%YMM1,%XMM2                        | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 3       | 1\nVADDPD %XMM1,%XMM2,%XMM7                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nVUNPCKHPD %XMM7,%XMM7,%XMM5                           | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 1       | 1\nVADDPD %XMM7,%XMM5,%XMM15                             | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nCMP %RAX,%RDI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nSUB %RDI,%RAX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA -0x1(%RAX),%R12                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP $0x2,%R12                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 5a8aac <hypre_SeqVectorInnerProd._omp_fn.0+0x24c> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nADD %R10,%RDI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV %RAX,%R10                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVMOVUPD (%R13,%RDI,8),%YMM8                           | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nAND $-0x4,%R10                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nADD %R10,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVFMADD231PD (%R14,%RDI,8),%YMM8,%YMM0                 | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nVEXTRACTF64X2 $0x1,%YMM0,%XMM3                        | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 3       | 1\nVADDPD %XMM0,%XMM3,%XMM4                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nVUNPCKHPD %XMM4,%XMM4,%XMM9                           | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 1       | 1\nVADDPD %XMM4,%XMM9,%XMM15                             | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nTEST $0x3,%AL                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVSD (%R13,%RDX,8),%XMM10                           | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLEA 0x1(%RDX),%RDI                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA (,%RDX,8),%RAX                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nVFMADD231SD (%R14,%RDX,8),%XMM10,%XMM15               | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nCMP %RDI,%R9                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVSD 0x8(%R14,%RAX,1),%XMM11                        | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nADD $0x2,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVFMADD231SD 0x8(%R13,%RAX,1),%XMM11,%XMM15            | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nCMP %RDX,%R9                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVSD 0x10(%R13,%RAX,1),%XMM12                       | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nVFMADD231SD 0x10(%R14,%RAX,1),%XMM12,%XMM15           | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nVZEROUPPER                                            | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nMOV 0x18(%RBX),%RAX                                   | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLEA 0x18(%RBX),%R13                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA -0x20(%RBP),%RSP                                  | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nPOP %RBX                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R13                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R14                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                   | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nXCHG %AX,%AX                                          | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVZEROUPPER                                            | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nJMP 5a8af2 <hypre_SeqVectorInnerProd._omp_fn.0+0x292> | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPL (%RAX)                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nINC %RAX                                              | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nXOR %EDX,%EDX                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 5a889b <hypre_SeqVectorInnerProd._omp_fn.0+0x3b>  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW (%RAX,%RAX,1)                                    | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVXORPD %XMM0,%XMM0,%XMM0                              | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXOR %EDI,%EDI                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 5a8a6e <hypre_SeqVectorInnerProd._omp_fn.0+0x20e> | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW %CS:(%RAX,%RAX,1)                                | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXCHG %AX,%AX                                          | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\n",
        },
      },
      header = {
        "Warnings:\nDetected a function call instruction: ignoring called function instructions.\nRerun with --follow-calls=append to include them to analysis  or with --follow-calls=inline to simulate inlining.",
        "4% of peak computational performance is used (1.58 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          title = "Code clean check",
          txt = "Detected a slowdown caused by scalar integer instructions (typically used for address computation).\nBy removing them, you can lower the cost of an iteration from 90.00 to 19.25 cycles (4.68x speedup).",
        },
        {
          workaround = " - Try another compiler or update/tune your current one\n - Make array accesses unit-stride:\n  * If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "49% of SSE/AVX instructions are used in vector version (process two or more data elements in vector registers):\n - 72% of SSE/AVX loads are used in vector version.\n - 85% of SSE/AVX addition or subtraction instructions are used in vector version.\n - 72% of SSE/AVX fused multiply-add instructions are used in vector version.\n - 0% of SSE/AVX divide and square root instructions are used in vector version.\n - 25% of SSE/AVX instructions that are not load, store, addition, subtraction nor multiply instructions are used in vector version.\nSince your execution units are vector units, only a fully vectorized function can use their full power.\n",
          title = "Vectorization",
          txt = "Your function is poorly vectorized.\nOnly 37% of vector register length is used (average across all SSE/AVX instructions).\nBy fully vectorizing your function, you can lower the cost of an iteration from 90.00 to 45.00 cycles (2.00x speedup).",
        },
        {
          workaround = " - Reduce the number of division or square root instructions:\n  * If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n - Check whether you really need double precision. If not, switch to single precision to speedup execution\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 90.00 to 53.75 cycles (1.67x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 90.00 to 39.75 cycles (2.26x speedup).",
        },
        {
          title = "FMA",
          txt = "Detected 63 FMA (fused multiply-add) operations.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "Calling (and then returning from) a function prevents many compiler optimizations (like vectorization), breaks control flow (which reduces pipeline performance) and executes extra instructions to save/restore the registers used inside it, which is very expensive (dozens of cycles). Consider to inline small functions.\n - omp_get_num_threads@plt: 1 occurrences\n - omp_get_thread_num@plt: 1 occurrences\n",
          title = "CALL instructions",
          txt = "Detected function call instructions.\n",
        },
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - IDIV: 1 occurrences\n - VZEROUPPER: 2 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VEXTRACTF64X2: 2 occurrences\n - VEXTRACTF64X4: 1 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 3 suboptimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Avoid mixing data with different types. In particular, check if the type of constants is the same as array elements.",
          details = " - CQTO: 1 occurrences\n",
          title = "Conversion instructions",
          txt = "Detected expensive conversion instructions.",
        },
        {
          title = "Type of elements and instruction set",
          txt = "3 SSE or AVX instructions are processing arithmetic or math operations on double precision FP elements in scalar mode (one at a time).\n4 SSE or AVX instructions are processing arithmetic or math operations on double precision FP elements in vector mode (two at a time).\n3 AVX instructions are processing arithmetic or math operations on double precision FP elements in vector mode (four at a time).\n7 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your function (in the source code) and the binary function",
          txt = "The binary function is composed of 142 FP arithmetical operations:\n - 79: addition or subtraction (63 inside FMA instructions)\n - 63: multiply (all inside FMA instructions)\nThe binary function is loading 1040 bytes (130 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.14 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 141\nnb uops            : 205\nloop length        : 587\nused x86 registers : 15\nused mmx registers : 0\nused xmm registers : 13\nused ymm registers : 4\nused zmm registers : 8\nnb stack references: 1\n",
        },
        {
          title = "Front-end",
          txt = "MACRO FUSION NOT POSSIBLE\nFIT IN UOP CACHE\nmicro-operation queue: 53.75 cycles\nfront end            : 53.75 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1    | P2    | P3    | P4   | P5    | P6    | P7\n--------------------------------------------------------------------\nuops   | 38.25 | 38.25 | 15.50 | 15.50 | 7.00 | 38.25 | 38.25 | 8.00\ncycles | 38.25 | 38.25 | 15.50 | 15.50 | 7.00 | 38.25 | 38.25 | 8.00\n\nCycles executing div or sqrt instructions: 24.00-90.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 40.37-90.47\nStall cycles     : 1.57-51.67\nROB full (events): 1.84-52.91\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 53.75\nDispatch  : 38.25\nDIV/SQRT  : 24.00-90.00\nOverall L1: 53.75-90.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 8%\nload   : NA (no load vectorizable/vectorized instructions)\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 0%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 8%\nFP\nall     : 77%\nload    : 72%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : 72%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 75%\nINT+FP\nall     : 49%\nload    : 72%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 85%\nfma     : 72%\ndiv/sqrt: 0%\nother   : 25%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 13%\nload   : NA (no load vectorizable/vectorized instructions)\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: 12%\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 13%\nFP\nall     : 54%\nload    : 71%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 33%\nfma     : 71%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 25%\nINT+FP\nall     : 37%\nload    : 71%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 30%\nfma     : 71%\ndiv/sqrt: 12%\nother   : 16%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each call to the function takes 90.00 cycles. At this rate:\n - 9% of peak load performance is reached (11.56 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the function is: 5a8860\n\nInstruction                                           | Nb FU | P0    | P1    | P2   | P3   | P4 | P5    | P6    | P7   | Latency | Recip. throughput\n-----------------------------------------------------------------------------------------------------------------------------------------------------\nPUSH %RBP                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RSP,%RBP                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nPUSH %R14                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nPUSH %R13                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nPUSH %R12                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nPUSH %RBX                                             | 1     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 0     | 0.33 | 3       | 1\nMOV %RDI,%RBX                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nAND $-0x40,%RSP                                       | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV 0x8(%RDI),%R13                                    | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nMOV (%RDI),%R14                                       | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCALL 40f0b0 <omp_get_num_threads@plt>                 | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOVSXD %EAX,%R12                                      | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCALL 40f1f0 <omp_get_thread_num@plt>                  | 2     | 0     | 0     | 0.33 | 0.33 | 1  | 0     | 1     | 0.33 | 0       | 1\nMOVSXD %EAX,%RCX                                      | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV 0x10(%RBX),%RAX                                   | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nCQTO                                                  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nIDIV %R12                                             | 57    | 14.25 | 14.25 | 0    | 0    | 0  | 14.25 | 14.25 | 0    | 42-95   | 24-90\nCMP %RDX,%RCX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJL 5a8b28 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c8>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nIMUL %RAX,%RCX                                        | 1     | 0     | 1     | 0    | 0    | 0  | 0     | 0     | 0    | 3       | 1\nVXORPD %XMM15,%XMM15,%XMM15                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nADD %RCX,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA (%RAX,%RDX,1),%R9                                 | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP %R9,%RDX                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJGE 5a8af2 <hypre_SeqVectorInnerProd._omp_fn.0+0x292> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nLEA -0x1(%RAX),%RSI                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nMOV %RDX,%R10                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nCMP $0x6,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 5a8b38 <hypre_SeqVectorInnerProd._omp_fn.0+0x2d8> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nMOV %RAX,%R11                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nLEA (,%RDX,8),%R12                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nVXORPD %XMM8,%XMM8,%XMM8                              | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXOR %ECX,%ECX                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nSHR $0x3,%R11                                         | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA (%R13,%R12,1),%R8                                 | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nADD %R14,%R12                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nSAL $0x6,%R11                                         | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nLEA -0x40(%R11),%RSI                                  | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nSHR $0x6,%RSI                                         | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 1       | 0.50\nINC %RSI                                              | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nAND $0x7,%ESI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a89aa <hypre_SeqVectorInnerProd._omp_fn.0+0x14a>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x1,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a898f <hypre_SeqVectorInnerProd._omp_fn.0+0x12f>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x2,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a897d <hypre_SeqVectorInnerProd._omp_fn.0+0x11d>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x3,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a896b <hypre_SeqVectorInnerProd._omp_fn.0+0x10b>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x4,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8959 <hypre_SeqVectorInnerProd._omp_fn.0+0xf9>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x5,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8947 <hypre_SeqVectorInnerProd._omp_fn.0+0xe7>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nCMP $0x6,%RSI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8935 <hypre_SeqVectorInnerProd._omp_fn.0+0xd5>   | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVUPD (%R8),%ZMM2                                   | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nMOV $0x40,%ECX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVFMADD231PD (%R12),%ZMM2,%ZMM8                        | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nVMOVUPD (%R8,%RCX,1),%ZMM1                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM1,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM6                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM6,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM7                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM7,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM5                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM5,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM3                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM3,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVMOVUPD (%R8,%RCX,1),%ZMM4                            | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM4,%ZMM8                 | 1     | 0.50  | 0     | 0.50 | 0.50 | 0  | 0.50  | 0     | 0    | 4       | 0.50\nADD $0x40,%RCX                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nCMP %R11,%RCX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8a38 <hypre_SeqVectorInnerProd._omp_fn.0+0x1d8>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVEXTRACTF64X4 $0x1,%ZMM8,%YMM6                        | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 3       | 1\nMOV %RAX,%RDI                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVADDPD %YMM8,%YMM6,%YMM1                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nAND $-0x8,%RDI                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVADDPD %YMM6,%YMM8,%YMM0                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nADD %RDI,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVEXTRACTF64X2 $0x1,%YMM1,%XMM2                        | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 3       | 1\nVADDPD %XMM1,%XMM2,%XMM7                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nVUNPCKHPD %XMM7,%XMM7,%XMM5                           | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 1       | 1\nVADDPD %XMM7,%XMM5,%XMM15                             | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nCMP %RAX,%RDI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nSUB %RDI,%RAX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nLEA -0x1(%RAX),%R12                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nCMP $0x2,%R12                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJBE 5a8aac <hypre_SeqVectorInnerProd._omp_fn.0+0x24c> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nADD %R10,%RDI                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nMOV %RAX,%R10                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVMOVUPD (%R13,%RDI,8),%YMM8                           | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 5-6     | 0.50\nAND $-0x4,%R10                                        | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nADD %R10,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVFMADD231PD (%R14,%RDI,8),%YMM8,%YMM0                 | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nVEXTRACTF64X2 $0x1,%YMM0,%XMM3                        | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 3       | 1\nVADDPD %XMM0,%XMM3,%XMM4                              | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nVUNPCKHPD %XMM4,%XMM4,%XMM9                           | 1     | 0     | 0     | 0    | 0    | 0  | 1     | 0     | 0    | 1       | 1\nVADDPD %XMM4,%XMM9,%XMM15                             | 1     | 0.50  | 0.50  | 0    | 0    | 0  | 0     | 0     | 0    | 4       | 0.50\nTEST $0x3,%AL                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0>  | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVSD (%R13,%RDX,8),%XMM10                           | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLEA 0x1(%RDX),%RDI                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA (,%RDX,8),%RAX                                    | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nVFMADD231SD (%R14,%RDX,8),%XMM10,%XMM15               | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nCMP %RDI,%R9                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVSD 0x8(%R14,%RAX,1),%XMM11                        | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nADD $0x2,%RDX                                         | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nVFMADD231SD 0x8(%R13,%RAX,1),%XMM11,%XMM15            | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nCMP %RDX,%R9                                          | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nJLE 5a8b20 <hypre_SeqVectorInnerProd._omp_fn.0+0x2c0> | 1     | 0.50  | 0     | 0    | 0    | 0  | 0     | 0.50  | 0    | 0       | 0.50-1\nVMOVSD 0x10(%R13,%RAX,1),%XMM12                       | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nVFMADD231SD 0x10(%R14,%RAX,1),%XMM12,%XMM15           | 1     | 0.50  | 0.50  | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4       | 0.50\nVZEROUPPER                                            | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nMOV 0x18(%RBX),%RAX                                   | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 4-5     | 0.50\nLEA 0x18(%RBX),%R13                                   | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nLEA -0x20(%RBP),%RSP                                  | 1     | 0     | 0.50  | 0    | 0    | 0  | 0.50  | 0     | 0    | 1       | 0.50\nPOP %RBX                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R12                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R13                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %R14                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nPOP %RBP                                              | 1     | 0     | 0     | 0.50 | 0.50 | 0  | 0     | 0     | 0    | 2       | 0.50\nRET                                                   | 1     | 0     | 0     | 0.33 | 0.33 | 0  | 0     | 1     | 0.33 | 0       | 1\nXCHG %AX,%AX                                          | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVZEROUPPER                                            | 4     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 1\nJMP 5a8af2 <hypre_SeqVectorInnerProd._omp_fn.0+0x292> | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPL (%RAX)                                           | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nINC %RAX                                              | 1     | 0.25  | 0.25  | 0    | 0    | 0  | 0.25  | 0.25  | 0    | 1       | 0.25\nXOR %EDX,%EDX                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 5a889b <hypre_SeqVectorInnerProd._omp_fn.0+0x3b>  | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW (%RAX,%RAX,1)                                    | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nVXORPD %XMM0,%XMM0,%XMM0                              | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXOR %EDI,%EDI                                         | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nJMP 5a8a6e <hypre_SeqVectorInnerProd._omp_fn.0+0x20e> | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 1     | 0    | 0       | 1-2\nNOPW %CS:(%RAX,%RAX,1)                                | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\nXCHG %AX,%AX                                          | 1     | 0     | 0     | 0    | 0    | 0  | 0     | 0     | 0    | 0       | 0.25\n",
        },
      },
      header = {
        "Warnings:\nDetected a function call instruction: ignoring called function instructions.\nRerun with --follow-calls=append to include them to analysis  or with --follow-calls=inline to simulate inlining.",
        "4% of peak computational performance is used (1.58 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          title = "Code clean check",
          txt = "Detected a slowdown caused by scalar integer instructions (typically used for address computation).\nBy removing them, you can lower the cost of an iteration from 90.00 to 19.25 cycles (4.68x speedup).",
        },
        {
          workaround = " - Try another compiler or update/tune your current one\n - Make array accesses unit-stride:\n  * If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "49% of SSE/AVX instructions are used in vector version (process two or more data elements in vector registers):\n - 72% of SSE/AVX loads are used in vector version.\n - 85% of SSE/AVX addition or subtraction instructions are used in vector version.\n - 72% of SSE/AVX fused multiply-add instructions are used in vector version.\n - 0% of SSE/AVX divide and square root instructions are used in vector version.\n - 25% of SSE/AVX instructions that are not load, store, addition, subtraction nor multiply instructions are used in vector version.\nSince your execution units are vector units, only a fully vectorized function can use their full power.\n",
          title = "Vectorization",
          txt = "Your function is poorly vectorized.\nOnly 37% of vector register length is used (average across all SSE/AVX instructions).\nBy fully vectorizing your function, you can lower the cost of an iteration from 90.00 to 45.00 cycles (2.00x speedup).",
        },
        {
          workaround = " - Reduce the number of division or square root instructions:\n  * If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n - Check whether you really need double precision. If not, switch to single precision to speedup execution\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 90.00 to 53.75 cycles (1.67x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 90.00 to 39.75 cycles (2.26x speedup).",
        },
        {
          title = "FMA",
          txt = "Detected 63 FMA (fused multiply-add) operations.",
        },
      },
    },
  common = {
    header = {
      "The function is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/vector.c:483-486.\n",
      "Warnings:\nIgnoring paths for analysis",
    },
  },
}
