_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VDIVPD: 8 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 16 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 16 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "16 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n - 64: addition or subtraction\n - 64: divide\nThe binary loop is loading 1536 bytes (192 double precision FP elements).\nThe binary loop is storing 512 bytes (64 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.06 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 35\nnb uops            : 58\nloop length        : 269\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 16\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 16.50 cycles\nfront end            : 16.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1   | P2    | P3    | P4   | P5    | P6   | P7\n------------------------------------------------------------------\nuops   | 16.00 | 1.00 | 12.00 | 12.00 | 8.00 | 16.00 | 1.00 | 8.00\ncycles | 16.00 | 4.00 | 12.00 | 12.00 | 8.00 | 16.00 | 1.00 | 8.00\n\nCycles executing div or sqrt instructions: 128.00\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 129.02\nStall cycles    : 112.11\nRS full (events): 128.62\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 16.50\nDispatch  : 16.00\nDIV/SQRT  : 128.00\nData deps.: 1.00\nOverall L1: 128.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 100%\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 100%\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 128.00 cycles. At this rate:\n - 9% of peak load performance is reached (12.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 6% of peak store performance is reached (4.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 534a13\n\nInstruction                            | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n----------------------------------------------------------------------------------------------------------------------------------\nVMOVUPD (%R14,%RAX,1),%ZMM7            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD (%R15,%RAX,1),%ZMM7,%ZMM3       | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD (%R13,%RAX,1),%ZMM3,%ZMM4       | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM4,(%R13,%RAX,1)            | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x40(%R14,%RAX,1),%ZMM11       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x40(%R15,%RAX,1),%ZMM11,%ZMM9  | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x40(%R13,%RAX,1),%ZMM9,%ZMM14  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM14,0x40(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x80(%R14,%RAX,1),%ZMM5        | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x80(%R15,%RAX,1),%ZMM5,%ZMM0   | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x80(%R13,%RAX,1),%ZMM0,%ZMM12  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM12,0x80(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0xc0(%R14,%RAX,1),%ZMM10       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0xc0(%R15,%RAX,1),%ZMM10,%ZMM13 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0xc0(%R13,%RAX,1),%ZMM13,%ZMM2  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM2,0xc0(%R13,%RAX,1)        | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x100(%R14,%RAX,1),%ZMM15      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x100(%R15,%RAX,1),%ZMM15,%ZMM1 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x100(%R13,%RAX,1),%ZMM1,%ZMM6  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM6,0x100(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x140(%R14,%RAX,1),%ZMM8       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x140(%R15,%RAX,1),%ZMM8,%ZMM7  | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x140(%R13,%RAX,1),%ZMM7,%ZMM3  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM3,0x140(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x180(%R14,%RAX,1),%ZMM4       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x180(%R15,%RAX,1),%ZMM4,%ZMM11 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x180(%R13,%RAX,1),%ZMM11,%ZMM9 | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM9,0x180(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x1c0(%R14,%RAX,1),%ZMM14      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x1c0(%R15,%RAX,1),%ZMM14,%ZMM5 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x1c0(%R13,%RAX,1),%ZMM5,%ZMM0  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM0,0x1c0(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nADD $0x200,%RAX                        | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nCMP %RAX,%R8                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 534a13 <hypre_ParCSRRelax+0x4c03>  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "3% of peak computational performance is used (1.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = " - Reduce the number of division or square root instructions:\n  * If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n - Check whether you really need double precision. If not, switch to single precision to speedup execution\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 128.00 to 16.50 cycles (7.76x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 128.00 to 10.50 cycles (12.19x speedup).",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VDIVPD: 8 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 16 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 16 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "16 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n - 64: addition or subtraction\n - 64: divide\nThe binary loop is loading 1536 bytes (192 double precision FP elements).\nThe binary loop is storing 512 bytes (64 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.06 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 35\nnb uops            : 58\nloop length        : 269\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 16\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 16.50 cycles\nfront end            : 16.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1   | P2    | P3    | P4   | P5    | P6   | P7\n------------------------------------------------------------------\nuops   | 16.00 | 1.00 | 12.00 | 12.00 | 8.00 | 16.00 | 1.00 | 8.00\ncycles | 16.00 | 4.00 | 12.00 | 12.00 | 8.00 | 16.00 | 1.00 | 8.00\n\nCycles executing div or sqrt instructions: 128.00\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 129.02\nStall cycles    : 112.11\nRS full (events): 128.62\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 16.50\nDispatch  : 16.00\nDIV/SQRT  : 128.00\nData deps.: 1.00\nOverall L1: 128.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 100%\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: 100%\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 128.00 cycles. At this rate:\n - 9% of peak load performance is reached (12.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 6% of peak store performance is reached (4.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 534a13\n\nInstruction                            | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n----------------------------------------------------------------------------------------------------------------------------------\nVMOVUPD (%R14,%RAX,1),%ZMM7            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD (%R15,%RAX,1),%ZMM7,%ZMM3       | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD (%R13,%RAX,1),%ZMM3,%ZMM4       | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM4,(%R13,%RAX,1)            | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x40(%R14,%RAX,1),%ZMM11       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x40(%R15,%RAX,1),%ZMM11,%ZMM9  | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x40(%R13,%RAX,1),%ZMM9,%ZMM14  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM14,0x40(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x80(%R14,%RAX,1),%ZMM5        | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x80(%R15,%RAX,1),%ZMM5,%ZMM0   | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x80(%R13,%RAX,1),%ZMM0,%ZMM12  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM12,0x80(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0xc0(%R14,%RAX,1),%ZMM10       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0xc0(%R15,%RAX,1),%ZMM10,%ZMM13 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0xc0(%R13,%RAX,1),%ZMM13,%ZMM2  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM2,0xc0(%R13,%RAX,1)        | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x100(%R14,%RAX,1),%ZMM15      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x100(%R15,%RAX,1),%ZMM15,%ZMM1 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x100(%R13,%RAX,1),%ZMM1,%ZMM6  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM6,0x100(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x140(%R14,%RAX,1),%ZMM8       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x140(%R15,%RAX,1),%ZMM8,%ZMM7  | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x140(%R13,%RAX,1),%ZMM7,%ZMM3  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM3,0x140(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x180(%R14,%RAX,1),%ZMM4       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x180(%R15,%RAX,1),%ZMM4,%ZMM11 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x180(%R13,%RAX,1),%ZMM11,%ZMM9 | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM9,0x180(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVMOVUPD 0x1c0(%R14,%RAX,1),%ZMM14      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVDIVPD 0x1c0(%R15,%RAX,1),%ZMM14,%ZMM5 | 4     | 2.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 24      | 16\nVADDPD 0x1c0(%R13,%RAX,1),%ZMM5,%ZMM0  | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVUPD %ZMM0,0x1c0(%R13,%RAX,1)       | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nADD $0x200,%RAX                        | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nCMP %RAX,%R8                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 534a13 <hypre_ParCSRRelax+0x4c03>  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "3% of peak computational performance is used (1.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = " - Reduce the number of division or square root instructions:\n  * If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast\n - Check whether you really need double precision. If not, switch to single precision to speedup execution\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 128.00 to 16.50 cycles (7.76x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 128.00 to 10.50 cycles (12.19x speedup).",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:78-79.\n",
      "It is main loop of related source loop which is unrolled by 8 (including vectorization).",
    },
    nb_paths = 1,
  },
}
