_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n<ul><li>VDIVPD: 8 occurrences</li></ul>",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n<ol><li>align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&amp;p, 64, size); }.</li><li>inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.</li></ol>",
          details = "<ul><li>VMOVUPD: 16 occurrences</li></ul>",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 16 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "16 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n<ul><li>64: addition or subtraction</li><li>64: divide</li></ul>The binary loop is loading 1536 bytes (192 double precision FP elements).\nThe binary loop is storing 512 bytes (64 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.06 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "<table><tr><td>nb instructions</td><td>35</td></tr><tr><td>nb uops</td><td>58</td></tr><tr><td>loop length</td><td>269</td></tr><tr><td>used x86 registers</td><td>5</td></tr><tr><td>used mmx registers</td><td>0</td></tr><tr><td>used xmm registers</td><td>0</td></tr><tr><td>used ymm registers</td><td>0</td></tr><tr><td>used zmm registers</td><td>16</td></tr><tr><td>nb stack references</td><td>0</td></tr></table>",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\n<table><tr><td>micro-operation queue</td><td>16.50 cycles</td></tr><tr><td>front end</td><td>16.50 cycles</td></tr></table>",
        },
        {
          title = "Back-end",
          txt = "<table><tr><th>      </th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th></tr><tr><td>uops</td><td>16.00</td><td>1.00</td><td>12.00</td><td>12.00</td><td>8.00</td><td>16.00</td><td>1.00</td><td>8.00</td></tr><tr><td>cycles</td><td>16.00</td><td>4.00</td><td>12.00</td><td>12.00</td><td>8.00</td><td>16.00</td><td>1.00</td><td>8.00</td></tr></table>\n<table><tr><td>Cycles executing div or sqrt instructions</td><td>128.00</td></tr><tr><td>Longest recurrence chain latency (RecMII)</td><td>1.00</td></tr></table>",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "<table><tr><td>FE+BE cycles</td><td>129.02</td></tr><tr><td>Stall cycles</td><td>112.11</td></tr><tr><td>RS full (events)</td><td>128.62</td></tr></table>",
        },
        {
          title = "Cycles summary",
          txt = "<table><tr><td>Front-end</td><td>16.50</td></tr><tr><td>Dispatch</td><td>16.00</td></tr><tr><td>DIV/SQRT</td><td>128.00</td></tr><tr><td>Data deps.</td><td>1.00</td></tr><tr><td>Overall L1</td><td>128.00</td></tr></table>",
        },
        {
          title = "Vectorization ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>100%</td></tr><tr><td>fma</td><td>NA (no fma vectorizable/vectorized instructions)</td></tr><tr><td>div/sqrt</td><td>100%</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Vector efficiency ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>100%</td></tr><tr><td>fma</td><td>NA (no fma vectorizable/vectorized instructions)</td></tr><tr><td>div/sqrt</td><td>100%</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 128.00 cycles. At this rate:\n<ul><li>9% of peak load performance is reached (12.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))</li><li>6% of peak store performance is reached (4.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))</li></ul>",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 534a13\n\n<table><tr><th>Instruction</th><th>Nb FU</th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th><th>Latency</th><th>Recip. throughput</th></tr><tr><td>VMOVUPD (%R14,%RAX,1),%ZMM7</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD (%R15,%RAX,1),%ZMM7,%ZMM3</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD (%R13,%RAX,1),%ZMM3,%ZMM4</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM4,(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x40(%R14,%RAX,1),%ZMM11</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x40(%R15,%RAX,1),%ZMM11,%ZMM9</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x40(%R13,%RAX,1),%ZMM9,%ZMM14</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM14,0x40(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x80(%R14,%RAX,1),%ZMM5</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x80(%R15,%RAX,1),%ZMM5,%ZMM0</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x80(%R13,%RAX,1),%ZMM0,%ZMM12</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM12,0x80(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0xc0(%R14,%RAX,1),%ZMM10</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0xc0(%R15,%RAX,1),%ZMM10,%ZMM13</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0xc0(%R13,%RAX,1),%ZMM13,%ZMM2</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM2,0xc0(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x100(%R14,%RAX,1),%ZMM15</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x100(%R15,%RAX,1),%ZMM15,%ZMM1</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x100(%R13,%RAX,1),%ZMM1,%ZMM6</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM6,0x100(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x140(%R14,%RAX,1),%ZMM8</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x140(%R15,%RAX,1),%ZMM8,%ZMM7</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x140(%R13,%RAX,1),%ZMM7,%ZMM3</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM3,0x140(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x180(%R14,%RAX,1),%ZMM4</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x180(%R15,%RAX,1),%ZMM4,%ZMM11</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x180(%R13,%RAX,1),%ZMM11,%ZMM9</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM9,0x180(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x1c0(%R14,%RAX,1),%ZMM14</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x1c0(%R15,%RAX,1),%ZMM14,%ZMM5</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x1c0(%R13,%RAX,1),%ZMM5,%ZMM0</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM0,0x1c0(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>ADD $0x200,%RAX</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>CMP %RAX,%R8</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>JNE 534a13 <hypre_ParCSRRelax+0x4c03></td><td>1</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>0.50-1</td></tr></table>",
        },
      },
      header = {
        "3% of peak computational performance is used (1.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = "<ul><li>Reduce the number of division or square root instructions:\n<ul><li>If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast</li></ul></li><li>Check whether you really need double precision. If not, switch to single precision to speedup execution</li></ul>",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 128.00 to 16.50 cycles (7.76x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 128.00 to 10.50 cycles (12.19x speedup).",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n<ul><li>VDIVPD: 8 occurrences</li></ul>",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Use vector aligned instructions:\n<ol><li>align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&amp;p, 64, size); }.</li><li>inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.</li></ol>",
          details = "<ul><li>VMOVUPD: 16 occurrences</li></ul>",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 16 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "16 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n<ul><li>64: addition or subtraction</li><li>64: divide</li></ul>The binary loop is loading 1536 bytes (192 double precision FP elements).\nThe binary loop is storing 512 bytes (64 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.06 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "<table><tr><td>nb instructions</td><td>35</td></tr><tr><td>nb uops</td><td>58</td></tr><tr><td>loop length</td><td>269</td></tr><tr><td>used x86 registers</td><td>5</td></tr><tr><td>used mmx registers</td><td>0</td></tr><tr><td>used xmm registers</td><td>0</td></tr><tr><td>used ymm registers</td><td>0</td></tr><tr><td>used zmm registers</td><td>16</td></tr><tr><td>nb stack references</td><td>0</td></tr></table>",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\n<table><tr><td>micro-operation queue</td><td>16.50 cycles</td></tr><tr><td>front end</td><td>16.50 cycles</td></tr></table>",
        },
        {
          title = "Back-end",
          txt = "<table><tr><th>      </th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th></tr><tr><td>uops</td><td>16.00</td><td>1.00</td><td>12.00</td><td>12.00</td><td>8.00</td><td>16.00</td><td>1.00</td><td>8.00</td></tr><tr><td>cycles</td><td>16.00</td><td>4.00</td><td>12.00</td><td>12.00</td><td>8.00</td><td>16.00</td><td>1.00</td><td>8.00</td></tr></table>\n<table><tr><td>Cycles executing div or sqrt instructions</td><td>128.00</td></tr><tr><td>Longest recurrence chain latency (RecMII)</td><td>1.00</td></tr></table>",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "<table><tr><td>FE+BE cycles</td><td>129.02</td></tr><tr><td>Stall cycles</td><td>112.11</td></tr><tr><td>RS full (events)</td><td>128.62</td></tr></table>",
        },
        {
          title = "Cycles summary",
          txt = "<table><tr><td>Front-end</td><td>16.50</td></tr><tr><td>Dispatch</td><td>16.00</td></tr><tr><td>DIV/SQRT</td><td>128.00</td></tr><tr><td>Data deps.</td><td>1.00</td></tr><tr><td>Overall L1</td><td>128.00</td></tr></table>",
        },
        {
          title = "Vectorization ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>100%</td></tr><tr><td>fma</td><td>NA (no fma vectorizable/vectorized instructions)</td></tr><tr><td>div/sqrt</td><td>100%</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Vector efficiency ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>100%</td></tr><tr><td>fma</td><td>NA (no fma vectorizable/vectorized instructions)</td></tr><tr><td>div/sqrt</td><td>100%</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 128.00 cycles. At this rate:\n<ul><li>9% of peak load performance is reached (12.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))</li><li>6% of peak store performance is reached (4.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))</li></ul>",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 534a13\n\n<table><tr><th>Instruction</th><th>Nb FU</th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th><th>Latency</th><th>Recip. throughput</th></tr><tr><td>VMOVUPD (%R14,%RAX,1),%ZMM7</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD (%R15,%RAX,1),%ZMM7,%ZMM3</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD (%R13,%RAX,1),%ZMM3,%ZMM4</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM4,(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x40(%R14,%RAX,1),%ZMM11</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x40(%R15,%RAX,1),%ZMM11,%ZMM9</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x40(%R13,%RAX,1),%ZMM9,%ZMM14</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM14,0x40(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x80(%R14,%RAX,1),%ZMM5</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x80(%R15,%RAX,1),%ZMM5,%ZMM0</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x80(%R13,%RAX,1),%ZMM0,%ZMM12</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM12,0x80(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0xc0(%R14,%RAX,1),%ZMM10</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0xc0(%R15,%RAX,1),%ZMM10,%ZMM13</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0xc0(%R13,%RAX,1),%ZMM13,%ZMM2</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM2,0xc0(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x100(%R14,%RAX,1),%ZMM15</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x100(%R15,%RAX,1),%ZMM15,%ZMM1</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x100(%R13,%RAX,1),%ZMM1,%ZMM6</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM6,0x100(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x140(%R14,%RAX,1),%ZMM8</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x140(%R15,%RAX,1),%ZMM8,%ZMM7</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x140(%R13,%RAX,1),%ZMM7,%ZMM3</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM3,0x140(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x180(%R14,%RAX,1),%ZMM4</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x180(%R15,%RAX,1),%ZMM4,%ZMM11</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x180(%R13,%RAX,1),%ZMM11,%ZMM9</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM9,0x180(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x1c0(%R14,%RAX,1),%ZMM14</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VDIVPD 0x1c0(%R15,%RAX,1),%ZMM14,%ZMM5</td><td>4</td><td>2.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>24</td><td>16</td></tr><tr><td>VADDPD 0x1c0(%R13,%RAX,1),%ZMM5,%ZMM0</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM0,0x1c0(%R13,%RAX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>ADD $0x200,%RAX</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>CMP %RAX,%R8</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>JNE 534a13 <hypre_ParCSRRelax+0x4c03></td><td>1</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>0.50-1</td></tr></table>",
        },
      },
      header = {
        "3% of peak computational performance is used (1.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = "<ul><li>Reduce the number of division or square root instructions:\n<ul><li>If denominator is constant over iterations, use reciprocal (replace x/y with x*(1/y)). Check precision impact. This will be done by your compiler with ffast-math or Ofast</li></ul></li><li>Check whether you really need double precision. If not, switch to single precision to speedup execution</li></ul>",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by execution of divide and square root operations (the divide/square root unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 128.00 to 16.50 cycles (7.76x speedup).\n",
        },
      },
      potential = {
        {
          title = "Expensive FP math instructions/calls",
          txt = "Detected performance impact from expensive FP math instructions/calls.\nBy removing/reexpressing them, you can lower the cost of an iteration from 128.00 to 10.50 cycles (12.19x speedup).",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:78-79.\n",
      "It is main loop of related source loop which is unrolled by 8 (including vectorization).",
    },
    nb_paths = 1,
  },
}
