_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          workaround = "Use vector aligned instructions:\n<ol><li>align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&amp;p, 64, size); }.</li><li>inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.</li></ol>",
          details = "<ul><li>VMOVUPD: 16 occurrences</li></ul>",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 16 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "8 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n<ul><li>64: addition or subtraction (all inside FMA instructions)</li><li>64: multiply (all inside FMA instructions)</li></ul>The binary loop is loading 1024 bytes (128 double precision FP elements).\nThe binary loop is storing 512 bytes (64 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.08 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "<table><tr><td>nb instructions</td><td>27</td></tr><tr><td>nb uops</td><td>26</td></tr><tr><td>loop length</td><td>205</td></tr><tr><td>used x86 registers</td><td>4</td></tr><tr><td>used mmx registers</td><td>0</td></tr><tr><td>used xmm registers</td><td>0</td></tr><tr><td>used ymm registers</td><td>0</td></tr><tr><td>used zmm registers</td><td>9</td></tr><tr><td>nb stack references</td><td>0</td></tr></table>",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\n<table><tr><td>micro-operation queue</td><td>8.50 cycles</td></tr><tr><td>front end</td><td>8.50 cycles</td></tr></table>",
        },
        {
          title = "Back-end",
          txt = "<table><tr><th>      </th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th></tr><tr><td>uops</td><td>4.00</td><td>1.00</td><td>8.00</td><td>8.00</td><td>8.00</td><td>4.00</td><td>1.00</td><td>8.00</td></tr><tr><td>cycles</td><td>4.00</td><td>4.00</td><td>8.00</td><td>8.00</td><td>8.00</td><td>4.00</td><td>1.00</td><td>8.00</td></tr></table>\n<table><tr><td>Cycles executing div or sqrt instructions</td><td>NA</td></tr><tr><td>Longest recurrence chain latency (RecMII)</td><td>1.00</td></tr></table>",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "<table><tr><td>FE+BE cycles</td><td>8.66</td></tr><tr><td>Stall cycles</td><td>0.00</td></tr></table>",
        },
        {
          title = "Cycles summary",
          txt = "<table><tr><td>Front-end</td><td>8.50</td></tr><tr><td>Dispatch</td><td>8.00</td></tr><tr><td>Data deps.</td><td>1.00</td></tr><tr><td>Overall L1</td><td>8.50</td></tr></table>",
        },
        {
          title = "Vectorization ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>NA (no add-sub vectorizable/vectorized instructions)</td></tr><tr><td>fma</td><td>100%</td></tr><tr><td>div/sqrt</td><td>NA (no div/sqrt vectorizable/vectorized instructions)</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Vector efficiency ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>NA (no add-sub vectorizable/vectorized instructions)</td></tr><tr><td>fma</td><td>100%</td></tr><tr><td>div/sqrt</td><td>NA (no div/sqrt vectorizable/vectorized instructions)</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.50 cycles. At this rate:\n<ul><li>94% of peak load performance is reached (120.47 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))</li><li>94% of peak store performance is reached (60.24 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))</li></ul>",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Performance is limited by instruction throughput (loading/decoding program instructions to execution core) (front-end is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.50 to 8.00 cycles (1.06x speedup).\n",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 5a8684\n\n<table><tr><th>Instruction</th><th>Nb FU</th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th><th>Latency</th><th>Recip. throughput</th></tr><tr><td>VMOVUPD (%R9,%RBX,1),%ZMM9</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD (%R8,%RBX,1),%ZMM1,%ZMM9</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM9,(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x40(%R9,%RBX,1),%ZMM10</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x40(%R8,%RBX,1),%ZMM1,%ZMM10</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM10,0x40(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x80(%R9,%RBX,1),%ZMM11</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x80(%R8,%RBX,1),%ZMM1,%ZMM11</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM11,0x80(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0xc0(%R9,%RBX,1),%ZMM12</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0xc0(%R8,%RBX,1),%ZMM1,%ZMM12</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM12,0xc0(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x100(%R9,%RBX,1),%ZMM13</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x100(%R8,%RBX,1),%ZMM1,%ZMM13</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM13,0x100(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x140(%R9,%RBX,1),%ZMM14</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x140(%R8,%RBX,1),%ZMM1,%ZMM14</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM14,0x140(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x180(%R9,%RBX,1),%ZMM15</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x180(%R8,%RBX,1),%ZMM1,%ZMM15</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM15,0x180(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x1c0(%R9,%RBX,1),%ZMM2</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x1c0(%R8,%RBX,1),%ZMM1,%ZMM2</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM2,0x1c0(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>ADD $0x200,%RBX</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>CMP %R13,%RBX</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>JNE 5a8684 <hypre_SeqVectorAxpy._omp_fn.0+0x304></td><td>1</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>0.50-1</td></tr></table>",
        },
      },
      header = {
        "47% of peak computational performance is used (15.06 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "FMA",
          txt = "Detected 64 FMA (fused multiply-add) operations.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          workaround = "Use vector aligned instructions:\n<ol><li>align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&amp;p, 64, size); }.</li><li>inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.</li></ol>",
          details = "<ul><li>VMOVUPD: 16 occurrences</li></ul>",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 16 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "8 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n<ul><li>64: addition or subtraction (all inside FMA instructions)</li><li>64: multiply (all inside FMA instructions)</li></ul>The binary loop is loading 1024 bytes (128 double precision FP elements).\nThe binary loop is storing 512 bytes (64 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.08 FP operations per loaded or stored byte.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "<table><tr><td>nb instructions</td><td>27</td></tr><tr><td>nb uops</td><td>26</td></tr><tr><td>loop length</td><td>205</td></tr><tr><td>used x86 registers</td><td>4</td></tr><tr><td>used mmx registers</td><td>0</td></tr><tr><td>used xmm registers</td><td>0</td></tr><tr><td>used ymm registers</td><td>0</td></tr><tr><td>used zmm registers</td><td>9</td></tr><tr><td>nb stack references</td><td>0</td></tr></table>",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\n<table><tr><td>micro-operation queue</td><td>8.50 cycles</td></tr><tr><td>front end</td><td>8.50 cycles</td></tr></table>",
        },
        {
          title = "Back-end",
          txt = "<table><tr><th>      </th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th></tr><tr><td>uops</td><td>4.00</td><td>1.00</td><td>8.00</td><td>8.00</td><td>8.00</td><td>4.00</td><td>1.00</td><td>8.00</td></tr><tr><td>cycles</td><td>4.00</td><td>4.00</td><td>8.00</td><td>8.00</td><td>8.00</td><td>4.00</td><td>1.00</td><td>8.00</td></tr></table>\n<table><tr><td>Cycles executing div or sqrt instructions</td><td>NA</td></tr><tr><td>Longest recurrence chain latency (RecMII)</td><td>1.00</td></tr></table>",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "<table><tr><td>FE+BE cycles</td><td>8.66</td></tr><tr><td>Stall cycles</td><td>0.00</td></tr></table>",
        },
        {
          title = "Cycles summary",
          txt = "<table><tr><td>Front-end</td><td>8.50</td></tr><tr><td>Dispatch</td><td>8.00</td></tr><tr><td>Data deps.</td><td>1.00</td></tr><tr><td>Overall L1</td><td>8.50</td></tr></table>",
        },
        {
          title = "Vectorization ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>NA (no add-sub vectorizable/vectorized instructions)</td></tr><tr><td>fma</td><td>100%</td></tr><tr><td>div/sqrt</td><td>NA (no div/sqrt vectorizable/vectorized instructions)</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Vector efficiency ratios",
          txt = "<table><tr><td>all</td><td>100%</td></tr><tr><td>load</td><td>100%</td></tr><tr><td>store</td><td>100%</td></tr><tr><td>mul</td><td>NA (no mul vectorizable/vectorized instructions)</td></tr><tr><td>add-sub</td><td>NA (no add-sub vectorizable/vectorized instructions)</td></tr><tr><td>fma</td><td>100%</td></tr><tr><td>div/sqrt</td><td>NA (no div/sqrt vectorizable/vectorized instructions)</td></tr><tr><td>other</td><td>NA (no other vectorizable/vectorized instructions)</td></tr></table>",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.50 cycles. At this rate:\n<ul><li>94% of peak load performance is reached (120.47 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))</li><li>94% of peak store performance is reached (60.24 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))</li></ul>",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Performance is limited by instruction throughput (loading/decoding program instructions to execution core) (front-end is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.50 to 8.00 cycles (1.06x speedup).\n",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 5a8684\n\n<table><tr><th>Instruction</th><th>Nb FU</th><th>P0</th><th>P1</th><th>P2</th><th>P3</th><th>P4</th><th>P5</th><th>P6</th><th>P7</th><th>Latency</th><th>Recip. throughput</th></tr><tr><td>VMOVUPD (%R9,%RBX,1),%ZMM9</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD (%R8,%RBX,1),%ZMM1,%ZMM9</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM9,(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x40(%R9,%RBX,1),%ZMM10</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x40(%R8,%RBX,1),%ZMM1,%ZMM10</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM10,0x40(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x80(%R9,%RBX,1),%ZMM11</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x80(%R8,%RBX,1),%ZMM1,%ZMM11</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM11,0x80(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0xc0(%R9,%RBX,1),%ZMM12</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0xc0(%R8,%RBX,1),%ZMM1,%ZMM12</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM12,0xc0(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x100(%R9,%RBX,1),%ZMM13</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x100(%R8,%RBX,1),%ZMM1,%ZMM13</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM13,0x100(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x140(%R9,%RBX,1),%ZMM14</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x140(%R8,%RBX,1),%ZMM1,%ZMM14</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM14,0x140(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x180(%R9,%RBX,1),%ZMM15</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x180(%R8,%RBX,1),%ZMM1,%ZMM15</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM15,0x180(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>VMOVUPD 0x1c0(%R9,%RBX,1),%ZMM2</td><td>1</td><td>0</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>5-6</td><td>0.50</td></tr><tr><td>VFMADD213PD 0x1c0(%R8,%RBX,1),%ZMM1,%ZMM2</td><td>1</td><td>0.50</td><td>0</td><td>0.50</td><td>0.50</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>4</td><td>0.50</td></tr><tr><td>VMOVUPD %ZMM2,0x1c0(%R8,%RBX,1)</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>0.33</td><td>1</td><td>0</td><td>0</td><td>0.33</td><td>3</td><td>1</td></tr><tr><td>ADD $0x200,%RBX</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>CMP %R13,%RBX</td><td>1</td><td>0.25</td><td>0.25</td><td>0</td><td>0</td><td>0</td><td>0.25</td><td>0.25</td><td>0</td><td>1</td><td>0.25</td></tr><tr><td>JNE 5a8684 <hypre_SeqVectorAxpy._omp_fn.0+0x304></td><td>1</td><td>0.50</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0.50</td><td>0</td><td>0</td><td>0.50-1</td></tr></table>",
        },
      },
      header = {
        "47% of peak computational performance is used (15.06 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "FMA",
          txt = "Detected 64 FMA (fused multiply-add) operations.",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/vector.c:452.\n",
      "It is main loop of related source loop which is unrolled by 8 (including vectorization).",
    },
    nb_paths = 1,
  },
}
