_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VGATHERQPD: 8 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Try to remove indirect accesses. If applicable, precompute elements out of the innermost loop.",
          details = " - Irregular (variable stride) or indirect: 8 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          workaround = "Try to simplify your code and/or replace indirect accesses with unit-stride ones.",
          details = " - VGATHERQPD: 8 occurrences\n",
          title = "Gather/scatter instructions",
          txt = "Detected gather/scatter instructions (typically caused by indirect accesses).",
        },
        {
          title = "Type of elements and instruction set",
          txt = "8 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n - 64: addition or subtraction (all inside FMA instructions)\n - 64: multiply (all inside FMA instructions)\nThe binary loop is loading 1536 bytes (192 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.08 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 35\nnb uops            : 58\nloop length        : 229\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 15\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 16.50 cycles\nfront end            : 16.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1   | P2    | P3    | P4   | P5    | P6   | P7\n------------------------------------------------------------------\nuops   | 16.00 | 1.00 | 40.00 | 40.00 | 0.00 | 16.00 | 1.00 | 0.00\ncycles | 16.00 | 4.00 | 40.00 | 40.00 | 0.00 | 16.00 | 1.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 58.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 102.55\nStall cycles    : 89.69\nLB full (events): 93.20\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 16.50\nDispatch  : 40.00\nData deps.: 58.00\nOverall L1: 58.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : NA (no other vectorizable/vectorized instructions)\nFP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : NA (no other vectorizable/vectorized instructions)\nFP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 58.00 cycles. At this rate:\n - 20% of peak load performance is reached (26.48 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 5a1c9b\n\nInstruction                                                   | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU64 (%RCX,%RAX,1),%ZMM6                                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0x40(%RCX,%RAX,1),%ZMM11                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K2                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0x80(%RCX,%RAX,1),%ZMM12                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K1                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0xc0(%RCX,%RAX,1),%ZMM4                             | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K5                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVGATHERQPD (%RBX,%ZMM6,8),%ZMM14{%K4}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVGATHERQPD (%RBX,%ZMM11,8),%ZMM7{%K2}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVMOVDQU64 0x100(%RCX,%RAX,1),%ZMM13                           | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K6                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVFNMADD231PD (%R9,%RAX,1),%ZMM14,%ZMM0                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVGATHERQPD (%RBX,%ZMM12,8),%ZMM2{%K1}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVMOVDQU64 0x140(%RCX,%RAX,1),%ZMM1                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K3                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVGATHERQPD (%RBX,%ZMM13,8),%ZMM10{%K6}                        | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVMOVDQU64 0x180(%RCX,%RAX,1),%ZMM3                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K7                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0x1c0(%RCX,%RAX,1),%ZMM8                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVGATHERQPD (%RBX,%ZMM1,8),%ZMM9{%K3}                          | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nKMOVB %K0,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVGATHERQPD (%RBX,%ZMM3,8),%ZMM15{%K7}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVFNMADD231PD 0x40(%R9,%RAX,1),%ZMM7,%ZMM0                     | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x80(%R9,%RAX,1),%ZMM0,%ZMM2                     | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVGATHERQPD (%RBX,%ZMM4,8),%ZMM0{%K5}                          | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVFNMADD132PD 0xc0(%R9,%RAX,1),%ZMM2,%ZMM0                     | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x100(%R9,%RAX,1),%ZMM0,%ZMM10                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVGATHERQPD (%RBX,%ZMM8,8),%ZMM0{%K4}                          | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVFNMADD132PD 0x140(%R9,%RAX,1),%ZMM10,%ZMM9                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x180(%R9,%RAX,1),%ZMM9,%ZMM15                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x1c0(%R9,%RAX,1),%ZMM15,%ZMM0                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nADD $0x200,%RAX                                               | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nCMP %RAX,%R10                                                 | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 5a1c9b <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1beb> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "6% of peak computational performance is used (2.21 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "FMA",
          txt = "Detected 64 FMA (fused multiply-add) operations.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VGATHERQPD: 8 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Try to remove indirect accesses. If applicable, precompute elements out of the innermost loop.",
          details = " - Irregular (variable stride) or indirect: 8 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          workaround = "Try to simplify your code and/or replace indirect accesses with unit-stride ones.",
          details = " - VGATHERQPD: 8 occurrences\n",
          title = "Gather/scatter instructions",
          txt = "Detected gather/scatter instructions (typically caused by indirect accesses).",
        },
        {
          title = "Type of elements and instruction set",
          txt = "8 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n - 64: addition or subtraction (all inside FMA instructions)\n - 64: multiply (all inside FMA instructions)\nThe binary loop is loading 1536 bytes (192 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.08 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 35\nnb uops            : 58\nloop length        : 229\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 15\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 16.50 cycles\nfront end            : 16.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1   | P2    | P3    | P4   | P5    | P6   | P7\n------------------------------------------------------------------\nuops   | 16.00 | 1.00 | 40.00 | 40.00 | 0.00 | 16.00 | 1.00 | 0.00\ncycles | 16.00 | 4.00 | 40.00 | 40.00 | 0.00 | 16.00 | 1.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 58.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 102.55\nStall cycles    : 89.69\nLB full (events): 93.20\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 16.50\nDispatch  : 40.00\nData deps.: 58.00\nOverall L1: 58.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : NA (no other vectorizable/vectorized instructions)\nFP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : NA (no other vectorizable/vectorized instructions)\nFP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 58.00 cycles. At this rate:\n - 20% of peak load performance is reached (26.48 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 5a1c9b\n\nInstruction                                                   | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU64 (%RCX,%RAX,1),%ZMM6                                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0x40(%RCX,%RAX,1),%ZMM11                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K2                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0x80(%RCX,%RAX,1),%ZMM12                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K1                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0xc0(%RCX,%RAX,1),%ZMM4                             | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K5                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVGATHERQPD (%RBX,%ZMM6,8),%ZMM14{%K4}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVGATHERQPD (%RBX,%ZMM11,8),%ZMM7{%K2}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVMOVDQU64 0x100(%RCX,%RAX,1),%ZMM13                           | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K6                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVFNMADD231PD (%R9,%RAX,1),%ZMM14,%ZMM0                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVGATHERQPD (%RBX,%ZMM12,8),%ZMM2{%K1}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVMOVDQU64 0x140(%RCX,%RAX,1),%ZMM1                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K3                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVGATHERQPD (%RBX,%ZMM13,8),%ZMM10{%K6}                        | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVMOVDQU64 0x180(%RCX,%RAX,1),%ZMM3                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKMOVB %K0,%K7                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVDQU64 0x1c0(%RCX,%RAX,1),%ZMM8                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVGATHERQPD (%RBX,%ZMM1,8),%ZMM9{%K3}                          | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nKMOVB %K0,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVGATHERQPD (%RBX,%ZMM3,8),%ZMM15{%K7}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVFNMADD231PD 0x40(%R9,%RAX,1),%ZMM7,%ZMM0                     | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x80(%R9,%RAX,1),%ZMM0,%ZMM2                     | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVGATHERQPD (%RBX,%ZMM4,8),%ZMM0{%K5}                          | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVFNMADD132PD 0xc0(%R9,%RAX,1),%ZMM2,%ZMM0                     | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x100(%R9,%RAX,1),%ZMM0,%ZMM10                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVGATHERQPD (%RBX,%ZMM8,8),%ZMM0{%K4}                          | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0  | 21      | 5\nVFNMADD132PD 0x140(%R9,%RAX,1),%ZMM10,%ZMM9                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x180(%R9,%RAX,1),%ZMM9,%ZMM15                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFNMADD132PD 0x1c0(%R9,%RAX,1),%ZMM15,%ZMM0                   | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nADD $0x200,%RAX                                               | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nCMP %RAX,%R10                                                 | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 5a1c9b <hypre_CSRMatrixMatvecOutOfPlace._omp_fn.6+0x1beb> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "6% of peak computational performance is used (2.21 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "FMA",
          txt = "Detected 64 FMA (fused multiply-add) operations.",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c:310-312.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
