_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant non-unit stride: 1 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 1024 bytes.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 43\nnb uops            : 34\nloop length        : 304\nused x86 registers : 2\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 16\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 8.50 cycles\nfront end            : 8.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 8.00 | 1.00 | 8.00 | 8.00 | 0.00 | 8.00 | 1.00 | 0.00\ncycles | 8.00 | 1.00 | 8.00 | 8.00 | 0.00 | 8.00 | 1.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 8.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles: 8.63\nStall cycles: 0.00\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 8.50\nDispatch  : 8.00\nData deps.: 8.00\nOverall L1: 8.50\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Detected masked instructions: assuming all mask elements are active.\nAssuming all data fit into the L1 cache, each iteration of the binary loop takes 8.50 cycles. At this rate:\n - 94% of peak load performance is reached (120.47 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Performance is limited by instruction throughput (loading/decoding program instructions to execution core) (front-end is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.50 to 8.00 cycles (1.06x speedup).\n",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 59a343\n\nInstruction                                 | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU64 0x8(%R8),%ZMM11                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQU64 0x48(%R8),%ZMM15                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nADD $0x200,%R8                              | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nVMOVDQU64 -0x178(%R8),%ZMM6                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQU64 -0x138(%R8),%ZMM10                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0x200(%R8),%ZMM11,%ZMM12            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x1c0(%R8),%ZMM15,%ZMM5             | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x180(%R8),%ZMM6,%ZMM7              | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM12,%K1\nVPCMPNLEQ %ZMM2,%ZMM5,%K2\nVPCMPNLEQ %ZMM2,%ZMM7,%K3\nVMOVDQA64 %ZMM1,%ZMM13{%K1}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVMOVDQA64 %ZMM1,%ZMM3{%K2}{z}               | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM13,%ZMM0,%ZMM14                  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x140(%R8),%ZMM10,%ZMM0             | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0xf8(%R8),%ZMM13                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQA64 %ZMM1,%ZMM8{%K3}{z}               | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM3,%ZMM14,%ZMM4                   | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0xb8(%R8),%ZMM3                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0x100(%R8),%ZMM13,%ZMM14            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM0,%K4\nVPSUBQ %ZMM8,%ZMM4,%ZMM9                    | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0x78(%R8),%ZMM8                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0xc0(%R8),%ZMM3,%ZMM4               | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM14,%K5\nVMOVDQA64 %ZMM1,%ZMM11{%K4}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPCMPNLEQ %ZMM2,%ZMM4,%K6\nVPSUBQ %ZMM11,%ZMM9,%ZMM12                  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0x38(%R8),%ZMM11                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0x80(%R8),%ZMM8,%ZMM9               | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQA64 %ZMM1,%ZMM15{%K5}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM15,%ZMM12,%ZMM5                  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x40(%R8),%ZMM11,%ZMM12             | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM9,%K7\nVMOVDQA64 %ZMM1,%ZMM6{%K6}{z}               | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPCMPNLEQ %ZMM2,%ZMM12,%K1\nVPSUBQ %ZMM6,%ZMM5,%ZMM7                    | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQA64 %ZMM1,%ZMM10{%K7}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM10,%ZMM7,%ZMM0                   | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQA64 %ZMM1,%ZMM13{%K1}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM13,%ZMM0,%ZMM0                   | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nCMP %R8,%RSI                                | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 59a343 <hypre_CSRMatrixSetRownnz+0x193> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "Warnings:\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM12,%K1] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM5,%K2] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM7,%K3] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM0,%K4] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM14,%K5] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM4,%K6] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM9,%K7] is unknown\n",
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          workaround = "If your loop is irregular, try to remove or hoist conditional structures out of your loop. If it mixes elements of different sizes, try to uniformize them.",
          details = "Vector registers are partially exploited, which is expected if your loop is irregular or mixes elements of different sizes.",
          title = "Masked instructions",
          txt = "Detected masked instructions.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant non-unit stride: 1 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 1024 bytes.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 43\nnb uops            : 34\nloop length        : 304\nused x86 registers : 2\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 16\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 8.50 cycles\nfront end            : 8.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 8.00 | 1.00 | 8.00 | 8.00 | 0.00 | 8.00 | 1.00 | 0.00\ncycles | 8.00 | 1.00 | 8.00 | 8.00 | 0.00 | 8.00 | 1.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 8.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles: 8.63\nStall cycles: 0.00\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 8.50\nDispatch  : 8.00\nData deps.: 8.00\nOverall L1: 8.50\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Detected masked instructions: assuming all mask elements are active.\nAssuming all data fit into the L1 cache, each iteration of the binary loop takes 8.50 cycles. At this rate:\n - 94% of peak load performance is reached (120.47 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Performance is limited by instruction throughput (loading/decoding program instructions to execution core) (front-end is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.50 to 8.00 cycles (1.06x speedup).\n",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 59a343\n\nInstruction                                 | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU64 0x8(%R8),%ZMM11                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQU64 0x48(%R8),%ZMM15                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nADD $0x200,%R8                              | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nVMOVDQU64 -0x178(%R8),%ZMM6                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQU64 -0x138(%R8),%ZMM10                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0x200(%R8),%ZMM11,%ZMM12            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x1c0(%R8),%ZMM15,%ZMM5             | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x180(%R8),%ZMM6,%ZMM7              | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM12,%K1\nVPCMPNLEQ %ZMM2,%ZMM5,%K2\nVPCMPNLEQ %ZMM2,%ZMM7,%K3\nVMOVDQA64 %ZMM1,%ZMM13{%K1}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVMOVDQA64 %ZMM1,%ZMM3{%K2}{z}               | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM13,%ZMM0,%ZMM14                  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x140(%R8),%ZMM10,%ZMM0             | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0xf8(%R8),%ZMM13                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQA64 %ZMM1,%ZMM8{%K3}{z}               | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM3,%ZMM14,%ZMM4                   | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0xb8(%R8),%ZMM3                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0x100(%R8),%ZMM13,%ZMM14            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM0,%K4\nVPSUBQ %ZMM8,%ZMM4,%ZMM9                    | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0x78(%R8),%ZMM8                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0xc0(%R8),%ZMM3,%ZMM4               | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM14,%K5\nVMOVDQA64 %ZMM1,%ZMM11{%K4}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPCMPNLEQ %ZMM2,%ZMM4,%K6\nVPSUBQ %ZMM11,%ZMM9,%ZMM12                  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQU64 -0x38(%R8),%ZMM11                 | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVPSUBQ -0x80(%R8),%ZMM8,%ZMM9               | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQA64 %ZMM1,%ZMM15{%K5}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM15,%ZMM12,%ZMM5                  | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPSUBQ -0x40(%R8),%ZMM11,%ZMM12             | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVPCMPNLEQ %ZMM2,%ZMM9,%K7\nVMOVDQA64 %ZMM1,%ZMM6{%K6}{z}               | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPCMPNLEQ %ZMM2,%ZMM12,%K1\nVPSUBQ %ZMM6,%ZMM5,%ZMM7                    | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQA64 %ZMM1,%ZMM10{%K7}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM10,%ZMM7,%ZMM0                   | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nVMOVDQA64 %ZMM1,%ZMM13{%K1}{z}              | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPSUBQ %ZMM13,%ZMM0,%ZMM0                   | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0  | 1       | 0.50\nCMP %R8,%RSI                                | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 59a343 <hypre_CSRMatrixSetRownnz+0x193> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "Warnings:\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM12,%K1] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM5,%K2] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM7,%K3] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM0,%K4] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM14,%K5] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM4,%K6] is unknown\n - The number of fused uops of the instruction [VPCMPNLEQ	%ZMM2,%ZMM9,%K7] is unknown\n",
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          workaround = "If your loop is irregular, try to remove or hoist conditional structures out of your loop. If it mixes elements of different sizes, try to uniformize them.",
          details = "Vector registers are partially exploited, which is expected if your loop is irregular or mixes elements of different sizes.",
          title = "Masked instructions",
          txt = "Detected masked instructions.",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/csr_matrix.c:145-148.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
