_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant non-unit stride: 1 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 72 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 17\nnb uops            : 16\nloop length        : 78\nused x86 registers : 4\nused mmx registers : 0\nused xmm registers : 1\nused ymm registers : 5\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 4.00 cycles\nfront end            : 4.00 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 2.00 | 2.00 | 1.50 | 1.50 | 0.00 | 6.00 | 2.00 | 0.00\ncycles | 2.00 | 2.00 | 1.50 | 1.50 | 0.00 | 6.00 | 2.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 6.21\nStall cycles     : 1.78\nRS full (events) : 3.02\nPRF full (events): 0.90\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 4.00\nDispatch  : 6.00\nData deps.: 1.00\nOverall L1: 6.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 63%\nload    : 66%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 0%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 71%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 36%\nload    : 37%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 12%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 39%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 6.00 cycles. At this rate:\n - 9% of peak load performance is reached (12.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 4e66c0\n\nInstruction                                | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU (%RDX),%YMM1                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQU 0x20(%RDX),%YMM2                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVQ %RCX,%XMM3                           | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 1       | 1\nVMOVDQA %YMM1,%YMM4                        | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPERMT2Q %YMM3,%YMM0,%YMM4                 | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nVALIGNQ $0x3,%YMM1,%YMM2,%YMM3             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nVPCMPGTQ %YMM3,%YMM2,%K0                   | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nKSHIFTLB $0x4,%K0,%K0                      | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 4       | 1\nVPCMPGTQ %YMM4,%YMM1,%K1                   | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nKORB %K0,%K1,%K0                           | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nKMOVB %K0,%ECX                             | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 2       | 1\nPOPCNT %RCX,%RCX                           | 1     | 0    | 1    | 0    | 0    | 0  | 0    | 0    | 0  | 3       | 1\nADD %RCX,%RAX                              | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nMOV 0x38(%RDX),%RCX                        | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 4-5     | 0.50\nADD $0x40,%RDX                             | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nDEC %RSI                                   | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 4e66c0 <hypre_CSRMatrixSetRownnz+0x70> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          title = "Code clean check",
          txt = "Detected a slowdown caused by scalar integer instructions (typically used for address computation).\nBy removing them, you can lower the cost of an iteration from 6.00 to 5.00 cycles (1.20x speedup).",
        },
        {
          workaround = "Read the \"512-bits vectorization on Skylake SP\" report at \"Potential\" confidence level.",
          details = "Store and arithmetical SSE/AVX instructions are used in scalar version (process only one data element in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is probably not vectorized.\nOnly 36% of vector register length is used (average across all SSE/AVX instructions).\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "512-bits vectorization on Skylake SP and Icelake Server",
          txt = "On Gold 5122, 6xxx and Platinum Skylake processors and Icelake Server processors, performance can be improved by using 512-bits vectorization if the number of vectorized loops is high and with high trip count.\n",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant non-unit stride: 1 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 72 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 17\nnb uops            : 16\nloop length        : 78\nused x86 registers : 4\nused mmx registers : 0\nused xmm registers : 1\nused ymm registers : 5\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 4.00 cycles\nfront end            : 4.00 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 2.00 | 2.00 | 1.50 | 1.50 | 0.00 | 6.00 | 2.00 | 0.00\ncycles | 2.00 | 2.00 | 1.50 | 1.50 | 0.00 | 6.00 | 2.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 6.21\nStall cycles     : 1.78\nRS full (events) : 3.02\nPRF full (events): 0.90\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 4.00\nDispatch  : 6.00\nData deps.: 1.00\nOverall L1: 6.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 63%\nload    : 66%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 0%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 71%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 36%\nload    : 37%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : 12%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 39%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 6.00 cycles. At this rate:\n - 9% of peak load performance is reached (12.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 4e66c0\n\nInstruction                                | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU (%RDX),%YMM1                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVDQU 0x20(%RDX),%YMM2                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nVMOVQ %RCX,%XMM3                           | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 1       | 1\nVMOVDQA %YMM1,%YMM4                        | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPERMT2Q %YMM3,%YMM0,%YMM4                 | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nVALIGNQ $0x3,%YMM1,%YMM2,%YMM3             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nVPCMPGTQ %YMM3,%YMM2,%K0                   | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nKSHIFTLB $0x4,%K0,%K0                      | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 4       | 1\nVPCMPGTQ %YMM4,%YMM1,%K1                   | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nKORB %K0,%K1,%K0                           | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nKMOVB %K0,%ECX                             | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 2       | 1\nPOPCNT %RCX,%RCX                           | 1     | 0    | 1    | 0    | 0    | 0  | 0    | 0    | 0  | 3       | 1\nADD %RCX,%RAX                              | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nMOV 0x38(%RDX),%RCX                        | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 4-5     | 0.50\nADD $0x40,%RDX                             | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nDEC %RSI                                   | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 4e66c0 <hypre_CSRMatrixSetRownnz+0x70> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          title = "Code clean check",
          txt = "Detected a slowdown caused by scalar integer instructions (typically used for address computation).\nBy removing them, you can lower the cost of an iteration from 6.00 to 5.00 cycles (1.20x speedup).",
        },
        {
          workaround = "Read the \"512-bits vectorization on Skylake SP\" report at \"Potential\" confidence level.",
          details = "Store and arithmetical SSE/AVX instructions are used in scalar version (process only one data element in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is probably not vectorized.\nOnly 36% of vector register length is used (average across all SSE/AVX instructions).\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "512-bits vectorization on Skylake SP and Icelake Server",
          txt = "On Gold 5122, 6xxx and Platinum Skylake processors and Icelake Server processors, performance can be improved by using 512-bits vectorization if the number of vectorized loops is high and with high trip count.\n",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/csr_matrix.c:145-148.\n",
      "It is main loop of related source loop which is unrolled by 9 (including vectorization).",
    },
    nb_paths = 1,
  },
}
