_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant unknown stride: 2 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This loop is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is prefetching 256 bytes.\nThe binary loop is loading 256 bytes.\nThe binary loop is storing 256 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 25\nnb uops            : 24\nloop length        : 164\nused x86 registers : 3\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 8\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 6.00 cycles\nfront end            : 6.00 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 1.00 | 1.00 | 6.67 | 6.67 | 8.00 | 1.00 | 1.00 | 6.67\ncycles | 1.00 | 1.25 | 6.67 | 6.67 | 8.00 | 1.00 | 1.00 | 6.67\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 8.12\nStall cycles    : 1.76\nLB full (events): 1.68\nLM full (events): 0.21\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 6.00\nDispatch  : 8.00\nData deps.: 1.00\nOverall L1: 8.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 50%\nload    : 50%\nstore   : 50%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.00 cycles. At this rate:\n - 25% of peak load performance is reached (32.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 50% of peak store performance is reached (32.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 4fcfb0\n\nInstruction                               | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------\nPREFETCHT0 0x200(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nPREFETCHT0 0x240(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nPREFETCHT0 0x280(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nPREFETCHT0 0x2c0(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nVMOVDQU (%RSI),%YMM1                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x20(%RSI),%YMM2                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x40(%RSI),%YMM3                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x60(%RSI),%YMM4                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x80(%RSI),%YMM5                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0xa0(%RSI),%YMM6                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0xc0(%RSI),%YMM7                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0xe0(%RSI),%YMM8                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nSUB $0x100,%RCX                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nVMOVNTDQ %YMM1,(%RDI)                     | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM2,0x20(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM3,0x40(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM4,0x60(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM5,0x80(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM6,0xa0(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM7,0xc0(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM8,0xe0(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nLEA 0x100(%RDI),%RDI                      | 1     | 0    | 0.50 | 0    | 0    | 0  | 0.50 | 0    | 0    | 1       | 0.50\nLEA 0x100(%RSI),%RSI                      | 1     | 0    | 0.50 | 0    | 0    | 0  | 0.50 | 0    | 0    | 1       | 0.50\nCMP $0x300,%RCX                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJGE 4fcfb0 <__intel_avx_rep_memcpy+0x150> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = "Read the \"512-bits vectorization on Skylake SP\" report at \"Potential\" confidence level.",
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is vectorized, but using only 256 out of 512 bits (AVX/AVX2 instructions on AVX-512 processors).\n",
        },
        {
          workaround = " - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by writing data to caches/RAM (the store unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.00 to 6.67 cycles (1.20x speedup).\n",
        },
      },
      potential = {
        {
          title = "512-bits vectorization on Skylake SP and Icelake Server",
          txt = "On Gold 5122, 6xxx and Platinum Skylake processors and Icelake Server processors, performance can be improved by using 512-bits vectorization if the number of vectorized loops is high and with high trip count.\n",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant unknown stride: 2 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This loop is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is prefetching 256 bytes.\nThe binary loop is loading 256 bytes.\nThe binary loop is storing 256 bytes.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 25\nnb uops            : 24\nloop length        : 164\nused x86 registers : 3\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 8\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 6.00 cycles\nfront end            : 6.00 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 1.00 | 1.00 | 6.67 | 6.67 | 8.00 | 1.00 | 1.00 | 6.67\ncycles | 1.00 | 1.25 | 6.67 | 6.67 | 8.00 | 1.00 | 1.00 | 6.67\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 8.12\nStall cycles    : 1.76\nLB full (events): 1.68\nLM full (events): 0.21\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 6.00\nDispatch  : 8.00\nData deps.: 1.00\nOverall L1: 8.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : 100%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 50%\nload    : 50%\nstore   : 50%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.00 cycles. At this rate:\n - 25% of peak load performance is reached (32.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 50% of peak store performance is reached (32.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 4fcfb0\n\nInstruction                               | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------\nPREFETCHT0 0x200(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nPREFETCHT0 0x240(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nPREFETCHT0 0x280(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nPREFETCHT0 0x2c0(%RSI)                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nVMOVDQU (%RSI),%YMM1                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x20(%RSI),%YMM2                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x40(%RSI),%YMM3                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x60(%RSI),%YMM4                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0x80(%RSI),%YMM5                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0xa0(%RSI),%YMM6                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0xc0(%RSI),%YMM7                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVMOVDQU 0xe0(%RSI),%YMM8                  | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nSUB $0x100,%RCX                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nVMOVNTDQ %YMM1,(%RDI)                     | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM2,0x20(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM3,0x40(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM4,0x60(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM5,0x80(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM6,0xa0(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM7,0xc0(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nVMOVNTDQ %YMM8,0xe0(%RDI)                 | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 400     | 1\nLEA 0x100(%RDI),%RDI                      | 1     | 0    | 0.50 | 0    | 0    | 0  | 0.50 | 0    | 0    | 1       | 0.50\nLEA 0x100(%RSI),%RSI                      | 1     | 0    | 0.50 | 0    | 0    | 0  | 0.50 | 0    | 0    | 1       | 0.50\nCMP $0x300,%RCX                           | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJGE 4fcfb0 <__intel_avx_rep_memcpy+0x150> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = "Read the \"512-bits vectorization on Skylake SP\" report at \"Potential\" confidence level.",
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is vectorized, but using only 256 out of 512 bits (AVX/AVX2 instructions on AVX-512 processors).\n",
        },
        {
          workaround = " - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by writing data to caches/RAM (the store unit is a bottleneck).\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 8.00 to 6.67 cycles (1.20x speedup).\n",
        },
      },
      potential = {
        {
          title = "512-bits vectorization on Skylake SP and Icelake Server",
          txt = "On Gold 5122, 6xxx and Platinum Skylake processors and Icelake Server processors, performance can be improved by using 512-bits vectorization if the number of vectorized loops is high and with high trip count.\n",
        },
      },
    },
  common = {
    header = {
      "",
    },
    nb_paths = 1,
  },
}
