_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VPGATHERQQ: 1 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Try to remove indirect accesses. If applicable, precompute elements out of the innermost loop.",
          details = " - Irregular (variable stride) or indirect: 1 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          workaround = "Try to simplify your code and/or replace indirect accesses with unit-stride ones.",
          details = " - VPGATHERQQ: 1 occurrences\n",
          title = "Gather/scatter instructions",
          txt = "Detected gather/scatter instructions (typically caused by indirect accesses).",
        },
        {
          title = "Type of elements and instruction set",
          txt = "3 AVX instructions are processing arithmetic or math operations on double precision FP elements in vector mode (four at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 8 FP arithmetical operations:\n - 4: addition or subtraction\n - 4: multiply\nThe binary loop is loading 96 bytes (12 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.08 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 14\nnb uops            : 15\nloop length        : 71\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 1\nused ymm registers : 7\nused zmm registers : 0\nnb stack references: 0\nADD-SUB / MUL ratio: 1.00\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 3.75 cycles\nfront end            : 3.75 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 3.00 | 2.00 | 3.00 | 3.00 | 0.00 | 3.00 | 2.00 | 0.00\ncycles | 3.00 | 2.00 | 3.00 | 3.00 | 0.00 | 3.00 | 2.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 4.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 4.12\nStall cycles     : 0.38\nPRF full (events): 0.60\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 3.75\nDispatch  : 3.00\nData deps.: 4.00\nOverall L1: 4.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 45%\nload   : 50%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 43%\nFP\nall     : 50%\nload    : 50%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 50%\nadd-sub : 50%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 50%\nINT+FP\nall     : 47%\nload    : 50%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 50%\nadd-sub : 50%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 45%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Detected masked instructions: assuming all mask elements are active.\nAssuming all data fit into the L1 cache, each iteration of the binary loop takes 4.00 cycles. At this rate:\n - 18% of peak load performance is reached (24.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 453260\n\nInstruction                                                   | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU (%R15,%R12,8),%YMM1                                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKXNORW %K0,%K0,%K1                                            | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVPXOR %XMM4,%XMM4,%XMM4                                       | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPGATHERQQ (%RBX,%YMM1,8),%YMM4{%K1}                          | 4     | 1    | 0    | 2    | 2    | 0  | 1    | 0    | 0  | 20      | 4\nVPCMPEQQ %YMM7,%YMM1,%K0                                      | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 1       | 1\nVPCMPNLTQ %YMM8,%YMM4,%K1\nKORW %K0,%K1,%K1                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVUPD (%R11,%R12,8),%YMM1{%K1}{z}                           | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMULPD %YMM1,%YMM12,%YMM4                                     | 1     | 0.50 | 0.50 | 0    | 0    | 0  | 0    | 0    | 0  | 4       | 0.50\nVCMPPD $0x1,%YMM2,%YMM4,%K1{%K1}                              | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nVADDPD %YMM1,%YMM3,%YMM3{%K1}                                 | 1     | 0.50 | 0.50 | 0    | 0    | 0  | 0    | 0    | 0  | 4       | 0.50\nADD $0x4,%R12                                                 | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nCMP %RSI,%R12                                                 | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJBE 453260 <hypre_BoomerAMGBuildExtPIInterp.extracted+0x1500> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "Warnings:\nThe number of fused uops of the instruction [VPCMPNLTQ	%YMM8,%YMM4,%K1] is unknown",
        "6% of peak computational performance is used (2.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = "Read the \"512-bits vectorization on Skylake SP\" report at \"Potential\" confidence level.",
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is vectorized, but using 47% register length (average across all SSE/AVX instructions).\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "512-bits vectorization on Skylake SP and Icelake Server",
          txt = "On Gold 5122, 6xxx and Platinum Skylake processors and Icelake Server processors, performance can be improved by using 512-bits vectorization if the number of vectorized loops is high and with high trip count.\n",
        },
        {
          workaround = "If your loop is irregular, try to remove or hoist conditional structures out of your loop. If it mixes elements of different sizes, try to uniformize them.",
          details = "Vector registers are partially exploited, which is expected if your loop is irregular or mixes elements of different sizes.",
          title = "Masked instructions",
          txt = "Detected masked instructions.",
        },
        {
          workaround = " - Pass to your compiler a micro-architecture specialization option:\n  * Please read your compiler manual\n - Try to change order in which elements are evaluated (using parentheses) in arithmetic expressions containing both ADD/SUB and MUL operations to enable your compiler to generate FMA instructions wherever possible.\nFor instance a + b*c is a valid FMA (MUL then ADD).\nHowever (a+b)* c cannot be translated into an FMA (ADD then MUL).\n",
          title = "FMA",
          txt = "Presence of both ADD/SUB and MUL operations.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VPGATHERQQ: 1 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Try to remove indirect accesses. If applicable, precompute elements out of the innermost loop.",
          details = " - Irregular (variable stride) or indirect: 1 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          workaround = "Try to simplify your code and/or replace indirect accesses with unit-stride ones.",
          details = " - VPGATHERQQ: 1 occurrences\n",
          title = "Gather/scatter instructions",
          txt = "Detected gather/scatter instructions (typically caused by indirect accesses).",
        },
        {
          title = "Type of elements and instruction set",
          txt = "3 AVX instructions are processing arithmetic or math operations on double precision FP elements in vector mode (four at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 8 FP arithmetical operations:\n - 4: addition or subtraction\n - 4: multiply\nThe binary loop is loading 96 bytes (12 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.08 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 14\nnb uops            : 15\nloop length        : 71\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 1\nused ymm registers : 7\nused zmm registers : 0\nnb stack references: 0\nADD-SUB / MUL ratio: 1.00\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 3.75 cycles\nfront end            : 3.75 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 3.00 | 2.00 | 3.00 | 3.00 | 0.00 | 3.00 | 2.00 | 0.00\ncycles | 3.00 | 2.00 | 3.00 | 3.00 | 0.00 | 3.00 | 2.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 4.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 4.12\nStall cycles     : 0.38\nPRF full (events): 0.60\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 3.75\nDispatch  : 3.00\nData deps.: 4.00\nOverall L1: 4.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 45%\nload   : 50%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 43%\nFP\nall     : 50%\nload    : 50%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 50%\nadd-sub : 50%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 50%\nINT+FP\nall     : 47%\nload    : 50%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : 50%\nadd-sub : 50%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 45%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Detected masked instructions: assuming all mask elements are active.\nAssuming all data fit into the L1 cache, each iteration of the binary loop takes 4.00 cycles. At this rate:\n - 18% of peak load performance is reached (24.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 453260\n\nInstruction                                                   | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-------------------------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU (%R15,%R12,8),%YMM1                                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5       | 0.50\nKXNORW %K0,%K0,%K1                                            | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVPXOR %XMM4,%XMM4,%XMM4                                       | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 0       | 0.25\nVPGATHERQQ (%RBX,%YMM1,8),%YMM4{%K1}                          | 4     | 1    | 0    | 2    | 2    | 0  | 1    | 0    | 0  | 20      | 4\nVPCMPEQQ %YMM7,%YMM1,%K0                                      | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 1       | 1\nVPCMPNLTQ %YMM8,%YMM4,%K1\nKORW %K0,%K1,%K1                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0  | 1       | 1\nVMOVUPD (%R11,%R12,8),%YMM1{%K1}{z}                           | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMULPD %YMM1,%YMM12,%YMM4                                     | 1     | 0.50 | 0.50 | 0    | 0    | 0  | 0    | 0    | 0  | 4       | 0.50\nVCMPPD $0x1,%YMM2,%YMM4,%K1{%K1}                              | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0  | 3       | 1\nVADDPD %YMM1,%YMM3,%YMM3{%K1}                                 | 1     | 0.50 | 0.50 | 0    | 0    | 0  | 0    | 0    | 0  | 4       | 0.50\nADD $0x4,%R12                                                 | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nCMP %RSI,%R12                                                 | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJBE 453260 <hypre_BoomerAMGBuildExtPIInterp.extracted+0x1500> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "Warnings:\nThe number of fused uops of the instruction [VPCMPNLTQ	%YMM8,%YMM4,%K1] is unknown",
        "6% of peak computational performance is used (2.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = "Read the \"512-bits vectorization on Skylake SP\" report at \"Potential\" confidence level.",
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is vectorized, but using 47% register length (average across all SSE/AVX instructions).\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "512-bits vectorization on Skylake SP and Icelake Server",
          txt = "On Gold 5122, 6xxx and Platinum Skylake processors and Icelake Server processors, performance can be improved by using 512-bits vectorization if the number of vectorized loops is high and with high trip count.\n",
        },
        {
          workaround = "If your loop is irregular, try to remove or hoist conditional structures out of your loop. If it mixes elements of different sizes, try to uniformize them.",
          details = "Vector registers are partially exploited, which is expected if your loop is irregular or mixes elements of different sizes.",
          title = "Masked instructions",
          txt = "Detected masked instructions.",
        },
        {
          workaround = " - Pass to your compiler a micro-architecture specialization option:\n  * Please read your compiler manual\n - Try to change order in which elements are evaluated (using parentheses) in arithmetic expressions containing both ADD/SUB and MUL operations to enable your compiler to generate FMA instructions wherever possible.\nFor instance a + b*c is a valid FMA (MUL then ADD).\nHowever (a+b)* c cannot be translated into an FMA (ADD then MUL).\n",
          title = "FMA",
          txt = "Presence of both ADD/SUB and MUL operations.",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_lr_interp.c:1221,1624-1627.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
