_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VPGATHERQQ: 4 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Try to remove indirect accesses. If applicable, precompute elements out of the innermost loop.",
          details = " - Irregular (variable stride) or indirect: 2 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 3 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 3 optimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Try to simplify your code and/or replace indirect accesses with unit-stride ones.",
          details = " - VPGATHERQQ: 4 occurrences\n",
          title = "Gather/scatter instructions",
          txt = "Detected gather/scatter instructions (typically caused by indirect accesses).",
        },
        {
          title = "Type of elements and instruction set",
          txt = "12 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 64 FP arithmetical operations:\n - 32: addition or subtraction\n - 32: multiply\nThe binary loop is loading 832 bytes (104 double precision FP elements).\nThe binary loop is storing 64 bytes (8 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.07 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 49\nnb uops            : 56\nloop length        : 324\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 8\nnb stack references: 1\nADD-SUB / MUL ratio: 1.00\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 14.00 cycles\nfront end            : 14.00 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1   | P2    | P3    | P4   | P5    | P6   | P7\n------------------------------------------------------------------\nuops   | 16.50 | 1.00 | 20.50 | 20.50 | 1.00 | 16.50 | 1.00 | 1.00\ncycles | 16.50 | 4.00 | 20.50 | 20.50 | 1.00 | 16.50 | 1.00 | 1.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 16.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 17.03\nStall cycles     : 4.49\nPRF full (events): 6.50\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 14.00\nDispatch  : 20.50\nData deps.: 16.00\nOverall L1: 20.50\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Detected masked instructions: assuming all mask elements are active.\nAssuming all data fit into the L1 cache, each iteration of the binary loop takes 20.50 cycles. At this rate:\n - 31% of peak load performance is reached (40.59 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 4% of peak store performance is reached (3.12 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 47d0ac\n\nInstruction                                                   | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n---------------------------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU64 -0x8(%RCX,%R10,1),%ZMM14                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nKMOVB %K3,%K6                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVAPD -0x170(%RBP),%ZMM1                                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nADD $0x100,%RCX                                               | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nVPGATHERQQ (%R14,%ZMM14,8),%ZMM2{%K6}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nVPCMPEQQ %ZMM12,%ZMM14,%K7                                    | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nKMOVB %K3,%K6                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVPCMPNLTQ %ZMM11,%ZMM2,%K5\nKORB %K5,%K7,%K1                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVUPD -0x100(%RCX),%ZMM1{%K1}                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVMULPD %ZMM10,%ZMM1,%ZMM14                                    | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVCMPPD $0x1,%ZMM13,%ZMM14,%K4{%K1}                            | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVAPD %ZMM1,%ZMM2{%K4}{z}                                   | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nKMOVB %K3,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVADDPD %ZMM2,%ZMM0,%ZMM0                                      | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVDQU64 -0xc8(%RCX,%R10,1),%ZMM2                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVPGATHERQQ (%R14,%ZMM2,8),%ZMM14{%K6}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nVPCMPEQQ %ZMM12,%ZMM2,%K7                                     | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nVPCMPNLTQ %ZMM11,%ZMM14,%K0\nKORB %K0,%K7,%K5                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVUPD -0xc0(%RCX),%ZMM1{%K5}                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVMULPD %ZMM10,%ZMM1,%ZMM2                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVCMPPD $0x1,%ZMM13,%ZMM2,%K1{%K5}                             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVDQU64 -0x88(%RCX,%R10,1),%ZMM2                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVPCMPEQQ %ZMM12,%ZMM2,%K6                                     | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nVMOVAPD %ZMM1,%ZMM14{%K1}{z}                                  | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nVADDPD %ZMM14,%ZMM0,%ZMM0                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVPGATHERQQ (%R14,%ZMM2,8),%ZMM14{%K4}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nKMOVB %K3,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVPCMPNLTQ %ZMM11,%ZMM14,%K7\nKORB %K7,%K6,%K5                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVUPD -0x80(%RCX),%ZMM1{%K5}                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVMULPD %ZMM10,%ZMM1,%ZMM2                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVCMPPD $0x1,%ZMM13,%ZMM2,%K1{%K5}                             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVDQU64 -0x48(%RCX,%R10,1),%ZMM2                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVPCMPEQQ %ZMM12,%ZMM2,%K6                                     | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nVMOVAPD %ZMM1,%ZMM14{%K1}{z}                                  | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nVADDPD %ZMM14,%ZMM0,%ZMM0                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVPGATHERQQ (%R14,%ZMM2,8),%ZMM14{%K4}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nVPCMPNLTQ %ZMM11,%ZMM14,%K0\nKORB %K0,%K6,%K7                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVBLENDMPD -0x40(%RCX),%ZMM1,%ZMM2{%K7}                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 1       | 0.50\nVMULPD %ZMM2,%ZMM10,%ZMM1                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVAPD %ZMM2,-0x170(%RBP)                                    | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVCMPPD $0x1,%ZMM13,%ZMM1,%K5{%K7}                             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVAPD %ZMM2,%ZMM14{%K5}{z}                                  | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nVADDPD %ZMM14,%ZMM0,%ZMM0                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nCMP %R8,%RCX                                                  | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 47d0ac <hypre_BoomerAMGBuildExtPIInterp._omp_fn.0+0x19ec> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "Warnings:\n - The number of fused uops of the instruction [VPCMPNLTQ	%ZMM11,%ZMM2,%K5] is unknown\n - The number of fused uops of the instruction [VPCMPNLTQ	%ZMM11,%ZMM14,%K0] is unknown\n - The number of fused uops of the instruction [VPCMPNLTQ	%ZMM11,%ZMM14,%K7] is unknown\n",
        "9% of peak computational performance is used (3.12 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = " - Read less array elements\n - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n  * use the 'restrict' C99 keyword\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by:\n - reading data from caches/RAM (load units are a bottleneck)\n - writing data to caches/RAM (the store unit is a bottleneck)\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 20.50 to 16.50 cycles (1.24x speedup).\n",
        },
      },
      potential = {
        {
          workaround = "If your loop is irregular, try to remove or hoist conditional structures out of your loop. If it mixes elements of different sizes, try to uniformize them.",
          details = "Vector registers are partially exploited, which is expected if your loop is irregular or mixes elements of different sizes.",
          title = "Masked instructions",
          txt = "Detected masked instructions.",
        },
        {
          workaround = "Try to change order in which elements are evaluated (using parentheses) in arithmetic expressions containing both ADD/SUB and MUL operations to enable your compiler to generate FMA instructions wherever possible.\nFor instance a + b*c is a valid FMA (MUL then ADD).\nHowever (a+b)* c cannot be translated into an FMA (ADD then MUL).",
          title = "FMA",
          txt = "Presence of both ADD/SUB and MUL operations.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - VPGATHERQQ: 4 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = "Try to remove indirect accesses. If applicable, precompute elements out of the innermost loop.",
          details = " - Irregular (variable stride) or indirect: 2 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 3 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 3 optimal vector unaligned load/store instructions.\n",
        },
        {
          workaround = "Try to simplify your code and/or replace indirect accesses with unit-stride ones.",
          details = " - VPGATHERQQ: 4 occurrences\n",
          title = "Gather/scatter instructions",
          txt = "Detected gather/scatter instructions (typically caused by indirect accesses).",
        },
        {
          title = "Type of elements and instruction set",
          txt = "12 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 64 FP arithmetical operations:\n - 32: addition or subtraction\n - 32: multiply\nThe binary loop is loading 832 bytes (104 double precision FP elements).\nThe binary loop is storing 64 bytes (8 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.07 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 49\nnb uops            : 56\nloop length        : 324\nused x86 registers : 5\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 8\nnb stack references: 1\nADD-SUB / MUL ratio: 1.00\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 14.00 cycles\nfront end            : 14.00 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0    | P1   | P2    | P3    | P4   | P5    | P6   | P7\n------------------------------------------------------------------\nuops   | 16.50 | 1.00 | 20.50 | 20.50 | 1.00 | 16.50 | 1.00 | 1.00\ncycles | 16.50 | 4.00 | 20.50 | 20.50 | 1.00 | 16.50 | 1.00 | 1.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 16.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles     : 17.03\nStall cycles     : 4.49\nPRF full (events): 6.50\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 14.00\nDispatch  : 20.50\nData deps.: 16.00\nOverall L1: 20.50\n",
        },
        {
          title = "Vectorization ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "INT\nall    : 100%\nload   : 100%\nstore  : NA (no store vectorizable/vectorized instructions)\nmul    : NA (no mul vectorizable/vectorized instructions)\nadd-sub: NA (no add-sub vectorizable/vectorized instructions)\nfma    : NA (no fma vectorizable/vectorized instructions)\nother  : 100%\nFP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\nINT+FP\nall     : 100%\nload    : 100%\nstore   : 100%\nmul     : 100%\nadd-sub : 100%\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : 100%\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Detected masked instructions: assuming all mask elements are active.\nAssuming all data fit into the L1 cache, each iteration of the binary loop takes 20.50 cycles. At this rate:\n - 31% of peak load performance is reached (40.59 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 4% of peak store performance is reached (3.12 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 47d0ac\n\nInstruction                                                   | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n---------------------------------------------------------------------------------------------------------------------------------------------------------\nVMOVDQU64 -0x8(%RCX,%R10,1),%ZMM14                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nKMOVB %K3,%K6                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVAPD -0x170(%RBP),%ZMM1                                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nADD $0x100,%RCX                                               | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nVPGATHERQQ (%R14,%ZMM14,8),%ZMM2{%K6}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nVPCMPEQQ %ZMM12,%ZMM14,%K7                                    | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nKMOVB %K3,%K6                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVPCMPNLTQ %ZMM11,%ZMM2,%K5\nKORB %K5,%K7,%K1                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVUPD -0x100(%RCX),%ZMM1{%K1}                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVMULPD %ZMM10,%ZMM1,%ZMM14                                    | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVCMPPD $0x1,%ZMM13,%ZMM14,%K4{%K1}                            | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVAPD %ZMM1,%ZMM2{%K4}{z}                                   | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nKMOVB %K3,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVADDPD %ZMM2,%ZMM0,%ZMM0                                      | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVDQU64 -0xc8(%RCX,%R10,1),%ZMM2                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVPGATHERQQ (%R14,%ZMM2,8),%ZMM14{%K6}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nVPCMPEQQ %ZMM12,%ZMM2,%K7                                     | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nVPCMPNLTQ %ZMM11,%ZMM14,%K0\nKORB %K0,%K7,%K5                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVUPD -0xc0(%RCX),%ZMM1{%K5}                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVMULPD %ZMM10,%ZMM1,%ZMM2                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVCMPPD $0x1,%ZMM13,%ZMM2,%K1{%K5}                             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVDQU64 -0x88(%RCX,%R10,1),%ZMM2                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVPCMPEQQ %ZMM12,%ZMM2,%K6                                     | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nVMOVAPD %ZMM1,%ZMM14{%K1}{z}                                  | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nVADDPD %ZMM14,%ZMM0,%ZMM0                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVPGATHERQQ (%R14,%ZMM2,8),%ZMM14{%K4}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nKMOVB %K3,%K4                                                 | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVPCMPNLTQ %ZMM11,%ZMM14,%K7\nKORB %K7,%K6,%K5                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVMOVUPD -0x80(%RCX),%ZMM1{%K5}                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5-6     | 0.50\nVMULPD %ZMM10,%ZMM1,%ZMM2                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVCMPPD $0x1,%ZMM13,%ZMM2,%K1{%K5}                             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVDQU64 -0x48(%RCX,%R10,1),%ZMM2                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 5       | 0.50\nVPCMPEQQ %ZMM12,%ZMM2,%K6                                     | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 1       | 1\nVMOVAPD %ZMM1,%ZMM14{%K1}{z}                                  | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nVADDPD %ZMM14,%ZMM0,%ZMM0                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVPGATHERQQ (%R14,%ZMM2,8),%ZMM14{%K4}                         | 4     | 1    | 0    | 4    | 4    | 0  | 1    | 0    | 0    | 22      | 5\nVPCMPNLTQ %ZMM11,%ZMM14,%K0\nKORB %K0,%K6,%K7                                              | 1     | 1    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 1       | 1\nVBLENDMPD -0x40(%RCX),%ZMM1,%ZMM2{%K7}                        | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0    | 1       | 0.50\nVMULPD %ZMM2,%ZMM10,%ZMM1                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nVMOVAPD %ZMM2,-0x170(%RBP)                                    | 1     | 0    | 0    | 0.33 | 0.33 | 1  | 0    | 0    | 0.33 | 3       | 1\nVCMPPD $0x1,%ZMM13,%ZMM1,%K5{%K7}                             | 1     | 0    | 0    | 0    | 0    | 0  | 1    | 0    | 0    | 3       | 1\nVMOVAPD %ZMM2,%ZMM14{%K5}{z}                                  | 1     | 0    | 0    | 0    | 0    | 0  | 0    | 0    | 0    | 0       | 0.25\nVADDPD %ZMM14,%ZMM0,%ZMM0                                     | 1     | 0.50 | 0    | 0    | 0    | 0  | 0.50 | 0    | 0    | 4       | 0.50\nCMP %R8,%RCX                                                  | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 47d0ac <hypre_BoomerAMGBuildExtPIInterp._omp_fn.0+0x19ec> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "Warnings:\n - The number of fused uops of the instruction [VPCMPNLTQ	%ZMM11,%ZMM2,%K5] is unknown\n - The number of fused uops of the instruction [VPCMPNLTQ	%ZMM11,%ZMM14,%K0] is unknown\n - The number of fused uops of the instruction [VPCMPNLTQ	%ZMM11,%ZMM14,%K7] is unknown\n",
        "9% of peak computational performance is used (3.12 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          workaround = " - Read less array elements\n - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n  * use the 'restrict' C99 keyword\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by:\n - reading data from caches/RAM (load units are a bottleneck)\n - writing data to caches/RAM (the store unit is a bottleneck)\n\nBy removing all these bottlenecks, you can lower the cost of an iteration from 20.50 to 16.50 cycles (1.24x speedup).\n",
        },
      },
      potential = {
        {
          workaround = "If your loop is irregular, try to remove or hoist conditional structures out of your loop. If it mixes elements of different sizes, try to uniformize them.",
          details = "Vector registers are partially exploited, which is expected if your loop is irregular or mixes elements of different sizes.",
          title = "Masked instructions",
          txt = "Detected masked instructions.",
        },
        {
          workaround = "Try to change order in which elements are evaluated (using parentheses) in arithmetic expressions containing both ADD/SUB and MUL operations to enable your compiler to generate FMA instructions wherever possible.\nFor instance a + b*c is a valid FMA (MUL then ADD).\nHowever (a+b)* c cannot be translated into an FMA (ADD then MUL).",
          title = "FMA",
          txt = "Presence of both ADD/SUB and MUL operations.",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_lr_interp.c:1624-1628.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
