_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 8 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 8 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "8 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n - 64: addition or subtraction (all inside FMA instructions)\n - 64: multiply (all inside FMA instructions)\nThe binary loop is loading 1024 bytes (128 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.12 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 19\nnb uops            : 18\nloop length        : 142\nused x86 registers : 4\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 9\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 6.50 cycles\nfront end            : 6.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 4.00 | 1.00 | 8.00 | 8.00 | 0.00 | 4.00 | 1.00 | 0.00\ncycles | 4.00 | 4.00 | 8.00 | 8.00 | 0.00 | 4.00 | 1.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 32.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 32.12\nStall cycles    : 25.34\nLB full (events): 28.80\nLM full (events): 0.04\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 6.50\nDispatch  : 8.00\nData deps.: 32.00\nOverall L1: 32.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 32.00 cycles. At this rate:\n - 25% of peak load performance is reached (32.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 5a89aa\n\nInstruction                                           | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-----------------------------------------------------------------------------------------------------------------------------------------------\nVMOVUPD (%R8,%RCX,1),%ZMM9                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x40(%R8,%RCX,1),%ZMM10                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x80(%R8,%RCX,1),%ZMM11                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0xc0(%R8,%RCX,1),%ZMM12                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM9,%ZMM8                 | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVMOVUPD 0x100(%R8,%RCX,1),%ZMM13                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x140(%R8,%RCX,1),%ZMM14                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x180(%R8,%RCX,1),%ZMM15                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x1c0(%R8,%RCX,1),%ZMM0                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVFMADD231PD 0x40(%R12,%RCX,1),%ZMM10,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x80(%R12,%RCX,1),%ZMM11,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0xc0(%R12,%RCX,1),%ZMM12,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x100(%R12,%RCX,1),%ZMM13,%ZMM8           | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x140(%R12,%RCX,1),%ZMM14,%ZMM8           | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x180(%R12,%RCX,1),%ZMM15,%ZMM8           | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x1c0(%R12,%RCX,1),%ZMM0,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nADD $0x200,%RCX                                       | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nCMP %R11,%RCX                                         | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 5a89aa <hypre_SeqVectorInnerProd._omp_fn.0+0x14a> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "12% of peak computational performance is used (4.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "FMA",
          txt = "Detected 64 FMA (fused multiply-add) operations.",
        },
      },
    },
  },
  AVG = {
      hint = {
        {
          workaround = "Use vector aligned instructions:\n 1) align your arrays on 64 bytes boundaries: replace { void *p = malloc (size); } with { void *p; posix_memalign (&p, 64, size); }.\n 2) inform your compiler that your arrays are vector aligned: if array 'foo' is 64 bytes-aligned, define a pointer 'p_foo' as __builtin_assume_aligned (foo, 64) and use it instead of 'foo' in the loop.\n",
          details = " - VMOVUPD: 8 occurrences\n",
          title = "Vector unaligned load/store instructions",
          txt = "Detected 8 optimal vector unaligned load/store instructions.\n",
        },
        {
          title = "Type of elements and instruction set",
          txt = "8 AVX-512 instructions are processing arithmetic or math operations on double precision FP elements in vector mode (eight at a time).\n",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop is composed of 128 FP arithmetical operations:\n - 64: addition or subtraction (all inside FMA instructions)\n - 64: multiply (all inside FMA instructions)\nThe binary loop is loading 1024 bytes (128 double precision FP elements).",
        },
        {
          title = "Arithmetic intensity",
          txt = "Arithmetic intensity is 0.12 FP operations per loaded or stored byte.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 19\nnb uops            : 18\nloop length        : 142\nused x86 registers : 4\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 9\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 6.50 cycles\nfront end            : 6.50 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2   | P3   | P4   | P5   | P6   | P7\n--------------------------------------------------------------\nuops   | 4.00 | 1.00 | 8.00 | 8.00 | 0.00 | 4.00 | 1.00 | 0.00\ncycles | 4.00 | 4.00 | 8.00 | 8.00 | 0.00 | 4.00 | 1.00 | 0.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 32.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 32.12\nStall cycles    : 25.34\nLB full (events): 28.80\nLM full (events): 0.04\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 6.50\nDispatch  : 8.00\nData deps.: 32.00\nOverall L1: 32.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 100%\nload    : 100%\nstore   : NA (no store vectorizable/vectorized instructions)\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : 100%\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 32.00 cycles. At this rate:\n - 25% of peak load performance is reached (32.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 5a89aa\n\nInstruction                                           | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7 | Latency | Recip. throughput\n-----------------------------------------------------------------------------------------------------------------------------------------------\nVMOVUPD (%R8,%RCX,1),%ZMM9                            | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x40(%R8,%RCX,1),%ZMM10                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x80(%R8,%RCX,1),%ZMM11                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0xc0(%R8,%RCX,1),%ZMM12                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVFMADD231PD (%R12,%RCX,1),%ZMM9,%ZMM8                 | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVMOVUPD 0x100(%R8,%RCX,1),%ZMM13                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x140(%R8,%RCX,1),%ZMM14                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x180(%R8,%RCX,1),%ZMM15                      | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVMOVUPD 0x1c0(%R8,%RCX,1),%ZMM0                       | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0  | 5-6     | 0.50\nVFMADD231PD 0x40(%R12,%RCX,1),%ZMM10,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x80(%R12,%RCX,1),%ZMM11,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0xc0(%R12,%RCX,1),%ZMM12,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x100(%R12,%RCX,1),%ZMM13,%ZMM8           | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x140(%R12,%RCX,1),%ZMM14,%ZMM8           | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x180(%R12,%RCX,1),%ZMM15,%ZMM8           | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nVFMADD231PD 0x1c0(%R12,%RCX,1),%ZMM0,%ZMM8            | 1     | 0.50 | 0    | 0.50 | 0.50 | 0  | 0.50 | 0    | 0  | 4       | 0.50\nADD $0x200,%RCX                                       | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nCMP %R11,%RCX                                         | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0  | 1       | 0.25\nJNE 5a89aa <hypre_SeqVectorInnerProd._omp_fn.0+0x14a> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0  | 0       | 0.50-1\n",
        },
      },
      header = {
        "12% of peak computational performance is used (4.00 out of 32.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          details = "All SSE/AVX instructions are used in vector version (process two or more data elements in vector registers).\n",
          title = "Vectorization",
          txt = "Your loop is fully vectorized, using full register length.\n",
        },
        {
          title = "Execution units bottlenecks",
          txt = "Found no such bottlenecks but see expert reports for more complex bottlenecks.",
        },
      },
      potential = {
        {
          title = "FMA",
          txt = "Detected 64 FMA (fused multiply-add) operations.",
        },
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/seq_mv/vector.c:486.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
