_cqa_text_report = {
  paths = {
    {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - ADD: 16 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant non-unit stride: 2 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This loop is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 256 bytes.\nThe binary loop is storing 128 bytes.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 36\nnb uops            : 51\nloop length        : 141\nused x86 registers : 13\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 12.75 cycles\nfront end            : 12.75 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2    | P3    | P4    | P5   | P6   | P7\n------------------------------------------------------------------\nuops   | 4.75 | 4.75 | 16.00 | 16.00 | 16.00 | 4.75 | 4.75 | 16.00\ncycles | 4.75 | 4.75 | 16.00 | 16.00 | 16.00 | 4.75 | 4.75 | 16.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 16.15\nStall cycles    : 3.21\nLM full (events): 12.85\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 12.75\nDispatch  : 16.00\nData deps.: 1.00\nOverall L1: 16.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 0%\nload    : 0%\nstore   : 0%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 12%\nload    : 12%\nstore   : 12%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 16.00 cycles. At this rate:\n - 12% of peak load performance is reached (16.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 12% of peak store performance is reached (8.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 4776fa\n\nInstruction                                       | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n---------------------------------------------------------------------------------------------------------------------------------------------\nMOV -0x8(%R10),%R12                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R12,(%R10)                                   | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV -0x8(%RDX),%R11                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R11,(%RDX)                                   | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV (%R10),%R8                                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R8,0x8(%R10)                                 | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV (%RDX),%RCX                                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RCX,0x8(%RDX)                                | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x8(%R10),%RDI                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RDI,0x10(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x8(%RDX),%R14                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R14,0x10(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x10(%R10),%R15                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R15,0x18(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x10(%RDX),%RAX                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RAX,0x18(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x18(%R10),%RSI                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RSI,0x20(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x18(%RDX),%R13                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R13,0x20(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x20(%R10),%R12                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R12,0x28(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x20(%RDX),%R11                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R11,0x28(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x28(%R10),%R8                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R8,0x30(%R10)                                | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x28(%RDX),%RCX                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RCX,0x30(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x30(%R10),%RDI                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RDI,0x38(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nADD $0x40,%R10                                    | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nMOV 0x30(%RDX),%R14                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R14,0x38(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nADD $0x40,%RDX                                    | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nCMP %R9,%R10                                      | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 4776fa <hypre_BoomerAMGBuildMultipass+0x27ca> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try another compiler or update/tune your current one\n - Remove inter-iterations dependences from your loop and make it unit-stride:\n  * If your arrays have 2 or more dimensions, check whether elements are accessed contiguously and, otherwise, try to permute loops accordingly:\nC storage order is row-major: for(i) for(j) a[j][i] = b[j][i]; (slow, non stride 1) => for(i) for(j) a[i][j] = b[i][j]; (fast, stride 1)\n  * If your loop streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "All SSE/AVX instructions are used in scalar version (process only one data element in vector registers).\nSince your execution units are vector units, only a vectorized loop can use their full power.\n",
          title = "Vectorization",
          txt = "Your loop is not vectorized.\n8 data elements could be processed at once in vector registers.\nBy vectorizing your loop, you can lower the cost of an iteration from 16.00 to 2.00 cycles (8.00x speedup).",
        },
        {
          workaround = " - Read less array elements\n - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n  * use the 'restrict' C99 keyword\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by:\n - reading data from caches/RAM (load units are a bottleneck)\n - writing data to caches/RAM (the store unit is a bottleneck)\n",
        },
      },
      potential = {
      },
    },
  },
  AVG = {
      hint = {
        {
          details = "These instructions generate more than one micro-operation and only one of them can be decoded during a cycle and the extra micro-operations increase pressure on execution units.\n - ADD: 16 occurrences\n",
          title = "Complex instructions",
          txt = "Detected COMPLEX INSTRUCTIONS.\n",
        },
        {
          workaround = " - Try to reorganize arrays of structures to structures of arrays\n - Consider to permute loops (see vectorization gain report)\n",
          details = " - Constant non-unit stride: 2 occurrence(s)\nNon-unit stride (uncontiguous) accesses are not efficiently using data caches\n",
          title = "Slow data structures access",
          txt = "Detected data structures (typically arrays) that cannot be efficiently read/written",
        },
        {
          title = "Type of elements and instruction set",
          txt = "No instructions are processing arithmetic or math operations on FP elements. This loop is probably writing/copying data or processing integer elements.",
        },
        {
          title = "Matching between your loop (in the source code) and the binary loop",
          txt = "The binary loop does not contain any FP arithmetical operations.\nThe binary loop is loading 256 bytes.\nThe binary loop is storing 128 bytes.",
        },
        {
          workaround = "Unroll your loop if trip count is significantly higher than target unroll factor and if some data references are common to consecutive iterations. This can be done manually. Or by recompiling with -funroll-loops and/or -floop-unroll-and-jam.",
          title = "Unroll opportunity",
          txt = "Loop is data access bound.",
        },
      },
      expert = {
        {
          title = "General properties",
          txt = "nb instructions    : 36\nnb uops            : 51\nloop length        : 141\nused x86 registers : 13\nused mmx registers : 0\nused xmm registers : 0\nused ymm registers : 0\nused zmm registers : 0\nnb stack references: 0\n",
        },
        {
          title = "Front-end",
          txt = "ASSUMED MACRO FUSION\nFIT IN UOP CACHE\nmicro-operation queue: 12.75 cycles\nfront end            : 12.75 cycles\n",
        },
        {
          title = "Back-end",
          txt = "       | P0   | P1   | P2    | P3    | P4    | P5   | P6   | P7\n------------------------------------------------------------------\nuops   | 4.75 | 4.75 | 16.00 | 16.00 | 16.00 | 4.75 | 4.75 | 16.00\ncycles | 4.75 | 4.75 | 16.00 | 16.00 | 16.00 | 4.75 | 4.75 | 16.00\n\nCycles executing div or sqrt instructions: NA\nLongest recurrence chain latency (RecMII): 1.00\n",
        },
        {
          title = "Front-end and detailed OoO resources (UFS)",
          txt = "FE+BE cycles    : 16.15\nStall cycles    : 3.21\nLM full (events): 12.85\n",
        },
        {
          title = "Cycles summary",
          txt = "Front-end : 12.75\nDispatch  : 16.00\nData deps.: 1.00\nOverall L1: 16.00\n",
        },
        {
          title = "Vectorization ratios",
          txt = "all     : 0%\nload    : 0%\nstore   : 0%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Vector efficiency ratios",
          txt = "all     : 12%\nload    : 12%\nstore   : 12%\nmul     : NA (no mul vectorizable/vectorized instructions)\nadd-sub : NA (no add-sub vectorizable/vectorized instructions)\nfma     : NA (no fma vectorizable/vectorized instructions)\ndiv/sqrt: NA (no div/sqrt vectorizable/vectorized instructions)\nother   : NA (no other vectorizable/vectorized instructions)\n",
        },
        {
          title = "Cycles and memory resources usage",
          txt = "Assuming all data fit into the L1 cache, each iteration of the binary loop takes 16.00 cycles. At this rate:\n - 12% of peak load performance is reached (16.00 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))\n - 12% of peak store performance is reached (8.00 out of 64.00 bytes stored per cycle (GB/s @ 1GHz))\n",
        },
        {
          title = "Front-end bottlenecks",
          txt = "Found no such bottlenecks.",
        },
        {
          title = "ASM code",
          txt = "In the binary file, the address of the loop is: 4776fa\n\nInstruction                                       | Nb FU | P0   | P1   | P2   | P3   | P4 | P5   | P6   | P7   | Latency | Recip. throughput\n---------------------------------------------------------------------------------------------------------------------------------------------\nMOV -0x8(%R10),%R12                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R12,(%R10)                                   | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV -0x8(%RDX),%R11                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R11,(%RDX)                                   | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV (%R10),%R8                                    | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R8,0x8(%R10)                                 | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV (%RDX),%RCX                                   | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RCX,0x8(%RDX)                                | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x8(%R10),%RDI                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RDI,0x10(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x8(%RDX),%R14                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R14,0x10(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x10(%R10),%R15                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R15,0x18(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x10(%RDX),%RAX                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RAX,0x18(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x18(%R10),%RSI                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RSI,0x20(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x18(%RDX),%R13                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R13,0x20(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x20(%R10),%R12                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R12,0x28(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x20(%RDX),%R11                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R11,0x28(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x28(%R10),%R8                                | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R8,0x30(%R10)                                | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x28(%RDX),%RCX                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RCX,0x30(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nMOV 0x30(%R10),%RDI                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %RDI,0x38(%R10)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nADD $0x40,%R10                                    | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nMOV 0x30(%RDX),%R14                               | 1     | 0    | 0    | 0.50 | 0.50 | 0  | 0    | 0    | 0    | 4-5     | 0.50\nADD %R14,0x38(%RDX)                               | 2     | 0.25 | 0.25 | 0.83 | 0.83 | 1  | 0.25 | 0.25 | 0.33 | 5       | 1\nADD $0x40,%RDX                                    | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nCMP %R9,%R10                                      | 1     | 0.25 | 0.25 | 0    | 0    | 0  | 0.25 | 0.25 | 0    | 1       | 0.25\nJNE 4776fa <hypre_BoomerAMGBuildMultipass+0x27ca> | 1     | 0.50 | 0    | 0    | 0    | 0  | 0    | 0.50 | 0    | 0       | 0.50-1\n",
        },
      },
      header = {
        "0% of peak computational performance is used (0.00 out of 64.00 FLOP per cycle (GFLOPS @ 1GHz))",
      },
      brief = {
      },
      gain = {
        {
          workaround = " - Try another compiler or update/tune your current one\n - Remove inter-iterations dependences from your loop and make it unit-stride:\n  * If your arrays have 2 or more dimensions, check whether elements are accessed contiguously and, otherwise, try to permute loops accordingly:\nC storage order is row-major: for(i) for(j) a[j][i] = b[j][i]; (slow, non stride 1) => for(i) for(j) a[i][j] = b[i][j]; (fast, stride 1)\n  * If your loop streams arrays of structures (AoS), try to use structures of arrays instead (SoA):\nfor(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)\n",
          details = "All SSE/AVX instructions are used in scalar version (process only one data element in vector registers).\nSince your execution units are vector units, only a vectorized loop can use their full power.\n",
          title = "Vectorization",
          txt = "Your loop is not vectorized.\n8 data elements could be processed at once in vector registers.\nBy vectorizing your loop, you can lower the cost of an iteration from 16.00 to 2.00 cycles (8.00x speedup).",
        },
        {
          workaround = " - Read less array elements\n - Write less array elements\n - Provide more information to your compiler:\n  * hardcode the bounds of the corresponding 'for' loop\n  * use the 'restrict' C99 keyword\n",
          title = "Execution units bottlenecks",
          txt = "Performance is limited by:\n - reading data from caches/RAM (load units are a bottleneck)\n - writing data to caches/RAM (the store unit is a bottleneck)\n",
        },
      },
      potential = {
      },
    },
  common = {
    header = {
      "The loop is defined in /home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/parcsr_ls/par_multi_interp.c:1158-1161.\n",
      "The related source loop is not unrolled or unrolled with no peel/tail loop.",
    },
    nb_paths = 1,
  },
}
