Loop Id: 896 | Module: exec | Source: MultiBsplineRef.hpp:227-262 | Coverage: 0.01% |
---|
Loop Id: 896 | Module: exec | Source: MultiBsplineRef.hpp:227-262 | Coverage: 0.01% |
---|
0x482079 MOV 0xd0(%RSP),%R13 |
0x482081 MOV %R9,%RSI |
0x482084 MOV 0xe0(%RSP),%RAX |
0x48208c MOV %RDX,%RDI |
0x48208f MOV 0xc8(%RSP),%RDX |
0x482097 MOV %RCX,0xd8(%RSP) |
0x48209f SUB %R13,%RSI |
0x4820a2 MOV %R9,%R13 |
0x4820a5 VMOVSD 0x120(%RSP,%RAX,1),%XMM4 |
0x4820ae VMOVSD 0x160(%RSP,%RAX,1),%XMM5 |
0x4820b7 MOV %RSI,%R9 |
0x4820ba MOV 0xc0(%RSP),%RSI |
0x4820c2 VMULSD 0x1a0(%RSP,%RAX,1),%XMM22,%XMM31 |
0x4820ca XOR %EAX,%EAX |
0x4820cc VMULSD %XMM4,%XMM24,%XMM9 |
0x4820d2 VMULSD %XMM23,%XMM4,%XMM29 |
0x4820d8 VMULSD %XMM5,%XMM23,%XMM30 |
0x4820de VMULSD %XMM22,%XMM4,%XMM4 |
0x4820e4 VMULSD %XMM5,%XMM22,%XMM5 |
0x4820ea NOPW (%RAX,%RAX,1) |
(895) 0x4820f0 VMOVSD (%R9,%RAX,8),%XMM2 |
(895) 0x4820f6 VMOVSD (%R13,%RAX,8),%XMM26 |
(895) 0x4820fe VMOVSD (%RDI,%RAX,8),%XMM3 |
(895) 0x482103 VMOVSD (%R10,%RAX,8),%XMM25 |
(895) 0x48210a VMULSD %XMM11,%XMM26,%XMM0 |
(895) 0x482110 VMULSD %XMM13,%XMM25,%XMM1 |
(895) 0x482116 VFMADD231SD %XMM16,%XMM2,%XMM0 |
(895) 0x48211c VFMADD231SD %XMM12,%XMM3,%XMM1 |
(895) 0x482121 MOV 0xf8(%RSP),%RCX |
(895) 0x482129 VMULSD %XMM19,%XMM25,%XMM27 |
(895) 0x48212f VMULSD %XMM10,%XMM25,%XMM25 |
(895) 0x482135 VFMADD231SD %XMM14,%XMM3,%XMM27 |
(895) 0x48213b VFMADD132SD %XMM17,%XMM25,%XMM3 |
(895) 0x482141 VADDSD %XMM1,%XMM0,%XMM0 |
(895) 0x482145 VMULSD %XMM21,%XMM26,%XMM1 |
(895) 0x48214b VMULSD %XMM20,%XMM26,%XMM26 |
(895) 0x482151 VFMADD231SD %XMM18,%XMM2,%XMM1 |
(895) 0x482157 VFMADD132SD %XMM15,%XMM26,%XMM2 |
(895) 0x48215d VADDSD %XMM27,%XMM1,%XMM1 |
(895) 0x482163 VADDSD %XMM3,%XMM2,%XMM2 |
(895) 0x482167 VMOVSD %XMM9,%XMM9,%XMM3 |
(895) 0x48216b VFMADD213SD (%R11,%RAX,8),%XMM0,%XMM3 |
(895) 0x482171 VMOVSD %XMM3,(%R11,%RAX,8) |
(895) 0x482177 VMOVSD %XMM30,%XMM30,%XMM3 |
(895) 0x48217d VFMADD213SD (%R14,%RAX,8),%XMM0,%XMM3 |
(895) 0x482183 VMOVSD %XMM3,(%R14,%RAX,8) |
(895) 0x482189 VMOVSD %XMM29,%XMM29,%XMM3 |
(895) 0x48218f VFMADD213SD (%R15,%RAX,8),%XMM1,%XMM3 |
(895) 0x482195 VMOVSD %XMM3,(%R15,%RAX,8) |
(895) 0x48219b VMOVSD %XMM31,%XMM31,%XMM3 |
(895) 0x4821a1 VFMADD213SD (%RDX,%RAX,8),%XMM0,%XMM3 |
(895) 0x4821a7 VMOVSD %XMM3,(%RDX,%RAX,8) |
(895) 0x4821ac VMOVSD %XMM5,%XMM5,%XMM3 |
(895) 0x4821b0 VFMADD213SD (%RSI,%RAX,8),%XMM1,%XMM3 |
(895) 0x4821b6 VMOVSD %XMM3,(%RSI,%RAX,8) |
(895) 0x4821bb VMOVSD %XMM5,%XMM5,%XMM3 |
(895) 0x4821bf VFMADD213SD (%RBX,%RAX,8),%XMM4,%XMM2 |
(895) 0x4821c5 VMOVSD %XMM2,(%RBX,%RAX,8) |
(895) 0x4821ca VMOVSD %XMM29,%XMM29,%XMM2 |
(895) 0x4821d0 VFMADD213SD (%R12,%RAX,8),%XMM0,%XMM2 |
(895) 0x4821d6 VMOVSD %XMM2,(%R12,%RAX,8) |
(895) 0x4821dc VFMADD213SD (%R8,%RAX,8),%XMM0,%XMM3 |
(895) 0x4821e2 VMOVSD %XMM3,(%R8,%RAX,8) |
(895) 0x4821e8 VFMADD213SD (%RCX,%RAX,8),%XMM4,%XMM1 |
(895) 0x4821ee VMOVSD %XMM1,(%RCX,%RAX,8) |
(895) 0x4821f3 MOV 0xf0(%RSP),%RCX |
(895) 0x4821fb VFMADD213SD (%RCX,%RAX,8),%XMM4,%XMM0 |
(895) 0x482201 VMOVSD %XMM0,(%RCX,%RAX,8) |
(895) 0x482206 MOV %RAX,%RCX |
(895) 0x482209 INC %RAX |
(895) 0x48220c CMP %RCX,0xe8(%RSP) |
(895) 0x482214 JNE 4820f0 |
0x48221a MOV 0xd8(%RSP),%RCX |
0x482222 ADDQ $0x8,0xe0(%RSP) |
0x48222b MOV 0xe0(%RSP),%RAX |
0x482233 MOV %RDI,%RDX |
0x482236 MOV %R13,%R9 |
0x482239 ADD %RCX,%R9 |
0x48223c ADD %RCX,%RDX |
0x48223f ADD %RCX,%R10 |
0x482242 CMP $0x20,%RAX |
0x482246 JNE 482079 |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 227 - 262 |
-------------------------------------------------------------------------------- |
227: for (int j = 0; j < 4; j++) |
228: { |
229: const T* restrict coefs = spline_m->coefs + (ix + i) * xs + (iy + j) * ys + iz * zs; |
230: const T* restrict coefszs = coefs + zs; |
231: const T* restrict coefs2zs = coefs + 2 * zs; |
232: const T* restrict coefs3zs = coefs + 3 * zs; |
233: |
234: const T pre20 = d2a[i] * b[j]; |
235: const T pre10 = da[i] * b[j]; |
236: const T pre00 = a[i] * b[j]; |
237: const T pre11 = da[i] * db[j]; |
238: const T pre01 = a[i] * db[j]; |
239: const T pre02 = a[i] * d2b[j]; |
240: |
241: const int iSplitPoint = num_splines; |
242: for (int n = 0; n < iSplitPoint; n++) |
243: { |
244: T coefsv = coefs[n]; |
245: T coefsvzs = coefszs[n]; |
246: T coefsv2zs = coefs2zs[n]; |
247: T coefsv3zs = coefs3zs[n]; |
248: |
249: T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs; |
250: T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs; |
251: T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs; |
252: |
253: hxx[n] += pre20 * sum0; |
254: hxy[n] += pre11 * sum0; |
255: hxz[n] += pre10 * sum1; |
256: hyy[n] += pre02 * sum0; |
257: hyz[n] += pre01 * sum1; |
258: hzz[n] += pre00 * sum2; |
259: gx[n] += pre10 * sum0; |
260: gy[n] += pre01 * sum0; |
261: gz[n] += pre00 * sum1; |
262: vals[n] += pre00 * sum0; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.61 |
CQA speedup if FP arith vectorized | 1.53 |
CQA speedup if fully vectorized | 5.80 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.32 |
Bottlenecks | micro-operation queue, |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 4.83 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 3.17 |
CQA cycles if fully vectorized | 0.83 |
Front-end cycles | 4.83 |
DIV/SQRT cycles | 1.50 |
P0 cycles | 1.50 |
P1 cycles | 1.25 |
P2 cycles | 1.25 |
P3 cycles | 0.50 |
P4 cycles | 3.67 |
P5 cycles | 3.67 |
P6 cycles | 3.67 |
P7 cycles | 3.00 |
P8 cycles | 3.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 30.00 |
Nb uops | 29.00 |
Nb loads | 10.00 |
Nb stores | 2.00 |
Nb stack references | 5.00 |
FLOP/cycle | 1.24 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 19.86 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 16.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 14.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.61 |
CQA speedup if FP arith vectorized | 1.53 |
CQA speedup if fully vectorized | 5.80 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.32 |
Bottlenecks | micro-operation queue, |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 4.83 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 3.17 |
CQA cycles if fully vectorized | 0.83 |
Front-end cycles | 4.83 |
DIV/SQRT cycles | 1.50 |
P0 cycles | 1.50 |
P1 cycles | 1.25 |
P2 cycles | 1.25 |
P3 cycles | 0.50 |
P4 cycles | 3.67 |
P5 cycles | 3.67 |
P6 cycles | 3.67 |
P7 cycles | 3.00 |
P8 cycles | 3.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 30.00 |
Nb uops | 29.00 |
Nb loads | 10.00 |
Nb stores | 2.00 |
Nb stack references | 5.00 |
FLOP/cycle | 1.24 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 19.86 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 16.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 14.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | exec |
nb instructions | 30 |
nb uops | 29 |
loop length | 169 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 5 |
micro-operation queue | 4.83 cycles |
front end | 4.83 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.50 | 1.50 | 1.25 | 1.25 | 0.50 | 3.67 | 3.67 | 3.67 | 3.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 1.50 | 1.50 | 1.25 | 1.25 | 0.50 | 3.67 | 3.67 | 3.67 | 3.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 4.83 |
Dispatch | 3.67 |
Data deps. | 1.00 |
Overall L1 | 4.83 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0xd0(%RSP),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %R9,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RDX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xc8(%RSP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RCX,0xd8(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SUB %R13,%RSI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R9,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0x120(%RSP,%RAX,1),%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x160(%RSP,%RAX,1),%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV %RSI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xc0(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMULSD 0x1a0(%RSP,%RAX,1),%XMM22,%XMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMULSD %XMM4,%XMM24,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM23,%XMM4,%XMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM5,%XMM23,%XMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM22,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM5,%XMM22,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
NOPW (%RAX,%RAX,1) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.09 |
MOV 0xd8(%RSP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADDQ $0x8,0xe0(%RSP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RDI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R13,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %RCX,%R9 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %RCX,%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP $0x20,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JNE 482079 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x769> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | exec |
nb instructions | 30 |
nb uops | 29 |
loop length | 169 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 5 |
micro-operation queue | 4.83 cycles |
front end | 4.83 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.50 | 1.50 | 1.25 | 1.25 | 0.50 | 3.67 | 3.67 | 3.67 | 3.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 1.50 | 1.50 | 1.25 | 1.25 | 0.50 | 3.67 | 3.67 | 3.67 | 3.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 4.83 |
Dispatch | 3.67 |
Data deps. | 1.00 |
Overall L1 | 4.83 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0xd0(%RSP),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %R9,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RDX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xc8(%RSP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RCX,0xd8(%RSP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SUB %R13,%RSI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R9,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0x120(%RSP,%RAX,1),%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x160(%RSP,%RAX,1),%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV %RSI,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xc0(%RSP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMULSD 0x1a0(%RSP,%RAX,1),%XMM22,%XMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMULSD %XMM4,%XMM24,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM23,%XMM4,%XMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM5,%XMM23,%XMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM22,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULSD %XMM5,%XMM22,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
NOPW (%RAX,%RAX,1) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.09 |
MOV 0xd8(%RSP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADDQ $0x8,0xe0(%RSP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV 0xe0(%RSP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV %RDI,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R13,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %RCX,%R9 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %RCX,%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP $0x20,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JNE 482079 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x769> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |