Loop Id: 678 | Module: exec | Source: MultiBsplineRef.hpp:234-270 [...] | Coverage: 0.02% |
---|
Loop Id: 678 | Module: exec | Source: MultiBsplineRef.hpp:234-270 [...] | Coverage: 0.02% |
---|
0x438c60 LEA 0x1(%RAX),%R15 |
0x438c64 MOV 0xe0(%RSP),%RDI |
0x438c6c ADD %RDI,%R11 |
0x438c6f ADD %RDI,%R10 |
0x438c72 ADD %RDI,%R13 |
0x438c75 ADD %RDI,%R8 |
0x438c78 CMP $0x3,%RAX |
0x438c7c MOV %R15,%RAX |
0x438c7f MOV 0xf0(%RSP),%RDI |
0x438c87 JE 438ba0 |
0x438c8d CMPL $0,0x10(%RSP) |
0x438c92 JLE 438c60 |
0x438c94 VMOVSD 0x240(%RSP,%RAX,8),%XMM0 |
0x438c9d VMULSD 0xd0(%RSP),%XMM0,%XMM28 |
0x438ca5 VMOVSD 0x28(%RSP),%XMM2 |
0x438cab VMULSD %XMM2,%XMM0,%XMM20 |
0x438cb1 VMOVSD 0x1a0(%RSP,%RAX,8),%XMM1 |
0x438cba VMOVSD 0xc0(%RSP),%XMM3 |
0x438cc3 VMULSD %XMM3,%XMM0,%XMM27 |
0x438cc9 VMULSD %XMM2,%XMM1,%XMM21 |
0x438ccf VMULSD %XMM3,%XMM1,%XMM2 |
0x438cd3 VMULSD 0x160(%RSP,%RAX,8),%XMM3,%XMM19 |
0x438cdb CMPQ $0,0x20(%RSP) |
0x438ce1 MOV %RAX,0x60(%RSP) |
0x438ce6 JE 438e90 |
0x438cec VBROADCASTSD %XMM28,%ZMM22 |
0x438cf2 VBROADCASTSD %XMM21,%ZMM23 |
0x438cf8 VBROADCASTSD %XMM20,%ZMM24 |
0x438cfe VBROADCASTSD %XMM19,%ZMM25 |
0x438d04 VBROADCASTSD %XMM2,%ZMM26 |
0x438d0a VMOVAPD %XMM27,%XMM3 |
0x438d10 VBROADCASTSD %XMM27,%ZMM27 |
0x438d16 XOR %EAX,%EAX |
0x438d18 NOPL (%RAX,%RAX,1) |
(679) 0x438d20 VMOVUPD (%R8,%RAX,8),%ZMM0 |
(679) 0x438d27 VMOVUPD (%R13,%RAX,8),%ZMM1 |
(679) 0x438d2f VMOVUPD (%R10,%RAX,8),%ZMM30 |
(679) 0x438d36 VMOVUPD (%R11,%RAX,8),%ZMM31 |
(679) 0x438d3d VMULPD %ZMM29,%ZMM0,%ZMM13 |
(679) 0x438d43 VMULPD %ZMM4,%ZMM31,%ZMM14 |
(679) 0x438d49 VFMADD231PD %ZMM17,%ZMM1,%ZMM13 |
(679) 0x438d4f VFMADD231PD %ZMM18,%ZMM30,%ZMM13 |
(679) 0x438d55 VFMADD231PD %ZMM14,%ZMM5,%ZMM13 |
(679) 0x438d5b VMULPD %ZMM6,%ZMM0,%ZMM15 |
(679) 0x438d61 VMULPD %ZMM7,%ZMM1,%ZMM16 |
(679) 0x438d67 VFMADD231PD %ZMM31,%ZMM9,%ZMM16 |
(679) 0x438d6d VFMADD231PD %ZMM8,%ZMM30,%ZMM15 |
(679) 0x438d73 VFMADD231PD %ZMM16,%ZMM4,%ZMM15 |
(679) 0x438d79 LEA (%R14,%RAX,8),%RDI |
(679) 0x438d7d VMOVUPD (%R14,%RAX,8),%ZMM16 |
(679) 0x438d84 VFMADD231PD %ZMM13,%ZMM22,%ZMM16 |
(679) 0x438d8a VMOVUPD %ZMM16,(%R14,%RAX,8) |
(679) 0x438d91 VMOVUPD (%RDI,%RBX,8),%ZMM16 |
(679) 0x438d98 VFMADD231PD %ZMM13,%ZMM23,%ZMM16 |
(679) 0x438d9e VMOVUPD %ZMM16,(%RDI,%RBX,8) |
(679) 0x438da5 LEA (%RDI,%RBX,8),%RDI |
(679) 0x438da9 VMOVUPD (%R9,%RDI,1),%ZMM16 |
(679) 0x438db0 VFMADD231PD %ZMM24,%ZMM15,%ZMM16 |
(679) 0x438db6 VMOVUPD %ZMM16,(%R9,%RDI,1) |
(679) 0x438dbd LEA (%RDI,%R9,1),%RDI |
(679) 0x438dc1 VMOVUPD (%R9,%RDI,1),%ZMM16 |
(679) 0x438dc8 VFMADD231PD %ZMM13,%ZMM25,%ZMM16 |
(679) 0x438dce VMOVUPD %ZMM16,(%R9,%RDI,1) |
(679) 0x438dd5 LEA (%RDI,%R9,1),%RDI |
(679) 0x438dd9 VMOVUPD (%R9,%RDI,1),%ZMM16 |
(679) 0x438de0 VFMADD231PD %ZMM26,%ZMM15,%ZMM16 |
(679) 0x438de6 VMOVUPD %ZMM16,(%R9,%RDI,1) |
(679) 0x438ded LEA (%RDI,%R9,1),%RDI |
(679) 0x438df1 VFMADD213PD %ZMM14,%ZMM10,%ZMM0 |
(679) 0x438df7 VFMADD231PD %ZMM1,%ZMM11,%ZMM0 |
(679) 0x438dfd VFMADD231PD %ZMM30,%ZMM12,%ZMM0 |
(679) 0x438e03 VFMADD213PD (%R9,%RDI,1),%ZMM27,%ZMM0 |
(679) 0x438e0a VMOVUPD %ZMM0,(%R9,%RDI,1) |
(679) 0x438e11 VMOVUPD (%RCX,%RAX,8),%ZMM0 |
(679) 0x438e18 VFMADD231PD %ZMM24,%ZMM13,%ZMM0 |
(679) 0x438e1e VMOVUPD %ZMM0,(%RCX,%RAX,8) |
(679) 0x438e25 VMOVUPD (%RDX,%RAX,8),%ZMM0 |
(679) 0x438e2c VFMADD231PD %ZMM26,%ZMM13,%ZMM0 |
(679) 0x438e32 VMOVUPD %ZMM0,(%RDX,%RAX,8) |
(679) 0x438e39 VFMADD213PD (%R12,%RAX,8),%ZMM27,%ZMM15 |
(679) 0x438e40 VMOVUPD %ZMM15,(%R12,%RAX,8) |
(679) 0x438e47 VFMADD213PD (%RSI,%RAX,8),%ZMM27,%ZMM13 |
(679) 0x438e4e VMOVUPD %ZMM13,(%RSI,%RAX,8) |
(679) 0x438e55 ADD $0x8,%RAX |
(679) 0x438e59 CMP 0x20(%RSP),%RAX |
(679) 0x438e5e JL 438d20 |
0x438e64 MOV 0x20(%RSP),%RAX |
0x438e69 MOV %RAX,%R15 |
0x438e6c CMP 0x10(%RSP),%RAX |
0x438e71 MOV 0x60(%RSP),%RAX |
0x438e76 VMOVAPD %XMM3,%XMM27 |
0x438e7c JE 438c60 |
0x438e82 JMP 438e93 |
0x438e90 XOR %R15D,%R15D |
0x438e93 VPBROADCASTQ %R15,%ZMM0 |
0x438e99 VPORQ 0x58f9d(%RIP),%ZMM0,%ZMM0 |
0x438ea3 VPCMPLTUQ 0x280(%RSP),%ZMM0,%K1 |
0x438eac KORTESTB %K1,%K1 |
0x438eb0 JE 438c60 |
0x438eb6 MOV 0x30(%RSP),%RAX |
0x438ebb IMUL 0x60(%RSP),%RAX |
0x438ec1 ADD 0xb0(%RSP),%RAX |
0x438ec9 MOV 0x98(%RSP),%RDI |
0x438ed1 ADD %R15,%RDI |
0x438ed4 ADD %RAX,%RDI |
0x438ed7 MOV 0x50(%RSP),%RSI |
0x438edc VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} |
0x438ee3 VMOVUPD 0x500(%RSP),%ZMM25 |
0x438eeb VMOVAPD %ZMM0,%ZMM25{%K1} |
0x438ef1 MOV 0x88(%RSP),%RDI |
0x438ef9 ADD %R15,%RDI |
0x438efc ADD %RAX,%RDI |
0x438eff MOV 0x50(%RSP),%RSI |
0x438f04 VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} |
0x438f0b VMOVUPD 0x540(%RSP),%ZMM16 |
0x438f13 VMOVAPD %ZMM0,%ZMM16{%K1} |
0x438f19 MOV 0x90(%RSP),%RDI |
0x438f21 ADD %R15,%RDI |
0x438f24 ADD %RAX,%RDI |
0x438f27 MOV 0x50(%RSP),%RSI |
0x438f2c VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} |
0x438f33 VMOVUPD 0x580(%RSP),%ZMM15 |
0x438f3b VMOVAPD %ZMM0,%ZMM15{%K1} |
0x438f41 MOV 0xa0(%RSP),%RDI |
0x438f49 ADD %R15,%RDI |
0x438f4c ADD %RDI,%RAX |
0x438f4f MOV 0x50(%RSP),%RSI |
0x438f54 VMOVUPD (%RSI,%RAX,8),%ZMM0{%K1}{z} |
0x438f5b MOV 0x158(%RSP),%RSI |
0x438f63 VMOVUPD 0x5c0(%RSP),%ZMM1 |
0x438f6b VMOVAPD %ZMM0,%ZMM1{%K1} |
0x438f71 VMULPD %ZMM29,%ZMM25,%ZMM22 |
0x438f77 VMULPD %ZMM4,%ZMM1,%ZMM24 |
0x438f7d VFMADD231PD %ZMM17,%ZMM16,%ZMM22 |
0x438f83 VFMADD231PD %ZMM18,%ZMM15,%ZMM22 |
0x438f89 VFMADD231PD %ZMM24,%ZMM5,%ZMM22 |
0x438f8f VMOVUPD (%R14,%R15,8),%ZMM0{%K1}{z} |
0x438f96 VMULPD %ZMM6,%ZMM25,%ZMM23 |
0x438f9c VMOVUPD 0x600(%RSP),%ZMM13 |
0x438fa4 VMOVAPD %ZMM0,%ZMM13{%K1} |
0x438faa VMULPD %ZMM7,%ZMM16,%ZMM0 |
0x438fb0 VMOVUPD %ZMM1,0x5c0(%RSP) |
0x438fb8 VFMADD231PD %ZMM9,%ZMM1,%ZMM0 |
0x438fbe VFMADD231PD %ZMM8,%ZMM15,%ZMM23 |
0x438fc4 VBROADCASTSD %XMM28,%ZMM1 |
0x438fca VMOVUPD %ZMM13,0x600(%RSP) |
0x438fd2 VFMADD213PD %ZMM13,%ZMM22,%ZMM1 |
0x438fd8 VMOVUPD %ZMM1,(%R14,%R15,8){%K1} |
0x438fdf LEA (%R15,%RBX,1),%RAX |
0x438fe3 MOV %RAX,0xb8(%RSP) |
0x438feb VMOVUPD (%R14,%RAX,8),%ZMM1{%K1}{z} |
0x438ff2 VFMADD231PD %ZMM0,%ZMM4,%ZMM23 |
0x438ff8 VMOVUPD 0x640(%RSP),%ZMM13 |
0x439000 VMOVAPD %ZMM1,%ZMM13{%K1} |
0x439006 VBROADCASTSD %XMM21,%ZMM0 |
0x43900c VMOVUPD %ZMM13,0x640(%RSP) |
0x439014 VFMADD213PD %ZMM13,%ZMM22,%ZMM0 |
0x43901a VMOVUPD %ZMM0,(%R14,%RAX,8){%K1} |
0x439021 VBROADCASTSD %XMM20,%ZMM0 |
0x439027 MOV 0x48(%RSP),%RAX |
0x43902c LEA (%RAX,%R15,1),%RAX |
0x439030 VMOVUPD (%R14,%RAX,8),%ZMM1{%K1}{z} |
0x439037 VMOVUPD 0x680(%RSP),%ZMM13 |
0x43903f VMOVAPD %ZMM1,%ZMM13{%K1} |
0x439045 VMOVAPD %ZMM0,%ZMM1 |
0x43904b VMOVUPD %ZMM13,0x680(%RSP) |
0x439053 VFMADD213PD %ZMM13,%ZMM23,%ZMM1 |
0x439059 VMOVUPD %ZMM1,(%R14,%RAX,8){%K1} |
0x439060 MOV 0x38(%RSP),%RDI |
0x439065 LEA (%RDI,%R15,1),%RDI |
0x439069 VMOVUPD (%R14,%RDI,8),%ZMM1{%K1}{z} |
0x439070 VMOVUPD 0x6c0(%RSP),%ZMM13 |
0x439078 VMOVAPD %ZMM1,%ZMM13{%K1} |
0x43907e VBROADCASTSD %XMM19,%ZMM1 |
0x439084 VMOVUPD %ZMM13,0x6c0(%RSP) |
0x43908c VFMADD213PD %ZMM13,%ZMM22,%ZMM1 |
0x439092 VMOVUPD %ZMM1,(%R14,%RDI,8){%K1} |
0x439099 VBROADCASTSD %XMM2,%ZMM1 |
0x43909f MOV 0xa8(%RSP),%RDI |
0x4390a7 LEA (%RDI,%R15,1),%RDI |
0x4390ab VMOVUPD (%R14,%RDI,8),%ZMM13{%K1}{z} |
0x4390b2 VMOVUPD 0x700(%RSP),%ZMM14 |
0x4390ba VMOVAPD %ZMM13,%ZMM14{%K1} |
0x4390c0 VMOVAPD %ZMM1,%ZMM13 |
0x4390c6 VMOVUPD %ZMM14,0x700(%RSP) |
0x4390ce VFMADD213PD %ZMM14,%ZMM23,%ZMM13 |
0x4390d4 VMOVUPD %ZMM13,(%R14,%RDI,8){%K1} |
0x4390db MOV 0x40(%RSP),%RDI |
0x4390e0 LEA (%RDI,%R15,1),%RDI |
0x4390e4 VMOVUPD (%R14,%RDI,8),%ZMM13{%K1}{z} |
0x4390eb VMOVUPD 0x740(%RSP),%ZMM14 |
0x4390f3 VMOVAPD %ZMM13,%ZMM14{%K1} |
0x4390f9 VBROADCASTSD %XMM27,%ZMM13 |
0x4390ff VMOVUPD %ZMM25,0x500(%RSP) |
0x439107 VFMADD231PD %ZMM25,%ZMM10,%ZMM24 |
0x43910d VMOVUPD %ZMM16,0x540(%RSP) |
0x439115 VFMADD231PD %ZMM16,%ZMM11,%ZMM24 |
0x43911b VMOVUPD %ZMM15,0x580(%RSP) |
0x439123 VFMADD231PD %ZMM15,%ZMM12,%ZMM24 |
0x439129 VMOVUPD %ZMM14,0x740(%RSP) |
0x439131 VFMADD213PD %ZMM14,%ZMM13,%ZMM24 |
0x439137 VMOVUPD %ZMM24,(%R14,%RDI,8){%K1} |
0x43913e VMOVUPD (%RCX,%R15,8),%ZMM14{%K1}{z} |
0x439145 VMOVUPD 0x780(%RSP),%ZMM15 |
0x43914d VMOVAPD %ZMM14,%ZMM15{%K1} |
0x439153 VMOVUPD %ZMM15,0x780(%RSP) |
0x43915b VFMADD213PD %ZMM15,%ZMM22,%ZMM0 |
0x439161 VMOVUPD %ZMM0,(%RCX,%R15,8){%K1} |
0x439168 MOV 0xb8(%RSP),%RDI |
0x439170 VMOVUPD (%RCX,%RDI,8),%ZMM0{%K1}{z} |
0x439177 VMOVUPD 0x7c0(%RSP),%ZMM14 |
0x43917f VMOVAPD %ZMM0,%ZMM14{%K1} |
0x439185 VMOVUPD %ZMM14,0x7c0(%RSP) |
0x43918d VFMADD213PD %ZMM14,%ZMM22,%ZMM1 |
0x439193 VMOVUPD %ZMM1,(%RCX,%RDI,8){%K1} |
0x43919a VMOVUPD (%RCX,%RAX,8),%ZMM0{%K1}{z} |
0x4391a1 VMOVUPD 0x800(%RSP),%ZMM1 |
0x4391a9 VMOVAPD %ZMM0,%ZMM1{%K1} |
0x4391af VMOVUPD %ZMM1,0x800(%RSP) |
0x4391b7 VFMADD213PD %ZMM1,%ZMM13,%ZMM23 |
0x4391bd VMOVUPD %ZMM23,(%RCX,%RAX,8){%K1} |
0x4391c4 MOV 0x60(%RSP),%RAX |
0x4391c9 VMOVUPD (%RSI,%R15,8),%ZMM0{%K1}{z} |
0x4391d0 VMOVUPD 0x840(%RSP),%ZMM1 |
0x4391d8 VMOVAPD %ZMM0,%ZMM1{%K1} |
0x4391de VMOVUPD %ZMM1,0x840(%RSP) |
0x4391e6 VFMADD213PD %ZMM1,%ZMM13,%ZMM22 |
0x4391ec VMOVUPD %ZMM22,(%RSI,%R15,8){%K1} |
0x4391f3 JMP 438c60 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorOps.h: 59 - 59 |
-------------------------------------------------------------------------------- |
59: for (unsigned d = 0; d < D; ++d) |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 234 - 270 |
-------------------------------------------------------------------------------- |
234: for (int j = 0; j < 4; j++) |
[...] |
241: const T pre20 = d2a[i] * b[j]; |
242: const T pre10 = da[i] * b[j]; |
243: const T pre00 = a[i] * b[j]; |
244: const T pre11 = da[i] * db[j]; |
245: const T pre01 = a[i] * db[j]; |
246: const T pre02 = a[i] * d2b[j]; |
247: |
248: const int iSplitPoint = num_splines; |
249: #pragma omp simd aligned(coefs, coefszs, coefs2zs, coefs3zs: QMC_SIMD_ALIGNMENT) simdlen(simdlen_) |
250: for (int n = 0; n < iSplitPoint; n++) |
251: { |
252: T coefsv = coefs[n]; |
253: T coefsvzs = coefszs[n]; |
254: T coefsv2zs = coefs2zs[n]; |
255: T coefsv3zs = coefs3zs[n]; |
256: |
257: T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs; |
258: T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs; |
259: T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs; |
260: |
261: hxx[n] += pre20 * sum0; |
262: hxy[n] += pre11 * sum0; |
263: hxz[n] += pre10 * sum1; |
264: hyy[n] += pre02 * sum0; |
265: hyz[n] += pre01 * sum1; |
266: hzz[n] += pre00 * sum2; |
267: gx[n] += pre10 * sum0; |
268: gy[n] += pre01 * sum0; |
269: gz[n] += pre00 * sum1; |
270: vals[n] += pre00 * sum0; |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineData.hpp: 71 - 71 |
-------------------------------------------------------------------------------- |
71: a[3] = ((A30 * tx + A31) * tx + A32) * tx + A33; |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:100 | exec |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:202 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.15 |
CQA speedup if FP arith vectorized | 1.03 |
CQA speedup if fully vectorized | 1.20 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.48 |
Bottlenecks | micro-operation queue |
Function | miniqmcreference::einspline_spo_ref |
Source | TinyVectorOps.h:59-59,MultiBsplineRef.hpp:234-270,MultiBsplineData.hpp:71-71 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 44.25 |
CQA cycles if no scalar integer | 38.50 |
CQA cycles if FP arith vectorized | 43.16 |
CQA cycles if fully vectorized | 37.02 |
Front-end cycles | 44.25 |
DIV/SQRT cycles | 21.50 |
P0 cycles | 16.25 |
P1 cycles | 30.00 |
P2 cycles | 30.00 |
P3 cycles | 26.00 |
P4 cycles | 21.00 |
P5 cycles | 16.25 |
P6 cycles | 26.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 44.55 |
Stall cycles (UFS) | 0.00 |
Nb insns | 177.00 |
Nb uops | 176.00 |
Nb loads | 61.00 |
Nb stores | 26.00 |
Nb stack references | 36.00 |
FLOP/cycle | 7.73 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 38.00 |
Nb FLOP fma | 152.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 83.98 |
Bytes prefetched | 0.00 |
Bytes loaded | 2164.00 |
Bytes stored | 1552.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 74.22 |
Vectorization ratio load | 76.92 |
Vectorization ratio store | 92.31 |
Vectorization ratio mul | 40.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 54.05 |
Vector-efficiency ratio all | 76.22 |
Vector-efficiency ratio load | 79.65 |
Vector-efficiency ratio store | 93.27 |
Vector-efficiency ratio mul | 47.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 55.57 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.15 |
CQA speedup if FP arith vectorized | 1.03 |
CQA speedup if fully vectorized | 1.20 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.48 |
Bottlenecks | micro-operation queue |
Function | miniqmcreference::einspline_spo_ref |
Source | TinyVectorOps.h:59-59,MultiBsplineRef.hpp:234-270,MultiBsplineData.hpp:71-71 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 44.25 |
CQA cycles if no scalar integer | 38.50 |
CQA cycles if FP arith vectorized | 43.16 |
CQA cycles if fully vectorized | 37.02 |
Front-end cycles | 44.25 |
DIV/SQRT cycles | 21.50 |
P0 cycles | 16.25 |
P1 cycles | 30.00 |
P2 cycles | 30.00 |
P3 cycles | 26.00 |
P4 cycles | 21.00 |
P5 cycles | 16.25 |
P6 cycles | 26.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 44.55 |
Stall cycles (UFS) | 0.00 |
Nb insns | 177.00 |
Nb uops | 176.00 |
Nb loads | 61.00 |
Nb stores | 26.00 |
Nb stack references | 36.00 |
FLOP/cycle | 7.73 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 38.00 |
Nb FLOP fma | 152.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 83.98 |
Bytes prefetched | 0.00 |
Bytes loaded | 2164.00 |
Bytes stored | 1552.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 74.22 |
Vectorization ratio load | 76.92 |
Vectorization ratio store | 92.31 |
Vectorization ratio mul | 40.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 54.05 |
Vector-efficiency ratio all | 76.22 |
Vector-efficiency ratio load | 79.65 |
Vector-efficiency ratio store | 93.27 |
Vector-efficiency ratio mul | 47.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 55.57 |
Path / |
Function | miniqmcreference::einspline_spo_ref |
Source file and lines | MultiBsplineRef.hpp:234-270 |
Module | exec |
nb instructions | 177 |
nb uops | 176 |
loop length | 1096 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 24 |
nb stack references | 36 |
micro-operation queue | 44.25 cycles |
front end | 44.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 21.50 | 16.25 | 30.00 | 30.00 | 26.00 | 21.00 | 16.25 | 26.00 |
cycles | 21.50 | 16.25 | 30.00 | 30.00 | 26.00 | 21.00 | 16.25 | 26.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 44.55 |
Stall cycles | 0.00 |
Front-end | 44.25 |
Dispatch | 30.00 |
Overall L1 | 44.25 |
all | 15% |
load | 40% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 28% |
all | 80% |
load | 82% |
store | 100% |
mul | 40% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 74% |
load | 76% |
store | 92% |
mul | 40% |
add-sub | 0% |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 54% |
all | 25% |
load | 46% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 36% |
all | 81% |
load | 84% |
store | 100% |
mul | 47% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 76% |
load | 79% |
store | 93% |
mul | 47% |
add-sub | 12% |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 55% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
LEA 0x1(%RAX),%R15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0xe0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %RDI,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP $0x3,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R15,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0xf0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
JE 438ba0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
CMPL $0,0x10(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 438c60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD 0x240(%RSP,%RAX,8),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD 0xd0(%RSP),%XMM0,%XMM28 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x28(%RSP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD %XMM2,%XMM0,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x1a0(%RSP,%RAX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0xc0(%RSP),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD %XMM3,%XMM0,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM2,%XMM1,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x160(%RSP,%RAX,8),%XMM3,%XMM19 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMPQ $0,0x20(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
MOV %RAX,0x60(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
JE 438e90 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VBROADCASTSD %XMM28,%ZMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM21,%ZMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM20,%ZMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM19,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM2,%ZMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVAPD %XMM27,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM27,%ZMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x20(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RAX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CMP 0x10(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
MOV 0x60(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVAPD %XMM3,%XMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JE 438c60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
JMP 438e93 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VPBROADCASTQ %R15,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPORQ 0x58f9d(%RIP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPCMPLTUQ 0x280(%RSP),%ZMM0,%K1 | |||||||||||
KORTESTB %K1,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
JE 438c60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV 0x30(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL 0x60(%RSP),%RAX | 1 | 0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0xb0(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
MOV 0x98(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x500(%RSP),%ZMM25 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM25{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x88(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x540(%RSP),%ZMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM16{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x90(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x580(%RSP),%ZMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM15{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0xa0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RAX,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
MOV 0x158(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x5c0(%RSP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM1{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMULPD %ZMM29,%ZMM25,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM4,%ZMM1,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM17,%ZMM16,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM18,%ZMM15,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM24,%ZMM5,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%R14,%R15,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMULPD %ZMM6,%ZMM25,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x600(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMULPD %ZMM7,%ZMM16,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,0x5c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM9,%ZMM1,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM8,%ZMM15,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VBROADCASTSD %XMM28,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM13,0x600(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R14,%R15,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R15,%RBX,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RAX,0xb8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD (%R14,%RAX,8),%ZMM1{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD231PD %ZMM0,%ZMM4,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x640(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM1,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM21,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM13,0x640(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM22,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%R14,%RAX,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VBROADCASTSD %XMM20,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
MOV 0x48(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RAX,%R15,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RAX,8),%ZMM1{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x680(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM1,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %ZMM0,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM13,0x680(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM23,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R14,%RAX,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x38(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDI,%R15,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RDI,8),%ZMM1{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x6c0(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM1,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM19,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM13,0x6c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R14,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VBROADCASTSD %XMM2,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
MOV 0xa8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDI,%R15,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RDI,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x700(%RSP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM13,%ZMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %ZMM1,%ZMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM14,0x700(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM14,%ZMM23,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM13,(%R14,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x40(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDI,%R15,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RDI,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x740(%RSP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM13,%ZMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM27,%ZMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM25,0x500(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM25,%ZMM10,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM16,0x540(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM16,%ZMM11,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM15,0x580(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM15,%ZMM12,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM14,0x740(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM14,%ZMM13,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM24,(%R14,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD (%RCX,%R15,8),%ZMM14{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x780(%RSP),%ZMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM14,%ZMM15{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM15,0x780(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM15,%ZMM22,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%RCX,%R15,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0xb8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RCX,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x7c0(%RSP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM14,0x7c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM14,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%RCX,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD (%RCX,%RAX,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x800(%RSP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM1{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM1,0x800(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM1,%ZMM13,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM23,(%RCX,%RAX,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x60(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%R15,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x840(%RSP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM1{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM1,0x840(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM1,%ZMM13,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM22,(%RSI,%R15,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
JMP 438c60 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
Function | miniqmcreference::einspline_spo_ref |
Source file and lines | MultiBsplineRef.hpp:234-270 |
Module | exec |
nb instructions | 177 |
nb uops | 176 |
loop length | 1096 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 24 |
nb stack references | 36 |
micro-operation queue | 44.25 cycles |
front end | 44.25 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 21.50 | 16.25 | 30.00 | 30.00 | 26.00 | 21.00 | 16.25 | 26.00 |
cycles | 21.50 | 16.25 | 30.00 | 30.00 | 26.00 | 21.00 | 16.25 | 26.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 44.55 |
Stall cycles | 0.00 |
Front-end | 44.25 |
Dispatch | 30.00 |
Overall L1 | 44.25 |
all | 15% |
load | 40% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 28% |
all | 80% |
load | 82% |
store | 100% |
mul | 40% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 74% |
load | 76% |
store | 92% |
mul | 40% |
add-sub | 0% |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 54% |
all | 25% |
load | 46% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 36% |
all | 81% |
load | 84% |
store | 100% |
mul | 47% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 76% |
load | 79% |
store | 93% |
mul | 47% |
add-sub | 12% |
fma | 100% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 55% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
LEA 0x1(%RAX),%R15 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0xe0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %RDI,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP $0x3,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R15,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0xf0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
JE 438ba0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
CMPL $0,0x10(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 438c60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD 0x240(%RSP,%RAX,8),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD 0xd0(%RSP),%XMM0,%XMM28 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x28(%RSP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD %XMM2,%XMM0,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x1a0(%RSP,%RAX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0xc0(%RSP),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD %XMM3,%XMM0,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM2,%XMM1,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x160(%RSP,%RAX,8),%XMM3,%XMM19 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMPQ $0,0x20(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
MOV %RAX,0x60(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
JE 438e90 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VBROADCASTSD %XMM28,%ZMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM21,%ZMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM20,%ZMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM19,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM2,%ZMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVAPD %XMM27,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM27,%ZMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x20(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RAX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CMP 0x10(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
MOV 0x60(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVAPD %XMM3,%XMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JE 438c60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
JMP 438e93 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VPBROADCASTQ %R15,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPORQ 0x58f9d(%RIP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPCMPLTUQ 0x280(%RSP),%ZMM0,%K1 | |||||||||||
KORTESTB %K1,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
JE 438c60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV 0x30(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL 0x60(%RSP),%RAX | 1 | 0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD 0xb0(%RSP),%RAX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
MOV 0x98(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x500(%RSP),%ZMM25 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM25{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x88(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x540(%RSP),%ZMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM16{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x90(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x580(%RSP),%ZMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM15{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0xa0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %R15,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RDI,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%RAX,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
MOV 0x158(%RSP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x5c0(%RSP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM1{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMULPD %ZMM29,%ZMM25,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM4,%ZMM1,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM17,%ZMM16,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM18,%ZMM15,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM24,%ZMM5,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%R14,%R15,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMULPD %ZMM6,%ZMM25,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x600(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMULPD %ZMM7,%ZMM16,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,0x5c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM9,%ZMM1,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM8,%ZMM15,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VBROADCASTSD %XMM28,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM13,0x600(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R14,%R15,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R15,%RBX,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RAX,0xb8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD (%R14,%RAX,8),%ZMM1{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD231PD %ZMM0,%ZMM4,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x640(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM1,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM21,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM13,0x640(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM22,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%R14,%RAX,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VBROADCASTSD %XMM20,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
MOV 0x48(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RAX,%R15,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RAX,8),%ZMM1{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x680(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM1,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %ZMM0,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM13,0x680(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM23,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R14,%RAX,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x38(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDI,%R15,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RDI,8),%ZMM1{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x6c0(%RSP),%ZMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM1,%ZMM13{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM19,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM13,0x6c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM13,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R14,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VBROADCASTSD %XMM2,%ZMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
MOV 0xa8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDI,%R15,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RDI,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x700(%RSP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM13,%ZMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %ZMM1,%ZMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM14,0x700(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM14,%ZMM23,%ZMM13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM13,(%R14,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x40(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDI,%R15,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD (%R14,%RDI,8),%ZMM13{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x740(%RSP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM13,%ZMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD %XMM27,%ZMM13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPD %ZMM25,0x500(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM25,%ZMM10,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM16,0x540(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM16,%ZMM11,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM15,0x580(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231PD %ZMM15,%ZMM12,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM14,0x740(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM14,%ZMM13,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM24,(%R14,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD (%RCX,%R15,8),%ZMM14{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x780(%RSP),%ZMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM14,%ZMM15{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM15,0x780(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM15,%ZMM22,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%RCX,%R15,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0xb8(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RCX,%RDI,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x7c0(%RSP),%ZMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM14{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM14,0x7c0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM14,%ZMM22,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%RCX,%RDI,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD (%RCX,%RAX,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x800(%RSP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM1{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM1,0x800(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM1,%ZMM13,%ZMM23 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM23,(%RCX,%RAX,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x60(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%RSI,%R15,8),%ZMM0{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x840(%RSP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVAPD %ZMM0,%ZMM1{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD %ZMM1,0x840(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD213PD %ZMM1,%ZMM13,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM22,(%RSI,%R15,8){%K1} | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
JMP 438c60 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |