Loop Id: 225 | Module: exec | Source: BsplineFunctor.h:302-335 | Coverage: 0.14% |
---|
0x41b000 VMOVUPD (%R14,%RBX,8),%ZMM22 [5] |
0x41b007 VMULPD %ZMM15,%ZMM22,%ZMM24 |
0x41b00d VMOVDQA64 %ZMM27,%ZMM4 |
0x41b013 VCVTTPD2DQ %ZMM24,%YMM27 |
0x41b019 VXORPD %XMM28,%XMM28,%XMM28 |
0x41b01f KXNORW %K0,%K0,%K1 |
0x41b023 VGATHERDPD (%RCX,%YMM27,8),%ZMM28{%K1} [6] |
0x41b02a VXORPD %XMM26,%XMM26,%XMM26 |
0x41b030 KXNORW %K0,%K0,%K1 |
0x41b034 VXORPD %XMM25,%XMM25,%XMM25 |
0x41b03a KXNORW %K0,%K0,%K2 |
0x41b03e VGATHERDPD 0x8(%RCX,%YMM27,8),%ZMM26{%K1} [6] |
0x41b046 VXORPD %XMM23,%XMM23,%XMM23 |
0x41b04c VGATHERDPD 0x10(%RCX,%YMM27,8),%ZMM25{%K2} [6] |
0x41b054 KXNORW %K0,%K0,%K1 |
0x41b058 VGATHERDPD 0x18(%RCX,%YMM27,8),%ZMM23{%K1} [6] |
0x41b060 VRNDSCALEPD $0xb,%ZMM24,%ZMM27 |
0x41b067 VSUBPD %ZMM27,%ZMM24,%ZMM29 |
0x41b06d VMOVAPD %ZMM29,%ZMM27 |
0x41b073 VMOVUPD 0x4c0(%RSP),%ZMM24 [7] |
0x41b07b VFMADD132PD 0x500(%RSP),%ZMM24,%ZMM27 [7] |
0x41b083 VFMADD213PD 0x480(%RSP),%ZMM29,%ZMM27 [7] |
0x41b08b VMOVAPD %ZMM29,%ZMM30 |
0x41b091 VMOVAPD %ZMM29,%ZMM24 |
0x41b097 VMOVUPD 0x2c0(%RSP),%ZMM31 [7] |
0x41b09f VFMADD132PD 0x300(%RSP),%ZMM31,%ZMM24 [7] |
0x41b0a7 VMULPD %ZMM28,%ZMM24,%ZMM24 |
0x41b0ad VMOVUPD 0x680(%RSP),%ZMM31 [7] |
0x41b0b5 VFMADD132PD 0x280(%RSP),%ZMM31,%ZMM30 [7] |
0x41b0bd VFMADD213PD %ZMM24,%ZMM26,%ZMM30 |
0x41b0c3 VMOVAPD %ZMM29,%ZMM24 |
0x41b0c9 VFMADD213PD %ZMM3,%ZMM7,%ZMM24 |
0x41b0cf VFMADD213PD %ZMM8,%ZMM29,%ZMM24 |
0x41b0d5 VFMADD213PD %ZMM9,%ZMM29,%ZMM24 |
0x41b0db VMULPD %ZMM28,%ZMM27,%ZMM27 |
0x41b0e1 VMULPD %ZMM28,%ZMM24,%ZMM24 |
0x41b0e7 VMOVAPD %ZMM29,%ZMM28 |
0x41b0ed VMOVUPD 0x400(%RSP),%ZMM31 [7] |
0x41b0f5 VFMADD132PD 0x440(%RSP),%ZMM31,%ZMM28 [7] |
0x41b0fd VFMADD213PD 0x3c0(%RSP),%ZMM29,%ZMM28 [7] |
0x41b105 VFMADD213PD %ZMM27,%ZMM26,%ZMM28 |
0x41b10b VMOVAPD %ZMM29,%ZMM27 |
0x41b111 VFMADD213PD %ZMM11,%ZMM10,%ZMM27 |
0x41b117 VFMADD213PD %ZMM12,%ZMM29,%ZMM27 |
0x41b11d VFMADD213PD %ZMM14,%ZMM29,%ZMM27 |
0x41b123 VFMADD213PD %ZMM24,%ZMM26,%ZMM27 |
0x41b129 VMOVAPD %ZMM29,%ZMM24 |
0x41b12f VMOVUPD 0x600(%RSP),%ZMM26 [7] |
0x41b137 VFMADD132PD 0x640(%RSP),%ZMM26,%ZMM24 [7] |
0x41b13f VFMADD213PD %ZMM30,%ZMM25,%ZMM24 |
0x41b145 VMOVAPD %ZMM29,%ZMM26 |
0x41b14b VMOVUPD 0x340(%RSP),%ZMM30 [7] |
0x41b153 VFMADD132PD 0x380(%RSP),%ZMM30,%ZMM26 [7] |
0x41b15b VFMADD213PD %ZMM5,%ZMM29,%ZMM26 |
0x41b161 VFMADD213PD %ZMM28,%ZMM25,%ZMM26 |
0x41b167 VMOVAPD %ZMM29,%ZMM28 |
0x41b16d VFMADD213PD %ZMM0,%ZMM13,%ZMM28 |
0x41b173 VFMADD213PD %ZMM16,%ZMM29,%ZMM28 |
0x41b179 VFMADD213PD %ZMM17,%ZMM29,%ZMM28 |
0x41b17f VFMADD213PD %ZMM27,%ZMM25,%ZMM28 |
0x41b185 VMOVDQA64 %ZMM4,%ZMM27 |
0x41b18b VMOVAPD %ZMM29,%ZMM25 |
0x41b191 VMOVUPD 0x580(%RSP),%ZMM4 [7] |
0x41b199 VFMADD132PD 0x5c0(%RSP),%ZMM4,%ZMM25 [7] |
0x41b1a1 VFMADD213PD %ZMM24,%ZMM23,%ZMM25 |
0x41b1a7 VMOVAPD %ZMM29,%ZMM24 |
0x41b1ad VFMADD213PD %ZMM6,%ZMM1,%ZMM24 |
0x41b1b3 VFMADD213PD %ZMM2,%ZMM29,%ZMM24 |
0x41b1b9 VFMADD213PD %ZMM26,%ZMM23,%ZMM24 |
0x41b1bf VPMOVSXDQ (%R13,%RBX,4),%ZMM26 [2] |
0x41b1c7 VMULPD 0x540(%RSP),%ZMM25,%ZMM25 [7] |
0x41b1cf VPADDQ %ZMM26,%ZMM27,%ZMM26 |
0x41b1d5 KXNORW %K0,%K0,%K1 |
0x41b1d9 VSCATTERQPD %ZMM25,(%RDX,%ZMM26,8){%K1} [1] |
0x41b1e0 VMULPD %ZMM15,%ZMM24,%ZMM24 |
0x41b1e6 VDIVPD %ZMM22,%ZMM24,%ZMM22 |
0x41b1ec KXNORW %K0,%K0,%K1 |
0x41b1f0 VSCATTERQPD %ZMM22,(%R10,%ZMM26,8){%K1} [4] |
0x41b1f7 VMOVAPD %ZMM29,%ZMM22 |
0x41b1fd VFMADD213PD %ZMM19,%ZMM18,%ZMM22 |
0x41b203 VFMADD213PD %ZMM20,%ZMM29,%ZMM22 |
0x41b209 VFMADD213PD %ZMM21,%ZMM29,%ZMM22 |
0x41b20f VFMADD213PD %ZMM28,%ZMM23,%ZMM22 |
0x41b215 KXNORW %K0,%K0,%K1 |
0x41b219 VSCATTERQPD %ZMM22,(%R9,%ZMM26,8){%K1} [3] |
0x41b220 ADD $0x8,%RBX |
0x41b224 CMP %RSI,%RBX |
0x41b227 JB 41b000 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h: 302 - 335 |
-------------------------------------------------------------------------------- |
302: #pragma omp simd |
303: for (int j = 0; j < iCount; j++) |
304: { |
305: real_type r = distArrayCompressed[j]; |
306: int iScatter = distIndices[j]; |
307: real_type rinv = cOne / r; |
308: r *= DeltaRInv; |
309: int iGather = (int)r; |
310: real_type t = r - real_type(iGather); |
311: real_type tp0 = t * t * t; |
312: real_type tp1 = t * t; |
313: real_type tp2 = t; |
314: |
315: real_type sCoef0 = SplineCoefs[iGather + 0]; |
316: real_type sCoef1 = SplineCoefs[iGather + 1]; |
317: real_type sCoef2 = SplineCoefs[iGather + 2]; |
318: real_type sCoef3 = SplineCoefs[iGather + 3]; |
319: |
320: // clang-format off |
321: laplArray[iScatter] = dSquareDeltaRinv * |
322: (sCoef0*( d2A[ 2]*tp2 + d2A[ 3])+ |
323: sCoef1*( d2A[ 6]*tp2 + d2A[ 7])+ |
324: sCoef2*( d2A[10]*tp2 + d2A[11])+ |
325: sCoef3*( d2A[14]*tp2 + d2A[15])); |
326: |
327: gradArray[iScatter] = DeltaRInv * rinv * |
328: (sCoef0*( dA[ 1]*tp1 + dA[ 2]*tp2 + dA[ 3])+ |
329: sCoef1*( dA[ 5]*tp1 + dA[ 6]*tp2 + dA[ 7])+ |
330: sCoef2*( dA[ 9]*tp1 + dA[10]*tp2 + dA[11])+ |
331: sCoef3*( dA[13]*tp1 + dA[14]*tp2 + dA[15])); |
332: |
333: valArray[iScatter] = (sCoef0*(A[ 0]*tp0 + A[ 1]*tp1 + A[ 2]*tp2 + A[ 3])+ |
334: sCoef1*(A[ 4]*tp0 + A[ 5]*tp1 + A[ 6]*tp2 + A[ 7])+ |
335: sCoef2*(A[ 8]*tp0 + A[ 9]*tp1 + A[10]*tp2 + A[11])+ |
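A compact restatement of the per-element arithmetic this loop performs may help when reading the assembly above; here $c$ stands for SplineCoefs, $s$ for iScatter, $\Delta^{-1}$ for DeltaRInv, and dSquareDeltaRinv is assumed to equal $(\Delta^{-1})^2$. The quoted range stops at source line 335, so the final sCoef3 term of valArray is inferred from the pattern of the other three; $r$ is assumed positive so the C cast matches the floor.

\begin{aligned}
i &= \lfloor r\,\Delta^{-1} \rfloor, \qquad t = r\,\Delta^{-1} - i \\
\text{lapl}[s] &\approx (\Delta^{-1})^{2} \sum_{k=0}^{3} c_{i+k}\,\bigl(d2A_{4k+2}\,t + d2A_{4k+3}\bigr) \\
\text{grad}[s] &= \frac{\Delta^{-1}}{r} \sum_{k=0}^{3} c_{i+k}\,\bigl(dA_{4k+1}\,t^{2} + dA_{4k+2}\,t + dA_{4k+3}\bigr) \\
\text{val}[s] &= \sum_{k=0}^{3} c_{i+k}\,\bigl(A_{4k}\,t^{3} + A_{4k+1}\,t^{2} + A_{4k+2}\,t + A_{4k+3}\bigr)
\end{aligned}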
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►70.59+ | miniqmcreference::TwoBodyJastr[...] | TwoBodyJastrowRef.h:271 | exec |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:207 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►23.53+ | miniqmcreference::TwoBodyJastr[...] | TwoBodyJastrowRef.h:271 | exec |
○ | qmcplusplus::WaveFunction::acc[...] | NewTimer.h:249 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►5.88+ | miniqmcreference::OneBodyJastr[...] | OneBodyJastrowRef.h:222 | exec |
○ | miniqmcreference::OneBodyJastr[...] | stl_vector.h:1258 | exec |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:207 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.04 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 |
Bottlenecks | micro-operation queue, |
Function | qmcplusplus::BsplineFunctor |
Source | BsplineFunctor.h:302-335 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 45.50 |
CQA cycles if no scalar integer | 43.75 |
CQA cycles if FP arith vectorized | 45.50 |
CQA cycles if fully vectorized | 45.24 |
Front-end cycles | 45.50 |
DIV/SQRT cycles | 33.50 |
P0 cycles | 20.00 |
P1 cycles | 37.50 |
P2 cycles | 37.50 |
P3 cycles | 24.00 |
P4 cycles | 33.50 |
P5 cycles | 1.00 |
P6 cycles | 0.00 |
P7 cycles | 16.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 56.93 |
Stall cycles (UFS) | 31.63 |
Nb insns | 88.00 |
Nb uops | 182.00 |
Nb loads | 23.00 |
Nb stores | 3.00 |
Nb stack references | 17.00 |
FLOP/cycle | 13.01 |
Nb FLOP add-sub | 8.00 |
Nb FLOP mul | 48.00 |
Nb FLOP fma | 264.00 |
Nb FLOP div | 8.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 35.87 |
Bytes prefetched | 0.00 |
Bytes loaded | 1440.00 |
Bytes stored | 192.00 |
Stride 0 | 1.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 4.00 |
Vectorization ratio all | 100.00 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 100.00 |
Vector-efficiency ratio all | 95.51 |
Vector-efficiency ratio load | 97.83 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 100.00 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 87.50 |
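As a sanity check on the throughput figures above (a hedged reading, assuming CQA counts each FMA as two FLOPs and the divide as one FLOP per element), the FLOP/cycle and bytes/cycle values follow directly from the per-iteration counts and the 45.50 CQA cycles:

\frac{8_{\text{add-sub}} + 48_{\text{mul}} + 2\times 264_{\text{fma}} + 8_{\text{div}}}{45.50} = \frac{592}{45.50} \approx 13.01 \ \text{FLOP/cycle},
\qquad \frac{1440 + 192}{45.50} \approx 35.87 \ \text{bytes/cycle}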
Path / |
Function | qmcplusplus::BsplineFunctor |
Source file and lines | BsplineFunctor.h:302-335 |
Module | exec |
nb instructions | 88 |
nb uops | 182 |
loop length | 557 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 1 |
used zmm registers | 32 |
nb stack references | 17 |
ADD-SUB / MUL ratio | 0.17 |
micro-operation queue | 45.50 cycles |
front end | 45.50 cycles |
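The 45.50-cycle front-end bound appears to come straight from the uop count, assuming the 4-uops-per-cycle issue width the model evidently uses; removing it would leave the 37.50-cycle dispatch bound, i.e. the reported 1.21x "next bottleneck killed" speedup:

\frac{182\ \text{uops}}{4\ \text{uops/cycle}} = 45.50\ \text{cycles},
\qquad \frac{45.50}{37.50} \approx 1.21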
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 33.50 | 1.00 | 37.50 | 37.50 | 24.00 | 33.50 | 1.00 | 0.00 |
cycles | 33.50 | 20.00 | 37.50 | 37.50 | 24.00 | 33.50 | 1.00 | 0.00 |
Cycles executing div or sqrt instructions | 16.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 56.93 |
Stall cycles | 31.63 |
ROB full (events) | 33.22 |
RS full (events) | 0.32 |
Front-end | 45.50 |
Dispatch | 37.50 |
DIV/SQRT | 16.00 |
Data deps. | 1.00 |
Overall L1 | 45.50 |
all | 100% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 100% |
all | 100% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 100% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 87% |
load | 50% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 100% |
all | 95% |
load | 100% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 86% |
all | 95% |
load | 97% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 87% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD (%R14,%RBX,8),%ZMM22 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMULPD %ZMM15,%ZMM22,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVDQA64 %ZMM27,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VCVTTPD2DQ %ZMM24,%YMM27 | 2 | 0.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 7 | 1 |
VXORPD %XMM28,%XMM28,%XMM28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERDPD (%RCX,%YMM27,8),%ZMM28{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VXORPD %XMM26,%XMM26,%XMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VXORPD %XMM25,%XMM25,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERDPD 0x8(%RCX,%YMM27,8),%ZMM26{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VXORPD %XMM23,%XMM23,%XMM23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERDPD 0x10(%RCX,%YMM27,8),%ZMM25{%K2} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERDPD 0x18(%RCX,%YMM27,8),%ZMM23{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VRNDSCALEPD $0xb,%ZMM24,%ZMM27 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VSUBPD %ZMM27,%ZMM24,%ZMM29 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD 0x4c0(%RSP),%ZMM24 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x500(%RSP),%ZMM24,%ZMM27 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD 0x480(%RSP),%ZMM29,%ZMM27 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %ZMM29,%ZMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD 0x2c0(%RSP),%ZMM31 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x300(%RSP),%ZMM31,%ZMM24 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM28,%ZMM24,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x680(%RSP),%ZMM31 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x280(%RSP),%ZMM31,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM24,%ZMM26,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMADD213PD %ZMM3,%ZMM7,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM8,%ZMM29,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM9,%ZMM29,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM28,%ZMM27,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM28,%ZMM24,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD 0x400(%RSP),%ZMM31 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x440(%RSP),%ZMM31,%ZMM28 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD 0x3c0(%RSP),%ZMM29,%ZMM28 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM27,%ZMM26,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMADD213PD %ZMM11,%ZMM10,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM12,%ZMM29,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM14,%ZMM29,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM24,%ZMM26,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD 0x600(%RSP),%ZMM26 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x640(%RSP),%ZMM26,%ZMM24 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM30,%ZMM25,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM26 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD 0x340(%RSP),%ZMM30 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x380(%RSP),%ZMM30,%ZMM26 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM5,%ZMM29,%ZMM26 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM28,%ZMM25,%ZMM26 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMADD213PD %ZMM0,%ZMM13,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM16,%ZMM29,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM17,%ZMM29,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM27,%ZMM25,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVDQA64 %ZMM4,%ZMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %ZMM29,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVUPD 0x580(%RSP),%ZMM4 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VFMADD132PD 0x5c0(%RSP),%ZMM4,%ZMM25 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM24,%ZMM23,%ZMM25 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVAPD %ZMM29,%ZMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMADD213PD %ZMM6,%ZMM1,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM2,%ZMM29,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM26,%ZMM23,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VPMOVSXDQ (%R13,%RBX,4),%ZMM26 | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
VMULPD 0x540(%RSP),%ZMM25,%ZMM25 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VPADDQ %ZMM26,%ZMM27,%ZMM26 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VSCATTERQPD %ZMM25,(%RDX,%ZMM26,8){%K1} | 27 | 0 | 0 | 4 | 4 | 8 | 1 | 0 | 0 | 15 | 11 |
VMULPD %ZMM15,%ZMM24,%ZMM24 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VDIVPD %ZMM22,%ZMM24,%ZMM22 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 24 | 16 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VSCATTERQPD %ZMM22,(%R10,%ZMM26,8){%K1} | 27 | 0 | 0 | 4 | 4 | 8 | 1 | 0 | 0 | 15 | 11 |
VMOVAPD %ZMM29,%ZMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMADD213PD %ZMM19,%ZMM18,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM20,%ZMM29,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM21,%ZMM29,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM28,%ZMM23,%ZMM22 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VSCATTERQPD %ZMM22,(%R9,%ZMM26,8){%K1} | 27 | 0 | 0 | 4 | 4 | 8 | 1 | 0 | 0 | 15 | 11 |
ADD $0x8,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %RSI,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JB 41b000 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
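To tie the instruction table back to the source, the following is a minimal scalar sketch of what each of the eight vector lanes computes per iteration. It is an illustration only: the struct layout is a hypothetical stand-in, dSquareDeltaRinv is assumed to equal DeltaRInv squared, and the last sCoef3 term of valArray is inferred from the pattern since the quoted source stops at line 335.

// Hypothetical stand-in for the functor's members; not the real class layout.
struct BsplineKernelSketch {
  double DeltaRInv;              // 1 / spline grid spacing
  double A[16], dA[16], d2A[16]; // cubic B-spline basis coefficients
  const double* SplineCoefs;     // spline coefficient table

  // One scalar lane of the BsplineFunctor.h:302-335 loop body.
  void evaluateOne(double r, int iScatter,
                   double* valArray, double* gradArray, double* laplArray) const
  {
    const double rinv = 1.0 / r;             // 1/r surfaces as the loop's single VDIVPD,
                                             // applied when gradArray is formed
    r *= DeltaRInv;                          // VMULPD with the broadcast DeltaRInv
    const int iGather = static_cast<int>(r); // VCVTTPD2DQ
    const double t    = r - static_cast<double>(iGather); // VRNDSCALEPD + VSUBPD
    const double tp0 = t * t * t, tp1 = t * t, tp2 = t;

    // Four neighbouring coefficients -> the four VGATHERDPD instructions.
    const double c0 = SplineCoefs[iGather + 0];
    const double c1 = SplineCoefs[iGather + 1];
    const double c2 = SplineCoefs[iGather + 2];
    const double c3 = SplineCoefs[iGather + 3];

    // Three polynomial accumulations -> the FMA chains on ports P0/P5;
    // the three indexed stores -> the three VSCATTERQPD instructions.
    laplArray[iScatter] = (DeltaRInv * DeltaRInv) *   // assuming dSquareDeltaRinv == DeltaRInv^2
        (c0*(d2A[ 2]*tp2 + d2A[ 3]) + c1*(d2A[ 6]*tp2 + d2A[ 7]) +
         c2*(d2A[10]*tp2 + d2A[11]) + c3*(d2A[14]*tp2 + d2A[15]));

    gradArray[iScatter] = DeltaRInv * rinv *
        (c0*(dA[ 1]*tp1 + dA[ 2]*tp2 + dA[ 3]) + c1*(dA[ 5]*tp1 + dA[ 6]*tp2 + dA[ 7]) +
         c2*(dA[ 9]*tp1 + dA[10]*tp2 + dA[11]) + c3*(dA[13]*tp1 + dA[14]*tp2 + dA[15]));

    valArray[iScatter] =
        (c0*(A[ 0]*tp0 + A[ 1]*tp1 + A[ 2]*tp2 + A[ 3]) +
         c1*(A[ 4]*tp0 + A[ 5]*tp1 + A[ 6]*tp2 + A[ 7]) +
         c2*(A[ 8]*tp0 + A[ 9]*tp1 + A[10]*tp2 + A[11]) +
         c3*(A[12]*tp0 + A[13]*tp1 + A[14]*tp2 + A[15])); // last term inferred from the pattern
  }
};

Per 8-lane iteration this is four gathers, three scatters and one divide, which lines up with the four VGATHERDPD and three VSCATTERQPD rows above and with the 16 "Cycles executing div or sqrt instructions" contributed by the single VDIVPD (reciprocal throughput 16).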