Loop Id: 971 | Module: exec | Source: ParticleBConds.h:185-217 | Coverage: 11.03% |
---|
Loop Id: 971 | Module: exec | Source: ParticleBConds.h:185-217 | Coverage: 11.03% |
---|
0x457b00 LEA (%R9,%RDX,1),%EAX |
0x457b04 VPBROADCASTD %EAX,%YMM0 |
0x457b0a VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 [6] |
0x457b12 VPCMPGTD %YMM0,%YMM5,%K1 |
0x457b18 CLTQ |
0x457b1a VMOVUPD (%R12,%RAX,8),%ZMM0 [10] |
0x457b21 VBROADCASTSD 0x36505(%RIP),%ZMM1 [6] |
0x457b2b VBROADCASTSD 0x364f3(%RIP),%ZMM2 [6] |
0x457b35 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} |
0x457b3b VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 [9] |
0x457b43 VMOVUPD (%R15,%RAX,8),%ZMM30 [7] |
0x457b4a VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 [9] |
0x457b52 VMULPD %ZMM29,%ZMM0,%ZMM31 |
0x457b58 VMULPD %ZMM29,%ZMM30,%ZMM30 |
0x457b5e VMOVUPD (%RSI,%RAX,8),%ZMM0 [1] |
0x457b65 VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 [9] |
0x457b6d VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 [9] |
0x457b75 VMULPD %ZMM29,%ZMM0,%ZMM2 |
0x457b7b VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 [9] |
0x457b83 VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 [9] |
0x457b8b VMULPD 0x600(%RSP),%ZMM31,%ZMM3 [9] |
0x457b93 VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 [9] |
0x457b9b VRNDSCALEPD $0x9,%ZMM1,%ZMM0 |
0x457ba2 VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 [9] |
0x457baa VMULPD 0x540(%RSP),%ZMM31,%ZMM1 [9] |
0x457bb2 VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 [9] |
0x457bba VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 [9] |
0x457bc2 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 |
0x457bc9 VRNDSCALEPD $0x9,%ZMM1,%ZMM1 |
0x457bd0 VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 [9] |
0x457bd8 VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 [9] |
0x457be0 VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 [9] |
0x457be8 VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 [9] |
0x457bf0 VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 [9] |
0x457bf8 VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 [9] |
0x457c00 VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 [9] |
0x457c08 VFMADD231PD %ZMM3,%ZMM6,%ZMM0 |
0x457c0e VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 |
0x457c14 VMULPD %ZMM31,%ZMM31,%ZMM1 |
0x457c1a VFMADD231PD %ZMM30,%ZMM30,%ZMM1 |
0x457c20 VADDPD %ZMM31,%ZMM8,%ZMM2 |
0x457c26 VADDPD %ZMM30,%ZMM9,%ZMM3 |
0x457c2c VADDPD %ZMM0,%ZMM10,%ZMM4 |
0x457c32 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457c38 VFMADD231PD %ZMM0,%ZMM0,%ZMM1 |
0x457c3e VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x457c44 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457c4a VCMPPD $0x1,%ZMM1,%ZMM2,%K0 |
0x457c51 VADDPD %ZMM31,%ZMM11,%ZMM3 |
0x457c57 VADDPD %ZMM30,%ZMM12,%ZMM4 |
0x457c5d VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457c63 VADDPD %ZMM0,%ZMM13,%ZMM2 |
0x457c69 VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x457c6f VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x457c75 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x457c7b VADDPD %ZMM31,%ZMM14,%ZMM2 |
0x457c81 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 |
0x457c88 VADDPD %ZMM30,%ZMM15,%ZMM4 |
0x457c8e VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457c94 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457c9a VADDPD %ZMM0,%ZMM16,%ZMM4 |
0x457ca0 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457ca6 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x457cac VCMPPD $0x1,%ZMM1,%ZMM2,%K2 |
0x457cb3 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457cb9 VADDPD %ZMM31,%ZMM17,%ZMM2 |
0x457cbf VADDPD %ZMM30,%ZMM18,%ZMM3 |
0x457cc5 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457ccb VADDPD %ZMM0,%ZMM19,%ZMM4 |
0x457cd1 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x457cd7 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457cdd VCMPPD $0x1,%ZMM1,%ZMM2,%K3 |
0x457ce4 VADDPD %ZMM31,%ZMM20,%ZMM3 |
0x457cea VADDPD %ZMM30,%ZMM21,%ZMM4 |
0x457cf0 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457cf6 VADDPD %ZMM0,%ZMM22,%ZMM2 |
0x457cfc VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x457d02 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x457d08 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x457d0e VADDPD %ZMM31,%ZMM23,%ZMM2 |
0x457d14 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 |
0x457d1b VADDPD %ZMM30,%ZMM24,%ZMM4 |
0x457d21 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457d27 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457d2d VADDPD %ZMM0,%ZMM25,%ZMM4 |
0x457d33 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457d39 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x457d3f VADDPD %ZMM31,%ZMM26,%ZMM3 |
0x457d45 VADDPD %ZMM30,%ZMM27,%ZMM4 |
0x457d4b VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x457d51 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x457d57 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 |
0x457d5e VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457d64 VADDPD %ZMM0,%ZMM28,%ZMM2 |
0x457d6a VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x457d70 VMINPD %ZMM1,%ZMM3,%ZMM2 |
0x457d76 VSQRTPD %ZMM2,%ZMM2 |
0x457d7c VCMPPD $0x1,%ZMM1,%ZMM3,%K6 |
0x457d83 VPMOVM2Q %K0,%ZMM1 |
0x457d89 VPSRLQ $0x3f,%ZMM1,%ZMM1 |
0x457d90 VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} [6] |
0x457d9a VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} [6] |
0x457da4 VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} [6] |
0x457dae VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} [6] |
0x457db8 VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} [6] |
0x457dc2 VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} [6] |
0x457dcc KXNORW %K0,%K0,%K1 |
0x457dd0 VMOVUPD %ZMM2,(%RCX,%RAX,8) [5] |
0x457dd7 VXORPD %XMM2,%XMM2,%XMM2 |
0x457ddb VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} [4] |
0x457de3 VADDPD %ZMM31,%ZMM2,%ZMM2 |
0x457de9 VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x457def KXNORW %K0,%K0,%K1 |
0x457df3 VMOVUPD %ZMM2,(%R10,%RAX,8) [2] |
0x457dfa VXORPD %XMM2,%XMM2,%XMM2 |
0x457dfe VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} [4] |
0x457e06 VADDPD %ZMM30,%ZMM2,%ZMM2 |
0x457e0c VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x457e12 KXNORW %K0,%K0,%K1 |
0x457e16 VMOVUPD %ZMM2,(%R8,%RAX,8) [8] |
0x457e1d VXORPD %XMM2,%XMM2,%XMM2 |
0x457e21 VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} [4] |
0x457e29 VADDPD %ZMM0,%ZMM2,%ZMM0 |
0x457e2f VMULPD %ZMM29,%ZMM0,%ZMM0 |
0x457e35 VMOVUPD %ZMM0,(%R11,%RAX,8) [3] |
0x457e3c ADD $0x8,%EDX |
0x457e3f CMP %EBX,%EDX |
0x457e41 JBE 457b00 |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds.h: 185 - 217 |
-------------------------------------------------------------------------------- |
185: #pragma omp simd aligned(temp_r, px, py, pz, dx, dy, dz: QMC_SIMD_ALIGNMENT) |
186: for (int iat = first; iat < last; ++iat) |
187: { |
188: const T flip = iat < flip_ind ? one : minusone; |
189: const T displ_0 = (px[iat] - x0) * flip; |
190: const T displ_1 = (py[iat] - y0) * flip; |
191: const T displ_2 = (pz[iat] - z0) * flip; |
192: |
193: const T ar_0 = -std::floor(displ_0 * g00 + displ_1 * g10 + displ_2 * g20); |
194: const T ar_1 = -std::floor(displ_0 * g01 + displ_1 * g11 + displ_2 * g21); |
195: const T ar_2 = -std::floor(displ_0 * g02 + displ_1 * g12 + displ_2 * g22); |
196: |
197: const T delx = displ_0 + ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
198: const T dely = displ_1 + ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
199: const T delz = displ_2 + ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
200: |
201: T rmin = delx * delx + dely * dely + delz * delz; |
202: int ic = 0; |
203: #pragma unroll(7) |
204: for (int c = 1; c < 8; ++c) |
205: { |
206: const T x = delx + cellx[c]; |
207: const T y = dely + celly[c]; |
208: const T z = delz + cellz[c]; |
209: const T r2 = x * x + y * y + z * z; |
210: ic = (r2 < rmin) ? c : ic; |
211: rmin = (r2 < rmin) ? r2 : rmin; |
212: } |
213: |
214: temp_r[iat] = std::sqrt(rmin); |
215: dx[iat] = flip * (delx + cellx[ic]); |
216: dy[iat] = flip * (dely + celly[ic]); |
217: dz[iat] = flip * (delz + cellz[ic]); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►53.33+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:83 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | miniqmc.cpp:482 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►20.00+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:83 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►13.33+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:119 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | miniqmc.cpp:482 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►6.67+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:69 | exec |
○ | qmcplusplus::ParticleSet::upda[...] | ParticleSet.cpp:250 | exec |
○ | main.extracted.107 | miniqmc.cpp:390 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:374 | exec |
○ | __libc_init_first | libc.so.6 | |
►6.67+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:76 | exec |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.03 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.49 |
Bottlenecks | P0, P5, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:185-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 56.00 |
CQA cycles if no scalar integer | 54.50 |
CQA cycles if FP arith vectorized | 56.00 |
CQA cycles if fully vectorized | 55.50 |
Front-end cycles | 35.25 |
DIV/SQRT cycles | 18.00 - 24.00 |
P0 cycles | 56.00 |
P1 cycles | 37.50 |
P2 cycles | 27.50 |
P3 cycles | 27.50 |
P4 cycles | 4.00 |
P5 cycles | 56.00 |
P6 cycles | 2.50 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 113.75 - 119.42 |
Stall cycles (UFS) | 79.51 - 85.18 |
Nb insns | 128.00 |
Nb uops | 141.00 |
Nb loads | 34.00 |
Nb stores | 4.00 |
Nb stack references | 19.00 |
FLOP/cycle | 15.29 |
Nb FLOP add-sub | 216.00 |
Nb FLOP mul | 136.00 |
Nb FLOP fma | 248.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 8.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 34.86 |
Bytes prefetched | 0.00 |
Bytes loaded | 1696.00 |
Bytes stored | 256.00 |
Stride 0 | 9.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 90.91 |
Vectorization ratio load | 76.47 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 70.27 |
Vector-efficiency ratio all | 89.22 |
Vector-efficiency ratio load | 77.94 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 98.21 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 66.09 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.03 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.49 |
Bottlenecks | P0, P5, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:185-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 56.00 |
CQA cycles if no scalar integer | 54.50 |
CQA cycles if FP arith vectorized | 56.00 |
CQA cycles if fully vectorized | 55.50 |
Front-end cycles | 35.25 |
DIV/SQRT cycles | 18.00 - 24.00 |
P0 cycles | 56.00 |
P1 cycles | 37.50 |
P2 cycles | 27.50 |
P3 cycles | 27.50 |
P4 cycles | 4.00 |
P5 cycles | 56.00 |
P6 cycles | 2.50 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 113.75 - 119.42 |
Stall cycles (UFS) | 79.51 - 85.18 |
Nb insns | 128.00 |
Nb uops | 141.00 |
Nb loads | 34.00 |
Nb stores | 4.00 |
Nb stack references | 19.00 |
FLOP/cycle | 15.29 |
Nb FLOP add-sub | 216.00 |
Nb FLOP mul | 136.00 |
Nb FLOP fma | 248.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 8.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 34.86 |
Bytes prefetched | 0.00 |
Bytes loaded | 1696.00 |
Bytes stored | 256.00 |
Stride 0 | 9.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 90.91 |
Vectorization ratio load | 76.47 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 70.27 |
Vector-efficiency ratio all | 89.22 |
Vector-efficiency ratio load | 77.94 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 98.21 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 66.09 |
Path / |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:185-217 |
Module | exec |
nb instructions | 128 |
nb uops | 141 |
loop length | 839 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 2 |
used zmm registers | 31 |
nb stack references | 19 |
ADD-SUB / MUL ratio | 1.59 |
micro-operation queue | 35.25 cycles |
front end | 35.25 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 56.00 | 2.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
cycles | 56.00 | 37.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
Cycles executing div or sqrt instructions | 18.00-24.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 113.75-119.42 |
Stall cycles | 79.51-85.18 |
RS full (events) | 1.21-1.23 |
PRF_FLOAT full (events) | 81.49-87.17 |
Front-end | 35.25 |
Dispatch | 56.00 |
DIV/SQRT | 18.00-24.00 |
Data deps. | 0.00 |
Overall L1 | 56.00 |
Vectorization ratio, integer (INT) instructions [heading inferred] |
all | 25% |
load | 14% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
Vectorization ratio, floating-point (FP) instructions [heading inferred] |
all | 98% |
load | 92% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 92% |
Vectorization ratio, all (INT+FP) instructions [heading inferred] |
all | 90% |
load | 76% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 70% |
Vector-efficiency ratio, integer (INT) instructions [heading inferred] |
all | 24% |
load | 17% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
Vector-efficiency ratio, floating-point (FP) instructions [heading inferred] |
all | 96% |
load | 93% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 84% |
Vector-efficiency ratio, all (INT+FP) instructions [heading inferred] |
all | 89% |
load | 77% |
store | 100% |
mul | 100% |
add-sub | 98% |
fma | 100% |
div/sqrt | 100% |
other | 66% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
LEA (%R9,%RDX,1),%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPCMPGTD %YMM0,%YMM5,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CLTQ | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
VMOVUPD (%R12,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VBROADCASTSD 0x36505(%RIP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSD 0x364f3(%RIP),%ZMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%R15,%RAX,8),%ZMM30 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%RSI,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x600(%RSP),%ZMM31,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x540(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM31,%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM8,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM9,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM10,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM11,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM12,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM13,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM14,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM15,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM16,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM17,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM18,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM19,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM20,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM21,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM22,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM23,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM24,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM25,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM26,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM27,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM28,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VSQRTPD %ZMM2,%ZMM2 | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 24-33 | 18-24 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VPMOVM2Q %K0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPSRLQ $0x3f,%ZMM1,%ZMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%RCX,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM31,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R10,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM30,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R8,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM0,%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%R11,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
ADD $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %EBX,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JBE 457b00 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:185-217 |
Module | exec |
nb instructions | 128 |
nb uops | 141 |
loop length | 839 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 2 |
used zmm registers | 31 |
nb stack references | 19 |
ADD-SUB / MUL ratio | 1.59 |
micro-operation queue | 35.25 cycles |
front end | 35.25 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 56.00 | 2.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
cycles | 56.00 | 37.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
Cycles executing div or sqrt instructions | 18.00-24.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 113.75-119.42 |
Stall cycles | 79.51-85.18 |
RS full (events) | 1.21-1.23 |
PRF_FLOAT full (events) | 81.49-87.17 |
Front-end | 35.25 |
Dispatch | 56.00 |
DIV/SQRT | 18.00-24.00 |
Data deps. | 0.00 |
Overall L1 | 56.00 |
Vectorization ratio, integer (INT) instructions [heading inferred] |
all | 25% |
load | 14% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
Vectorization ratio, floating-point (FP) instructions [heading inferred] |
all | 98% |
load | 92% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 92% |
Vectorization ratio, all (INT+FP) instructions [heading inferred] |
all | 90% |
load | 76% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 70% |
Vector-efficiency ratio, integer (INT) instructions [heading inferred] |
all | 24% |
load | 17% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
Vector-efficiency ratio, floating-point (FP) instructions [heading inferred] |
all | 96% |
load | 93% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 84% |
Vector-efficiency ratio, all (INT+FP) instructions [heading inferred] |
all | 89% |
load | 77% |
store | 100% |
mul | 100% |
add-sub | 98% |
fma | 100% |
div/sqrt | 100% |
other | 66% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
LEA (%R9,%RDX,1),%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPCMPGTD %YMM0,%YMM5,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CLTQ | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
VMOVUPD (%R12,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VBROADCASTSD 0x36505(%RIP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSD 0x364f3(%RIP),%ZMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%R15,%RAX,8),%ZMM30 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%RSI,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x600(%RSP),%ZMM31,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x540(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM31,%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM8,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM9,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM10,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM11,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM12,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM13,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM14,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM15,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM16,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM17,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM18,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM19,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM20,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM21,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM22,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM23,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM24,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM25,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM26,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM27,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM28,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VSQRTPD %ZMM2,%ZMM2 | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 24-33 | 18-24 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VPMOVM2Q %K0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPSRLQ $0x3f,%ZMM1,%ZMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%RCX,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM31,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R10,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM30,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R8,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM0,%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%R11,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
ADD $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %EBX,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JBE 457b00 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Metric | run_0 |
---|---|
Coverage (% app. time) | 11.03 |
Time (s) | 0.07 |
Instance Count | 47712 |
Iteration Count - min | 8 |
Iteration Count - avg | 52.71 |
Iteration Count - max | 96 |
Cycles per Iteration - min | 84.5 |
Cycles per Iteration - avg | 89.42 |
Cycles per Iteration - max | 1615.5 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 98.69 |
Instance Count | 47712 |
ORIG CPI:min | 90.38 |
ORIG CPI:med | 92.44 |
ORIG CPI:max | 100.15 |
DL1 CPI:min | 82.77 |
DL1 CPI:med | 84.15 |
DL1 CPI:max | 87.44 |
ORIG (min) / DL1 (min) | 1.09 |
ORIG (med) / DL1 (med) | 1.10 |
ORIG (max) / DL1 (max) | 1.15 |
Nb Iteration:min | 96 |
Nb Iteration:med | 96.00 |
Nb Iteration:max | 96 |
ORIG: min (cycles) | 8676 |
ORIG: med (cycles) | 8874.00 |
ORIG: max (cycles) | 9614 |
DL1:min (cycles) | 7946 |
DL1:med (cycles) | 8078.00 |
DL1:max (cycles) | 8394 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 1.07 |
Instance Count | 47712 |
ORIG CPI:min | 92.00 |
ORIG CPI:med | 108.00 |
ORIG CPI:max | 178.00 |
DL1 CPI:min | 83.15 |
DL1 CPI:med | 92.50 |
DL1 CPI:max | 121.50 |
ORIG (min) / DL1 (min) | 1.12 |
ORIG (med) / DL1 (med) | 1.75 |
ORIG (max) / DL1 (max) | 1.88 |
Nb Iteration:min | 96 |
Nb Iteration:med | 96.00 |
Nb Iteration:max | 96 |
ORIG: min (cycles) | 746 |
ORIG: med (cycles) | 1372.00 |
ORIG: max (cycles) | 15336 |
DL1:min (cycles) | 666 |
DL1:med (cycles) | 786.00 |
DL1:max (cycles) | 8158 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 8874.00 | 8874.00 | 8874.00 | 8874.00 | 8676.00 | 8874.00 | 9614.00 | 8078.00 | 8078.00 | 8078.00 | 8078.00 | 7946.00 | 8078.00 | 8394.00 |
CPI MIN | 90.38 | 82.77 | ||||||||||||
CPI MED | 92.44 | 92.44 | 92.44 | 92.44 | 90.38 | 92.44 | 100.15 | 84.15 | 84.15 | 84.15 | 84.15 | 82.77 | 84.15 | 87.44 |
CPI AVG | 92.79 | 84.16 | ||||||||||||
CPI MAX | 100.15 | 87.44 | ||||||||||||
Iteration Count | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 1372.00 | 1372.00 | 1372.00 | 1372.00 | 746.00 | 1372.00 | 15336.00 | 786.00 | 786.00 | 786.00 | 786.00 | 666.00 | 786.00 | 8158.00 |
CPI MIN | 92.00 | 83.15 | ||||||||||||
CPI MED | 108.00 | 108.00 | 108.00 | 108.00 | 92.00 | 108.00 | 178.00 | 92.50 | 92.50 | 92.50 | 92.50 | 83.15 | 92.50 | 121.50 |
CPI AVG | 120.60 | 92.92 | ||||||||||||
CPI MAX | 178.00 | 121.50 | ||||||||||||
Iteration Count | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 | 96.00 |
ORIG | DL1 | Original Code |
---|---|---|
0x4fe3a6 ADDQ $0x1,-0x726e(%RIP) 0x4fe3ae LEA (%R9,%RDX,1),%EAX | 0x4fed80 LEA (%R9,%RDX,1),%EAX | 0x457b00 LEA (%R9,%RDX,1),%EAX |
0x4fe3b2 VPBROADCASTD %EAX,%YMM0 | 0x4fed84 VPBROADCASTD %EAX,%YMM0 | 0x457b04 VPBROADCASTD %EAX,%YMM0 |
0x4fe3b8 VPADDD -0x68580(%RIP),%YMM0,%YMM0 | 0x4fed8a VPADDD -0x68f52(%RIP),%YMM0,%YMM0 | 0x457b0a VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 |
0x4fe3c0 VPCMPGTD %YMM0,%YMM5,%K1 | 0x4fed92 VPCMPGTD %YMM0,%YMM5,%K1 | 0x457b12 VPCMPGTD %YMM0,%YMM5,%K1 |
0x4fe3c6 CLTQ | 0x4fed98 CLTQ | 0x457b18 CLTQ |
0x4fe3c8 VMOVUPD (%R12,%RAX,8),%ZMM0 | 0x4fed9a VMOVUPD -0x9324(%RIP),%ZMM0 | 0x457b1a VMOVUPD (%R12,%RAX,8),%ZMM0 |
0x4fe3cf VBROADCASTSD -0x703a9(%RIP),%ZMM1 | 0x4feda4 VBROADCASTSD -0x70d7e(%RIP),%ZMM1 | 0x457b21 VBROADCASTSD 0x36505(%RIP),%ZMM1 |
0x4fe3d9 VBROADCASTSD -0x703bb(%RIP),%ZMM2 | 0x4fedae VBROADCASTSD -0x70d90(%RIP),%ZMM2 | 0x457b2b VBROADCASTSD 0x364f3(%RIP),%ZMM2 |
0x4fe3e3 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 0x4fedb8 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 0x457b35 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} |
0x4fe3e9 VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 | 0x4fedbe VSUBPD -0x9a48(%RIP),%ZMM0,%ZMM0 | 0x457b3b VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 |
0x4fe3f1 VMOVUPD (%R15,%RAX,8),%ZMM30 | 0x4fedc8 VMOVUPD -0x9352(%RIP),%ZMM30 | 0x457b43 VMOVUPD (%R15,%RAX,8),%ZMM30 |
0x4fe3f8 VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 | 0x4fedd2 VSUBPD -0x9a1c(%RIP),%ZMM30,%ZMM30 | 0x457b4a VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 |
0x4fe400 VMULPD %ZMM29,%ZMM0,%ZMM31 | 0x4feddc VMULPD %ZMM29,%ZMM0,%ZMM31 | 0x457b52 VMULPD %ZMM29,%ZMM0,%ZMM31 |
0x4fe406 VMULPD %ZMM29,%ZMM30,%ZMM30 | 0x4fede2 VMULPD %ZMM29,%ZMM30,%ZMM30 | 0x457b58 VMULPD %ZMM29,%ZMM30,%ZMM30 |
0x4fe40c VMOVUPD (%RSI,%RAX,8),%ZMM0 | 0x4fede8 VMOVUPD -0x9372(%RIP),%ZMM0 | 0x457b5e VMOVUPD (%RSI,%RAX,8),%ZMM0 |
0x4fe413 VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 | 0x4fedf2 VSUBPD -0x99fc(%RIP),%ZMM0,%ZMM0 | 0x457b65 VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 |
0x4fe41b VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 | 0x4fedfc VMULPD -0x99c6(%RIP),%ZMM31,%ZMM1 | 0x457b6d VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 |
0x4fe423 VMULPD %ZMM29,%ZMM0,%ZMM2 | 0x4fee06 VMULPD %ZMM29,%ZMM0,%ZMM2 | 0x457b75 VMULPD %ZMM29,%ZMM0,%ZMM2 |
0x4fe429 VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 | 0x4fee0c VFMADD231PD -0x9996(%RIP),%ZMM30,%ZMM1 | 0x457b7b VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 |
0x4fe431 VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 | 0x4fee16 VFMADD231PD -0x9960(%RIP),%ZMM2,%ZMM1 | 0x457b83 VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 |
0x4fe439 VMULPD 0x600(%RSP),%ZMM31,%ZMM3 | 0x4fee20 VMULPD -0x992a(%RIP),%ZMM31,%ZMM3 | 0x457b8b VMULPD 0x600(%RSP),%ZMM31,%ZMM3 |
0x4fe441 VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 | 0x4fee2a VFMADD231PD -0x98f4(%RIP),%ZMM30,%ZMM3 | 0x457b93 VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 |
0x4fe449 VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 0x4fee34 VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 0x457b9b VRNDSCALEPD $0x9,%ZMM1,%ZMM0 |
0x4fe450 VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 | 0x4fee3b VFMADD231PD -0x98c5(%RIP),%ZMM2,%ZMM3 | 0x457ba2 VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 |
0x4fe458 VMULPD 0x540(%RSP),%ZMM31,%ZMM1 | 0x4fee45 VMULPD -0x988f(%RIP),%ZMM31,%ZMM1 | 0x457baa VMULPD 0x540(%RSP),%ZMM31,%ZMM1 |
0x4fe460 VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 | 0x4fee4f VFMADD231PD -0x9859(%RIP),%ZMM30,%ZMM1 | 0x457bb2 VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 |
0x4fe468 VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 | 0x4fee59 VFMADD231PD -0x9823(%RIP),%ZMM2,%ZMM1 | 0x457bba VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 |
0x4fe470 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 0x4fee63 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 0x457bc2 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 |
0x4fe477 VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 0x4fee6a VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 0x457bc9 VRNDSCALEPD $0x9,%ZMM1,%ZMM1 |
0x4fe47e VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 | 0x4fee71 VFMSUB231PD -0x97fb(%RIP),%ZMM0,%ZMM31 | 0x457bd0 VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 |
0x4fe486 VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 | 0x4fee7b VFMADD231PD -0x97c5(%RIP),%ZMM3,%ZMM31 | 0x457bd8 VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 |
0x4fe48e VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 | 0x4fee85 VFMSUB231PD -0x978f(%RIP),%ZMM0,%ZMM30 | 0x457be0 VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 |
0x4fe496 VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 | 0x4fee8f VFNMSUB231PD -0x9759(%RIP),%ZMM1,%ZMM31 | 0x457be8 VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 |
0x4fe49e VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 | 0x4fee99 VFMADD231PD -0x9723(%RIP),%ZMM3,%ZMM30 | 0x457bf0 VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 |
0x4fe4a6 VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 | 0x4feea3 VFNMSUB231PD -0x96ed(%RIP),%ZMM1,%ZMM30 | 0x457bf8 VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 |
0x4fe4ae VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 | 0x4feead VFMSUB132PD -0x96b7(%RIP),%ZMM2,%ZMM0 | 0x457c00 VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 |
0x4fe4b6 VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 0x4feeb7 VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 0x457c08 VFMADD231PD %ZMM3,%ZMM6,%ZMM0 |
0x4fe4bc VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 0x4feebd VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 0x457c0e VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 |
0x4fe4c2 VMULPD %ZMM31,%ZMM31,%ZMM1 | 0x4feec3 VMULPD %ZMM31,%ZMM31,%ZMM1 | 0x457c14 VMULPD %ZMM31,%ZMM31,%ZMM1 |
0x4fe4c8 VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 0x4feec9 VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 0x457c1a VFMADD231PD %ZMM30,%ZMM30,%ZMM1 |
0x4fe4ce VADDPD %ZMM31,%ZMM8,%ZMM2 | 0x4feecf VADDPD %ZMM31,%ZMM8,%ZMM2 | 0x457c20 VADDPD %ZMM31,%ZMM8,%ZMM2 |
0x4fe4d4 VADDPD %ZMM30,%ZMM9,%ZMM3 | 0x4feed5 VADDPD %ZMM30,%ZMM9,%ZMM3 | 0x457c26 VADDPD %ZMM30,%ZMM9,%ZMM3 |
0x4fe4da VADDPD %ZMM0,%ZMM10,%ZMM4 | 0x4feedb VADDPD %ZMM0,%ZMM10,%ZMM4 | 0x457c2c VADDPD %ZMM0,%ZMM10,%ZMM4 |
0x4fe4e0 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4feee1 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457c32 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fe4e6 VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 0x4feee7 VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 0x457c38 VFMADD231PD %ZMM0,%ZMM0,%ZMM1 |
0x4fe4ec VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x4feeed VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x457c3e VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x4fe4f2 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4feef3 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457c44 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fe4f8 VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 0x4feef9 VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 0x457c4a VCMPPD $0x1,%ZMM1,%ZMM2,%K0 |
0x4fe4ff VADDPD %ZMM31,%ZMM11,%ZMM3 | 0x4fef00 VADDPD %ZMM31,%ZMM11,%ZMM3 | 0x457c51 VADDPD %ZMM31,%ZMM11,%ZMM3 |
0x4fe505 VADDPD %ZMM30,%ZMM12,%ZMM4 | 0x4fef06 VADDPD %ZMM30,%ZMM12,%ZMM4 | 0x457c57 VADDPD %ZMM30,%ZMM12,%ZMM4 |
0x4fe50b VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fef0c VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457c5d VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fe511 VADDPD %ZMM0,%ZMM13,%ZMM2 | 0x4fef12 VADDPD %ZMM0,%ZMM13,%ZMM2 | 0x457c63 VADDPD %ZMM0,%ZMM13,%ZMM2 |
0x4fe517 VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x4fef18 VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x457c69 VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x4fe51d VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x4fef1e VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x457c6f VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x4fe523 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x4fef24 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x457c75 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x4fe529 VADDPD %ZMM31,%ZMM14,%ZMM2 | 0x4fef2a VADDPD %ZMM31,%ZMM14,%ZMM2 | 0x457c7b VADDPD %ZMM31,%ZMM14,%ZMM2 |
0x4fe52f VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 0x4fef30 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 0x457c81 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 |
0x4fe536 VADDPD %ZMM30,%ZMM15,%ZMM4 | 0x4fef37 VADDPD %ZMM30,%ZMM15,%ZMM4 | 0x457c88 VADDPD %ZMM30,%ZMM15,%ZMM4 |
0x4fe53c VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fef3d VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457c8e VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fe542 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fef43 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457c94 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fe548 VADDPD %ZMM0,%ZMM16,%ZMM4 | 0x4fef49 VADDPD %ZMM0,%ZMM16,%ZMM4 | 0x457c9a VADDPD %ZMM0,%ZMM16,%ZMM4 |
0x4fe54e VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fef4f VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457ca0 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fe554 VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x4fef55 VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x457ca6 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x4fe55a VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 0x4fef5b VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 0x457cac VCMPPD $0x1,%ZMM1,%ZMM2,%K2 |
0x4fe561 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fef62 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457cb3 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fe567 VADDPD %ZMM31,%ZMM17,%ZMM2 | 0x4fef68 VADDPD %ZMM31,%ZMM17,%ZMM2 | 0x457cb9 VADDPD %ZMM31,%ZMM17,%ZMM2 |
0x4fe56d VADDPD %ZMM30,%ZMM18,%ZMM3 | 0x4fef6e VADDPD %ZMM30,%ZMM18,%ZMM3 | 0x457cbf VADDPD %ZMM30,%ZMM18,%ZMM3 |
0x4fe573 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fef74 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457cc5 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fe579 VADDPD %ZMM0,%ZMM19,%ZMM4 | 0x4fef7a VADDPD %ZMM0,%ZMM19,%ZMM4 | 0x457ccb VADDPD %ZMM0,%ZMM19,%ZMM4 |
0x4fe57f VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x4fef80 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x457cd1 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x4fe585 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fef86 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457cd7 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fe58b VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 0x4fef8c VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 0x457cdd VCMPPD $0x1,%ZMM1,%ZMM2,%K3 |
0x4fe592 VADDPD %ZMM31,%ZMM20,%ZMM3 | 0x4fef93 VADDPD %ZMM31,%ZMM20,%ZMM3 | 0x457ce4 VADDPD %ZMM31,%ZMM20,%ZMM3 |
0x4fe598 VADDPD %ZMM30,%ZMM21,%ZMM4 | 0x4fef99 VADDPD %ZMM30,%ZMM21,%ZMM4 | 0x457cea VADDPD %ZMM30,%ZMM21,%ZMM4 |
0x4fe59e VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fef9f VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457cf0 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fe5a4 VADDPD %ZMM0,%ZMM22,%ZMM2 | 0x4fefa5 VADDPD %ZMM0,%ZMM22,%ZMM2 | 0x457cf6 VADDPD %ZMM0,%ZMM22,%ZMM2 |
0x4fe5aa VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x4fefab VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x457cfc VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x4fe5b0 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x4fefb1 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x457d02 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x4fe5b6 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x4fefb7 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x457d08 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x4fe5bc VADDPD %ZMM31,%ZMM23,%ZMM2 | 0x4fefbd VADDPD %ZMM31,%ZMM23,%ZMM2 | 0x457d0e VADDPD %ZMM31,%ZMM23,%ZMM2 |
0x4fe5c2 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 0x4fefc3 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 0x457d14 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 |
0x4fe5c9 VADDPD %ZMM30,%ZMM24,%ZMM4 | 0x4fefca VADDPD %ZMM30,%ZMM24,%ZMM4 | 0x457d1b VADDPD %ZMM30,%ZMM24,%ZMM4 |
0x4fe5cf VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fefd0 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457d21 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fe5d5 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fefd6 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457d27 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fe5db VADDPD %ZMM0,%ZMM25,%ZMM4 | 0x4fefdc VADDPD %ZMM0,%ZMM25,%ZMM4 | 0x457d2d VADDPD %ZMM0,%ZMM25,%ZMM4 |
0x4fe5e1 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fefe2 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457d33 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fe5e7 VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x4fefe8 VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x457d39 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x4fe5ed VADDPD %ZMM31,%ZMM26,%ZMM3 | 0x4fefee VADDPD %ZMM31,%ZMM26,%ZMM3 | 0x457d3f VADDPD %ZMM31,%ZMM26,%ZMM3 |
0x4fe5f3 VADDPD %ZMM30,%ZMM27,%ZMM4 | 0x4feff4 VADDPD %ZMM30,%ZMM27,%ZMM4 | 0x457d45 VADDPD %ZMM30,%ZMM27,%ZMM4 |
0x4fe5f9 VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x4feffa VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x457d4b VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x4fe5ff VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x4ff000 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x457d51 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x4fe605 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 0x4ff006 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 0x457d57 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 |
0x4fe60c VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4ff00d VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457d5e VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fe612 VADDPD %ZMM0,%ZMM28,%ZMM2 | 0x4ff013 VADDPD %ZMM0,%ZMM28,%ZMM2 | 0x457d64 VADDPD %ZMM0,%ZMM28,%ZMM2 |
0x4fe618 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x4ff019 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x457d6a VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x4fe61e VMINPD %ZMM1,%ZMM3,%ZMM2 | 0x4ff01f VMINPD %ZMM1,%ZMM3,%ZMM2 | 0x457d70 VMINPD %ZMM1,%ZMM3,%ZMM2 |
0x4fe624 VSQRTPD %ZMM2,%ZMM2 | 0x4ff025 VSQRTPD -0x962f(%RIP),%ZMM2 | 0x457d76 VSQRTPD %ZMM2,%ZMM2 |
0x4fe62a VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 0x4ff02f VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 0x457d7c VCMPPD $0x1,%ZMM1,%ZMM3,%K6 |
0x4fe631 VPMOVM2Q %K0,%ZMM1 | 0x4ff036 VPMOVM2Q %K0,%ZMM1 | 0x457d83 VPMOVM2Q %K0,%ZMM1 |
0x4fe637 VPSRLQ $0x3f,%ZMM1,%ZMM1 | 0x4ff03c VPSRLQ $0x3f,%ZMM1,%ZMM1 | 0x457d89 VPSRLQ $0x3f,%ZMM1,%ZMM1 |
0x4fe63e VPBROADCASTQ -0x689e8(%RIP),%ZMM1{%K1} | 0x4ff043 VPBROADCASTQ -0x693ed(%RIP),%ZMM1{%K1} | 0x457d90 VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} |
0x4fe648 VPBROADCASTQ -0x689ea(%RIP),%ZMM1{%K2} | 0x4ff04d VPBROADCASTQ -0x693ef(%RIP),%ZMM1{%K2} | 0x457d9a VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} |
0x4fe652 VPBROADCASTQ -0x689ec(%RIP),%ZMM1{%K3} | 0x4ff057 VPBROADCASTQ -0x693f1(%RIP),%ZMM1{%K3} | 0x457da4 VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} |
0x4fe65c VPBROADCASTQ -0x6c966(%RIP),%ZMM1{%K4} | 0x4ff061 VPBROADCASTQ -0x6d36b(%RIP),%ZMM1{%K4} | 0x457dae VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} |
0x4fe666 VPBROADCASTQ -0x689f8(%RIP),%ZMM1{%K5} | 0x4ff06b VPBROADCASTQ -0x693fd(%RIP),%ZMM1{%K5} | 0x457db8 VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} |
0x4fe670 VPBROADCASTQ -0x689fa(%RIP),%ZMM1{%K6} | 0x4ff075 VPBROADCASTQ -0x693ff(%RIP),%ZMM1{%K6} | 0x457dc2 VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} |
0x4fe67a KXNORW %K0,%K0,%K1 | 0x4ff07f KXNORW %K0,%K0,%K1 | 0x457dcc KXNORW %K0,%K0,%K1 |
0x4fe67e VMOVUPD %ZMM2,(%RCX,%RAX,8) | 0x4ff083 VMOVUPD %ZMM2,-0x95cd(%RIP) 0x4ff08d NOP | 0x457dd0 VMOVUPD %ZMM2,(%RCX,%RAX,8) |
0x4fe685 VXORPD %XMM2,%XMM2,%XMM2 | 0x4ff08e VXORPD %XMM2,%XMM2,%XMM2 | 0x457dd7 VXORPD %XMM2,%XMM2,%XMM2 |
0x4fe689 VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} | 0x4ff092 VMOVUPS -0x961c(%RIP),%ZMM2 | 0x457ddb VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} |
0x4fe691 VADDPD %ZMM31,%ZMM2,%ZMM2 | 0x4ff09c VADDPD %ZMM31,%ZMM2,%ZMM2 | 0x457de3 VADDPD %ZMM31,%ZMM2,%ZMM2 |
0x4fe697 VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x4ff0a2 VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x457de9 VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x4fe69d KXNORW %K0,%K0,%K1 | 0x4ff0a8 KXNORW %K0,%K0,%K1 | 0x457def KXNORW %K0,%K0,%K1 |
0x4fe6a1 VMOVUPD %ZMM2,(%R10,%RAX,8) | 0x4ff0ac VMOVUPD %ZMM2,-0x95b6(%RIP) 0x4ff0b6 NOP | 0x457df3 VMOVUPD %ZMM2,(%R10,%RAX,8) |
0x4fe6a8 VXORPD %XMM2,%XMM2,%XMM2 | 0x4ff0b7 VXORPD %XMM2,%XMM2,%XMM2 | 0x457dfa VXORPD %XMM2,%XMM2,%XMM2 |
0x4fe6ac VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} | 0x4ff0bb VMOVUPS -0x9645(%RIP),%ZMM2 | 0x457dfe VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} |
0x4fe6b4 VADDPD %ZMM30,%ZMM2,%ZMM2 | 0x4ff0c5 VADDPD %ZMM30,%ZMM2,%ZMM2 | 0x457e06 VADDPD %ZMM30,%ZMM2,%ZMM2 |
0x4fe6ba VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x4ff0cb VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x457e0c VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x4fe6c0 KXNORW %K0,%K0,%K1 | 0x4ff0d1 KXNORW %K0,%K0,%K1 | 0x457e12 KXNORW %K0,%K0,%K1 |
0x4fe6c4 VMOVUPD %ZMM2,(%R8,%RAX,8) | 0x4ff0d5 VMOVUPD %ZMM2,-0x959f(%RIP) 0x4ff0df NOP | 0x457e16 VMOVUPD %ZMM2,(%R8,%RAX,8) |
0x4fe6cb VXORPD %XMM2,%XMM2,%XMM2 | 0x4ff0e0 VXORPD %XMM2,%XMM2,%XMM2 | 0x457e1d VXORPD %XMM2,%XMM2,%XMM2 |
0x4fe6cf VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} | 0x4ff0e4 VMOVUPS -0x966e(%RIP),%ZMM2 | 0x457e21 VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} |
0x4fe6d7 VADDPD %ZMM0,%ZMM2,%ZMM0 | 0x4ff0ee VADDPD %ZMM0,%ZMM2,%ZMM0 | 0x457e29 VADDPD %ZMM0,%ZMM2,%ZMM0 |
0x4fe6dd VMULPD %ZMM29,%ZMM0,%ZMM0 | 0x4ff0f4 VMULPD %ZMM29,%ZMM0,%ZMM0 | 0x457e2f VMULPD %ZMM29,%ZMM0,%ZMM0 |
0x4fe6e3 VMOVUPD %ZMM0,(%R11,%RAX,8) | 0x4ff0fa VMOVUPD %ZMM0,-0x9584(%RIP) 0x4ff104 NOP | 0x457e35 VMOVUPD %ZMM0,(%R11,%RAX,8) |
0x4fe6ea ADD $0x8,%EDX | 0x4ff105 ADD $0x8,%EDX | 0x457e3c ADD $0x8,%EDX |
0x4fe6ed CMP %EBX,%EDX | 0x4ff108 CMP %EBX,%EDX | 0x457e3f CMP %EBX,%EDX |
0x4fe6ef JBE 4fe3a6 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii+0xa6d96> | 0x4ff10a JBE 4fed80 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii+0xa7770> | 0x457e41 JBE 457b00 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii+0x4f0> |
Path / |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 15.29, 15.29, | 16.15, 16.15, | 15.29, 15.29, |
cycles L1 CQA | 56.00 | 53.00 | 56.00 |
cycles UFS | 119.52 | 68.49 | 119.42 |
bytes loaded | 1704.00 | 1760.00 | 1696.00 |
bytes stored | 264.00 | 256.00 | 256.00 |
nb loads | 35.00 | 35.00 | 34.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 56.00 | 53.00 | 56.00 |
cycles front end | 35.75 | 34.25 | 35.25 |
cycles P0 | 56.00 | 53.00 | 56.00 |
cycles P1 | 37.50 | 37.50 | 37.50 |
cycles P2 | 28.00 | 17.50 | 27.50 |
cycles P3 | 28.00 | 17.50 | 27.50 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 56.00 | 53.00 | 56.00 |
cycles P6 | 3.00 | 2.50 | 2.50 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 84.77 | 34.09 | 85.18 |
LB full | 0.00 | 0.00 | 0.00 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 87.27 | 0.00 | 87.17 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 1.24 | 61.95 | 1.23 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 143.00 | 137.00 | 141.00 |
uops P0 | 56.00 | 53.00 | 56.00 |
uops P1 | 3.00 | 2.50 | 2.50 |
uops P2 | 28.00 | 17.50 | 27.50 |
uops P3 | 28.00 | 17.50 | 27.50 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 56.00 | 53.00 | 56.00 |
uops P6 | 3.00 | 2.50 | 2.50 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 985 | 987 | 971 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 15.29, 15.29, | 16.15, 16.15, | 15.29, 15.29, |
cycles L1 CQA | 56.00 | 53.00 | 56.00 |
cycles UFS | 119.52 | 68.49 | 119.42 |
bytes loaded | 1704.00 | 1760.00 | 1696.00 |
bytes stored | 264.00 | 256.00 | 256.00 |
nb loads | 35.00 | 35.00 | 34.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 56.00 | 53.00 | 56.00 |
cycles front end | 35.75 | 34.25 | 35.25 |
cycles P0 | 56.00 | 53.00 | 56.00 |
cycles P1 | 37.50 | 37.50 | 37.50 |
cycles P2 | 28.00 | 17.50 | 27.50 |
cycles P3 | 28.00 | 17.50 | 27.50 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 56.00 | 53.00 | 56.00 |
cycles P6 | 3.00 | 2.50 | 2.50 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 84.77 | 34.09 | 85.18 |
LB full | 0.00 | 0.00 | 0.00 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 87.27 | 0.00 | 87.17 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 1.24 | 61.95 | 1.23 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 143.00 | 137.00 | 141.00 |
uops P0 | 56.00 | 53.00 | 56.00 |
uops P1 | 3.00 | 2.50 | 2.50 |
uops P2 | 28.00 | 17.50 | 27.50 |
uops P3 | 28.00 | 17.50 | 27.50 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 56.00 | 53.00 | 56.00 |
uops P6 | 3.00 | 2.50 | 2.50 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 985 | 987 | 971 |