Loop Id: 971 | Module: exec | Source: ParticleBConds.h:185-217 | Coverage: 13.24% |
---|
Loop Id: 971 | Module: exec | Source: ParticleBConds.h:185-217 | Coverage: 13.24% |
---|
0x457b00 LEA (%R9,%RDX,1),%EAX |
0x457b04 VPBROADCASTD %EAX,%YMM0 |
0x457b0a VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 [6] |
0x457b12 VPCMPGTD %YMM0,%YMM5,%K1 |
0x457b18 CLTQ |
0x457b1a VMOVUPD (%R12,%RAX,8),%ZMM0 [10] |
0x457b21 VBROADCASTSD 0x36505(%RIP),%ZMM1 [6] |
0x457b2b VBROADCASTSD 0x364f3(%RIP),%ZMM2 [6] |
0x457b35 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} |
0x457b3b VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 [9] |
0x457b43 VMOVUPD (%R15,%RAX,8),%ZMM30 [7] |
0x457b4a VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 [9] |
0x457b52 VMULPD %ZMM29,%ZMM0,%ZMM31 |
0x457b58 VMULPD %ZMM29,%ZMM30,%ZMM30 |
0x457b5e VMOVUPD (%RSI,%RAX,8),%ZMM0 [1] |
0x457b65 VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 [9] |
0x457b6d VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 [9] |
0x457b75 VMULPD %ZMM29,%ZMM0,%ZMM2 |
0x457b7b VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 [9] |
0x457b83 VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 [9] |
0x457b8b VMULPD 0x600(%RSP),%ZMM31,%ZMM3 [9] |
0x457b93 VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 [9] |
0x457b9b VRNDSCALEPD $0x9,%ZMM1,%ZMM0 |
0x457ba2 VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 [9] |
0x457baa VMULPD 0x540(%RSP),%ZMM31,%ZMM1 [9] |
0x457bb2 VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 [9] |
0x457bba VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 [9] |
0x457bc2 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 |
0x457bc9 VRNDSCALEPD $0x9,%ZMM1,%ZMM1 |
0x457bd0 VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 [9] |
0x457bd8 VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 [9] |
0x457be0 VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 [9] |
0x457be8 VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 [9] |
0x457bf0 VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 [9] |
0x457bf8 VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 [9] |
0x457c00 VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 [9] |
0x457c08 VFMADD231PD %ZMM3,%ZMM6,%ZMM0 |
0x457c0e VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 |
0x457c14 VMULPD %ZMM31,%ZMM31,%ZMM1 |
0x457c1a VFMADD231PD %ZMM30,%ZMM30,%ZMM1 |
0x457c20 VADDPD %ZMM31,%ZMM8,%ZMM2 |
0x457c26 VADDPD %ZMM30,%ZMM9,%ZMM3 |
0x457c2c VADDPD %ZMM0,%ZMM10,%ZMM4 |
0x457c32 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457c38 VFMADD231PD %ZMM0,%ZMM0,%ZMM1 |
0x457c3e VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x457c44 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457c4a VCMPPD $0x1,%ZMM1,%ZMM2,%K0 |
0x457c51 VADDPD %ZMM31,%ZMM11,%ZMM3 |
0x457c57 VADDPD %ZMM30,%ZMM12,%ZMM4 |
0x457c5d VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457c63 VADDPD %ZMM0,%ZMM13,%ZMM2 |
0x457c69 VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x457c6f VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x457c75 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x457c7b VADDPD %ZMM31,%ZMM14,%ZMM2 |
0x457c81 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 |
0x457c88 VADDPD %ZMM30,%ZMM15,%ZMM4 |
0x457c8e VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457c94 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457c9a VADDPD %ZMM0,%ZMM16,%ZMM4 |
0x457ca0 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457ca6 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x457cac VCMPPD $0x1,%ZMM1,%ZMM2,%K2 |
0x457cb3 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457cb9 VADDPD %ZMM31,%ZMM17,%ZMM2 |
0x457cbf VADDPD %ZMM30,%ZMM18,%ZMM3 |
0x457cc5 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457ccb VADDPD %ZMM0,%ZMM19,%ZMM4 |
0x457cd1 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x457cd7 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457cdd VCMPPD $0x1,%ZMM1,%ZMM2,%K3 |
0x457ce4 VADDPD %ZMM31,%ZMM20,%ZMM3 |
0x457cea VADDPD %ZMM30,%ZMM21,%ZMM4 |
0x457cf0 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457cf6 VADDPD %ZMM0,%ZMM22,%ZMM2 |
0x457cfc VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x457d02 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x457d08 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x457d0e VADDPD %ZMM31,%ZMM23,%ZMM2 |
0x457d14 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 |
0x457d1b VADDPD %ZMM30,%ZMM24,%ZMM4 |
0x457d21 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x457d27 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457d2d VADDPD %ZMM0,%ZMM25,%ZMM4 |
0x457d33 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x457d39 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x457d3f VADDPD %ZMM31,%ZMM26,%ZMM3 |
0x457d45 VADDPD %ZMM30,%ZMM27,%ZMM4 |
0x457d4b VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x457d51 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x457d57 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 |
0x457d5e VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x457d64 VADDPD %ZMM0,%ZMM28,%ZMM2 |
0x457d6a VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x457d70 VMINPD %ZMM1,%ZMM3,%ZMM2 |
0x457d76 VSQRTPD %ZMM2,%ZMM2 |
0x457d7c VCMPPD $0x1,%ZMM1,%ZMM3,%K6 |
0x457d83 VPMOVM2Q %K0,%ZMM1 |
0x457d89 VPSRLQ $0x3f,%ZMM1,%ZMM1 |
0x457d90 VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} [6] |
0x457d9a VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} [6] |
0x457da4 VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} [6] |
0x457dae VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} [6] |
0x457db8 VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} [6] |
0x457dc2 VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} [6] |
0x457dcc KXNORW %K0,%K0,%K1 |
0x457dd0 VMOVUPD %ZMM2,(%RCX,%RAX,8) [5] |
0x457dd7 VXORPD %XMM2,%XMM2,%XMM2 |
0x457ddb VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} [4] |
0x457de3 VADDPD %ZMM31,%ZMM2,%ZMM2 |
0x457de9 VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x457def KXNORW %K0,%K0,%K1 |
0x457df3 VMOVUPD %ZMM2,(%R10,%RAX,8) [2] |
0x457dfa VXORPD %XMM2,%XMM2,%XMM2 |
0x457dfe VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} [4] |
0x457e06 VADDPD %ZMM30,%ZMM2,%ZMM2 |
0x457e0c VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x457e12 KXNORW %K0,%K0,%K1 |
0x457e16 VMOVUPD %ZMM2,(%R8,%RAX,8) [8] |
0x457e1d VXORPD %XMM2,%XMM2,%XMM2 |
0x457e21 VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} [4] |
0x457e29 VADDPD %ZMM0,%ZMM2,%ZMM0 |
0x457e2f VMULPD %ZMM29,%ZMM0,%ZMM0 |
0x457e35 VMOVUPD %ZMM0,(%R11,%RAX,8) [3] |
0x457e3c ADD $0x8,%EDX |
0x457e3f CMP %EBX,%EDX |
0x457e41 JBE 457b00 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds.h: 185 - 217 |
-------------------------------------------------------------------------------- |
185: #pragma omp simd aligned(temp_r, px, py, pz, dx, dy, dz: QMC_SIMD_ALIGNMENT) |
186: for (int iat = first; iat < last; ++iat) |
187: { |
188: const T flip = iat < flip_ind ? one : minusone; |
189: const T displ_0 = (px[iat] - x0) * flip; |
190: const T displ_1 = (py[iat] - y0) * flip; |
191: const T displ_2 = (pz[iat] - z0) * flip; |
192: |
193: const T ar_0 = -std::floor(displ_0 * g00 + displ_1 * g10 + displ_2 * g20); |
194: const T ar_1 = -std::floor(displ_0 * g01 + displ_1 * g11 + displ_2 * g21); |
195: const T ar_2 = -std::floor(displ_0 * g02 + displ_1 * g12 + displ_2 * g22); |
196: |
197: const T delx = displ_0 + ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
198: const T dely = displ_1 + ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
199: const T delz = displ_2 + ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
200: |
201: T rmin = delx * delx + dely * dely + delz * delz; |
202: int ic = 0; |
203: #pragma unroll(7) |
204: for (int c = 1; c < 8; ++c) |
205: { |
206: const T x = delx + cellx[c]; |
207: const T y = dely + celly[c]; |
208: const T z = delz + cellz[c]; |
209: const T r2 = x * x + y * y + z * z; |
210: ic = (r2 < rmin) ? c : ic; |
211: rmin = (r2 < rmin) ? r2 : rmin; |
212: } |
213: |
214: temp_r[iat] = std::sqrt(rmin); |
215: dx[iat] = flip * (delx + cellx[ic]); |
216: dy[iat] = flip * (dely + celly[ic]); |
217: dz[iat] = flip * (delz + cellz[ic]); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►51.83+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:83 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | miniqmc.cpp:482 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►22.25+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:76 | exec |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►13.78+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:83 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►4.61+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:119 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | miniqmc.cpp:482 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►4.42+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:69 | exec |
○ | qmcplusplus::ParticleSet::upda[...] | ParticleSet.cpp:250 | exec |
○ | main.extracted.107 | miniqmc.cpp:390 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:374 | exec |
○ | __libc_init_first | libc.so.6 | |
►1.90+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:119 | exec |
○ | qmcplusplus::ParticleSet::make[...] | ParticleSet.cpp:290 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 | |
►1.20+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:113 | exec |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.03 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.49 |
Bottlenecks | P0, P5 |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:185-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 56.00 |
CQA cycles if no scalar integer | 54.50 |
CQA cycles if FP arith vectorized | 56.00 |
CQA cycles if fully vectorized | 55.50 |
Front-end cycles | 35.25 |
P0 cycles | 56.00 |
P1 cycles | 37.50 |
P2 cycles | 27.50 |
P3 cycles | 27.50 |
P4 cycles | 4.00 |
P5 cycles | 56.00 |
P6 cycles | 2.50 |
P7 cycles | 4.00 |
DIV/SQRT cycles | 18.00 - 24.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 113.75 - 119.42 |
Stall cycles (UFS) | 79.51 - 85.18 |
Nb insns | 128.00 |
Nb uops | 141.00 |
Nb loads | 34.00 |
Nb stores | 4.00 |
Nb stack references | 19.00 |
FLOP/cycle | 15.29 |
Nb FLOP add-sub | 216.00 |
Nb FLOP mul | 136.00 |
Nb FLOP fma | 248.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 8.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 34.86 |
Bytes prefetched | 0.00 |
Bytes loaded | 1696.00 |
Bytes stored | 256.00 |
Stride 0 | 9.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 90.91 |
Vectorization ratio load | 76.47 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 70.27 |
Vector-efficiency ratio all | 89.22 |
Vector-efficiency ratio load | 77.94 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 98.21 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 66.09 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.03 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.49 |
Bottlenecks | P0, P5 |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:185-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 56.00 |
CQA cycles if no scalar integer | 54.50 |
CQA cycles if FP arith vectorized | 56.00 |
CQA cycles if fully vectorized | 55.50 |
Front-end cycles | 35.25 |
P0 cycles | 56.00 |
P1 cycles | 37.50 |
P2 cycles | 27.50 |
P3 cycles | 27.50 |
P4 cycles | 4.00 |
P5 cycles | 56.00 |
P6 cycles | 2.50 |
P7 cycles | 4.00 |
DIV/SQRT cycles | 18.00 - 24.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 113.75 - 119.42 |
Stall cycles (UFS) | 79.51 - 85.18 |
Nb insns | 128.00 |
Nb uops | 141.00 |
Nb loads | 34.00 |
Nb stores | 4.00 |
Nb stack references | 19.00 |
FLOP/cycle | 15.29 |
Nb FLOP add-sub | 216.00 |
Nb FLOP mul | 136.00 |
Nb FLOP fma | 248.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 8.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 34.86 |
Bytes prefetched | 0.00 |
Bytes loaded | 1696.00 |
Bytes stored | 256.00 |
Stride 0 | 9.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 90.91 |
Vectorization ratio load | 76.47 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 70.27 |
Vector-efficiency ratio all | 89.22 |
Vector-efficiency ratio load | 77.94 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 98.21 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 66.09 |
Path / |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:185-217 |
Module | exec |
nb instructions | 128 |
nb uops | 141 |
loop length | 839 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 2 |
used zmm registers | 31 |
nb stack references | 19 |
ADD-SUB / MUL ratio | 1.59 |
micro-operation queue | 35.25 cycles |
front end | 35.25 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 56.00 | 2.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
cycles | 56.00 | 37.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
Cycles executing div or sqrt instructions | 18.00-24.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 113.75-119.42 |
Stall cycles | 79.51-85.18 |
RS full (events) | 1.21-1.23 |
PRF_FLOAT full (events) | 81.49-87.17 |
Front-end | 35.25 |
Dispatch | 56.00 |
DIV/SQRT | 18.00-24.00 |
Data deps. | 0.00 |
Overall L1 | 56.00 |
all | 25% |
load | 14% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
all | 98% |
load | 92% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 92% |
all | 90% |
load | 76% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 70% |
all | 24% |
load | 17% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
all | 96% |
load | 93% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 84% |
all | 89% |
load | 77% |
store | 100% |
mul | 100% |
add-sub | 98% |
fma | 100% |
div/sqrt | 100% |
other | 66% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
LEA (%R9,%RDX,1),%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPCMPGTD %YMM0,%YMM5,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CLTQ | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
VMOVUPD (%R12,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VBROADCASTSD 0x36505(%RIP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSD 0x364f3(%RIP),%ZMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%R15,%RAX,8),%ZMM30 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%RSI,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x600(%RSP),%ZMM31,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x540(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM31,%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM8,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM9,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM10,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM11,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM12,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM13,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM14,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM15,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM16,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM17,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM18,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM19,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM20,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM21,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM22,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM23,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM24,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM25,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM26,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM27,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM28,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VSQRTPD %ZMM2,%ZMM2 | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 24-33 | 18-24 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VPMOVM2Q %K0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPSRLQ $0x3f,%ZMM1,%ZMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%RCX,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM31,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R10,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM30,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R8,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM0,%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%R11,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
ADD $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %EBX,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JBE 457b00 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:185-217 |
Module | exec |
nb instructions | 128 |
nb uops | 141 |
loop length | 839 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 2 |
used zmm registers | 31 |
nb stack references | 19 |
ADD-SUB / MUL ratio | 1.59 |
micro-operation queue | 35.25 cycles |
front end | 35.25 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 56.00 | 2.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
cycles | 56.00 | 37.50 | 27.50 | 27.50 | 4.00 | 56.00 | 2.50 | 4.00 |
Cycles executing div or sqrt instructions | 18.00-24.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 113.75-119.42 |
Stall cycles | 79.51-85.18 |
RS full (events) | 1.21-1.23 |
PRF_FLOAT full (events) | 81.49-87.17 |
Front-end | 35.25 |
Dispatch | 56.00 |
DIV/SQRT | 18.00-24.00 |
Data deps. | 0.00 |
Overall L1 | 56.00 |
all | 25% |
load | 14% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 18% |
all | 98% |
load | 92% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 92% |
all | 90% |
load | 76% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 70% |
all | 24% |
load | 17% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
all | 96% |
load | 93% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 84% |
all | 89% |
load | 77% |
store | 100% |
mul | 100% |
add-sub | 98% |
fma | 100% |
div/sqrt | 100% |
other | 66% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
LEA (%R9,%RDX,1),%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EAX,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPCMPGTD %YMM0,%YMM5,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CLTQ | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
VMOVUPD (%R12,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VBROADCASTSD 0x36505(%RIP),%ZMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSD 0x364f3(%RIP),%ZMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%R15,%RAX,8),%ZMM30 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD (%RSI,%RAX,8),%ZMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x600(%RSP),%ZMM31,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD 0x540(%RSP),%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 8 | 1 |
VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM31,%ZMM31,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM8,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM9,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM10,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM11,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM12,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM13,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM14,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM15,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM16,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM17,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM18,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM19,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM31,%ZMM20,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM21,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM22,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM23,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPD %ZMM30,%ZMM24,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM2,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM25,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM31,%ZMM26,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM30,%ZMM27,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM3,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMINPD %ZMM1,%ZMM2,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VADDPD %ZMM0,%ZMM28,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM1,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VSQRTPD %ZMM2,%ZMM2 | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 24-33 | 18-24 |
VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VPMOVM2Q %K0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VPSRLQ $0x3f,%ZMM1,%ZMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%RCX,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM31,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R10,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM30,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
KXNORW %K0,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPD %ZMM2,(%R8,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} | 4 | 1 | 0 | 4 | 4 | 0 | 1 | 0 | 0 | 21 | 5 |
VADDPD %ZMM0,%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM29,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM0,(%R11,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
ADD $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %EBX,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JBE 457b00 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Metric | run_0 |
---|---|
Coverage (% app. time) | 13.24 |
Time (s) | 7.91 |
Instance Count | 374184 |
Iteration Count - min | 64 |
Iteration Count - avg | 421.78 |
Iteration Count - max | 768 |
Cycles per Iteration - min | 87.38 |
Cycles per Iteration - avg | 104.6 |
Cycles per Iteration - max | 3800.69 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 76.86 |
Instance Count | 374184 |
ORIG CPI:min | 88.95 |
ORIG CPI:med | 90.29 |
ORIG CPI:max | 245.89 |
DL1 CPI:min | 82.48 |
DL1 CPI:med | 82.77 |
DL1 CPI:max | 109.42 |
ORIG (min) / DL1 (min) | 1.08 |
ORIG (med) / DL1 (med) | 1.09 |
ORIG (max) / DL1 (max) | 2.25 |
Nb Iteration:min | 768 |
Nb Iteration:med | 768.00 |
Nb Iteration:max | 768 |
ORIG: min (cycles) | 68312 |
ORIG: med (cycles) | 69344.00 |
ORIG: max (cycles) | 188844 |
DL1:min (cycles) | 63346 |
DL1:med (cycles) | 63568.00 |
DL1:max (cycles) | 84034 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 22.61 |
Instance Count | 374184 |
ORIG CPI:min | 108.79 |
ORIG CPI:med | 126.60 |
ORIG CPI:max | 265.80 |
DL1 CPI:min | 82.52 |
DL1 CPI:med | 82.70 |
DL1 CPI:max | 82.90 |
ORIG (min) / DL1 (min) | 1.32 |
ORIG (med) / DL1 (med) | 1.53 |
ORIG (max) / DL1 (max) | 3.21 |
Nb Iteration:min | 768 |
Nb Iteration:med | 768.00 |
Nb Iteration:max | 768 |
ORIG: min (cycles) | 83550 |
ORIG: med (cycles) | 97232.00 |
ORIG: max (cycles) | 204138 |
DL1:min (cycles) | 63376 |
DL1:med (cycles) | 63510.00 |
DL1:max (cycles) | 63666 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 69344.00 | 69344.00 | 69344.00 | 69344.00 | 68312.00 | 69344.00 | 188844.00 | 63568.00 | 63568.00 | 63568.00 | 63568.00 | 63346.00 | 63568.00 | 84034.00 |
CPI MIN | 88.95 | 82.48 | ||||||||||||
CPI MED | 90.29 | 90.29 | 90.29 | 90.29 | 88.95 | 90.29 | 245.89 | 82.77 | 82.77 | 82.77 | 82.77 | 82.48 | 82.77 | 109.42 |
CPI AVG | 104.08 | 84.32 | ||||||||||||
CPI MAX | 245.89 | 109.42 | ||||||||||||
Iteration Count | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 97232.00 | 97232.00 | 97232.00 | 97232.00 | 83550.00 | 97232.00 | 204138.00 | 63510.00 | 63510.00 | 63510.00 | 63510.00 | 63376.00 | 63510.00 | 63666.00 |
CPI MIN | 108.79 | 82.52 | ||||||||||||
CPI MED | 126.60 | 126.60 | 126.60 | 126.60 | 108.79 | 126.60 | 265.80 | 82.70 | 82.70 | 82.70 | 82.70 | 82.52 | 82.70 | 82.90 |
CPI AVG | 141.08 | 82.72 | ||||||||||||
CPI MAX | 265.80 | 82.90 | ||||||||||||
Iteration Count | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 |
ORIG | DL1 | Original Code |
---|---|---|
0x4fc32a ADDQ $0x1,-0x5132(%RIP) 0x4fc332 LEA (%R9,%RDX,1),%EAX | 0x4fcd04 LEA (%R9,%RDX,1),%EAX | 0x457b00 LEA (%R9,%RDX,1),%EAX |
0x4fc336 VPBROADCASTD %EAX,%YMM0 | 0x4fcd08 VPBROADCASTD %EAX,%YMM0 | 0x457b04 VPBROADCASTD %EAX,%YMM0 |
0x4fc33c VPADDD -0x66504(%RIP),%YMM0,%YMM0 | 0x4fcd0e VPADDD -0x66ed6(%RIP),%YMM0,%YMM0 | 0x457b0a VPADDD 0x3e32e(%RIP),%YMM0,%YMM0 |
0x4fc344 VPCMPGTD %YMM0,%YMM5,%K1 | 0x4fcd16 VPCMPGTD %YMM0,%YMM5,%K1 | 0x457b12 VPCMPGTD %YMM0,%YMM5,%K1 |
0x4fc34a CLTQ | 0x4fcd1c CLTQ | 0x457b18 CLTQ |
0x4fc34c VMOVUPD (%R12,%RAX,8),%ZMM0 | 0x4fcd1e VMOVUPD -0x72a8(%RIP),%ZMM0 | 0x457b1a VMOVUPD (%R12,%RAX,8),%ZMM0 |
0x4fc353 VBROADCASTSD -0x6e32d(%RIP),%ZMM1 | 0x4fcd28 VBROADCASTSD -0x6ed02(%RIP),%ZMM1 | 0x457b21 VBROADCASTSD 0x36505(%RIP),%ZMM1 |
0x4fc35d VBROADCASTSD -0x6e33f(%RIP),%ZMM2 | 0x4fcd32 VBROADCASTSD -0x6ed14(%RIP),%ZMM2 | 0x457b2b VBROADCASTSD 0x364f3(%RIP),%ZMM2 |
0x4fc367 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 0x4fcd3c VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} | 0x457b35 VBLENDMPD %ZMM2,%ZMM1,%ZMM29{%K1} |
0x4fc36d VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 | 0x4fcd42 VSUBPD -0x79cc(%RIP),%ZMM0,%ZMM0 | 0x457b3b VSUBPD 0x780(%RSP),%ZMM0,%ZMM0 |
0x4fc375 VMOVUPD (%R15,%RAX,8),%ZMM30 | 0x4fcd4c VMOVUPD -0x72d6(%RIP),%ZMM30 | 0x457b43 VMOVUPD (%R15,%RAX,8),%ZMM30 |
0x4fc37c VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 | 0x4fcd56 VSUBPD -0x79a0(%RIP),%ZMM30,%ZMM30 | 0x457b4a VSUBPD 0x740(%RSP),%ZMM30,%ZMM30 |
0x4fc384 VMULPD %ZMM29,%ZMM0,%ZMM31 | 0x4fcd60 VMULPD %ZMM29,%ZMM0,%ZMM31 | 0x457b52 VMULPD %ZMM29,%ZMM0,%ZMM31 |
0x4fc38a VMULPD %ZMM29,%ZMM30,%ZMM30 | 0x4fcd66 VMULPD %ZMM29,%ZMM30,%ZMM30 | 0x457b58 VMULPD %ZMM29,%ZMM30,%ZMM30 |
0x4fc390 VMOVUPD (%RSI,%RAX,8),%ZMM0 | 0x4fcd6c VMOVUPD -0x72f6(%RIP),%ZMM0 | 0x457b5e VMOVUPD (%RSI,%RAX,8),%ZMM0 |
0x4fc397 VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 | 0x4fcd76 VSUBPD -0x7980(%RIP),%ZMM0,%ZMM0 | 0x457b65 VSUBPD 0x700(%RSP),%ZMM0,%ZMM0 |
0x4fc39f VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 | 0x4fcd80 VMULPD -0x794a(%RIP),%ZMM31,%ZMM1 | 0x457b6d VMULPD 0x6c0(%RSP),%ZMM31,%ZMM1 |
0x4fc3a7 VMULPD %ZMM29,%ZMM0,%ZMM2 | 0x4fcd8a VMULPD %ZMM29,%ZMM0,%ZMM2 | 0x457b75 VMULPD %ZMM29,%ZMM0,%ZMM2 |
0x4fc3ad VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 | 0x4fcd90 VFMADD231PD -0x791a(%RIP),%ZMM30,%ZMM1 | 0x457b7b VFMADD231PD 0x680(%RSP),%ZMM30,%ZMM1 |
0x4fc3b5 VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 | 0x4fcd9a VFMADD231PD -0x78e4(%RIP),%ZMM2,%ZMM1 | 0x457b83 VFMADD231PD 0x640(%RSP),%ZMM2,%ZMM1 |
0x4fc3bd VMULPD 0x600(%RSP),%ZMM31,%ZMM3 | 0x4fcda4 VMULPD -0x78ae(%RIP),%ZMM31,%ZMM3 | 0x457b8b VMULPD 0x600(%RSP),%ZMM31,%ZMM3 |
0x4fc3c5 VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 | 0x4fcdae VFMADD231PD -0x7878(%RIP),%ZMM30,%ZMM3 | 0x457b93 VFMADD231PD 0x5c0(%RSP),%ZMM30,%ZMM3 |
0x4fc3cd VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 0x4fcdb8 VRNDSCALEPD $0x9,%ZMM1,%ZMM0 | 0x457b9b VRNDSCALEPD $0x9,%ZMM1,%ZMM0 |
0x4fc3d4 VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 | 0x4fcdbf VFMADD231PD -0x7849(%RIP),%ZMM2,%ZMM3 | 0x457ba2 VFMADD231PD 0x580(%RSP),%ZMM2,%ZMM3 |
0x4fc3dc VMULPD 0x540(%RSP),%ZMM31,%ZMM1 | 0x4fcdc9 VMULPD -0x7813(%RIP),%ZMM31,%ZMM1 | 0x457baa VMULPD 0x540(%RSP),%ZMM31,%ZMM1 |
0x4fc3e4 VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 | 0x4fcdd3 VFMADD231PD -0x77dd(%RIP),%ZMM30,%ZMM1 | 0x457bb2 VFMADD231PD 0x500(%RSP),%ZMM30,%ZMM1 |
0x4fc3ec VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 | 0x4fcddd VFMADD231PD -0x77a7(%RIP),%ZMM2,%ZMM1 | 0x457bba VFMADD231PD 0x4c0(%RSP),%ZMM2,%ZMM1 |
0x4fc3f4 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 0x4fcde7 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 | 0x457bc2 VRNDSCALEPD $0x9,%ZMM3,%ZMM3 |
0x4fc3fb VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 0x4fcdee VRNDSCALEPD $0x9,%ZMM1,%ZMM1 | 0x457bc9 VRNDSCALEPD $0x9,%ZMM1,%ZMM1 |
0x4fc402 VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 | 0x4fcdf5 VFMSUB231PD -0x777f(%RIP),%ZMM0,%ZMM31 | 0x457bd0 VFMSUB231PD 0x480(%RSP),%ZMM0,%ZMM31 |
0x4fc40a VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 | 0x4fcdff VFMADD231PD -0x7749(%RIP),%ZMM3,%ZMM31 | 0x457bd8 VFMADD231PD 0x440(%RSP),%ZMM3,%ZMM31 |
0x4fc412 VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 | 0x4fce09 VFMSUB231PD -0x7713(%RIP),%ZMM0,%ZMM30 | 0x457be0 VFMSUB231PD 0x3c0(%RSP),%ZMM0,%ZMM30 |
0x4fc41a VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 | 0x4fce13 VFNMSUB231PD -0x76dd(%RIP),%ZMM1,%ZMM31 | 0x457be8 VFNMSUB231PD 0x400(%RSP),%ZMM1,%ZMM31 |
0x4fc422 VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 | 0x4fce1d VFMADD231PD -0x76a7(%RIP),%ZMM3,%ZMM30 | 0x457bf0 VFMADD231PD 0x380(%RSP),%ZMM3,%ZMM30 |
0x4fc42a VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 | 0x4fce27 VFNMSUB231PD -0x7671(%RIP),%ZMM1,%ZMM30 | 0x457bf8 VFNMSUB231PD 0x340(%RSP),%ZMM1,%ZMM30 |
0x4fc432 VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 | 0x4fce31 VFMSUB132PD -0x763b(%RIP),%ZMM2,%ZMM0 | 0x457c00 VFMSUB132PD 0x300(%RSP),%ZMM2,%ZMM0 |
0x4fc43a VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 0x4fce3b VFMADD231PD %ZMM3,%ZMM6,%ZMM0 | 0x457c08 VFMADD231PD %ZMM3,%ZMM6,%ZMM0 |
0x4fc440 VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 0x4fce41 VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 | 0x457c0e VFNMSUB231PD %ZMM1,%ZMM7,%ZMM0 |
0x4fc446 VMULPD %ZMM31,%ZMM31,%ZMM1 | 0x4fce47 VMULPD %ZMM31,%ZMM31,%ZMM1 | 0x457c14 VMULPD %ZMM31,%ZMM31,%ZMM1 |
0x4fc44c VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 0x4fce4d VFMADD231PD %ZMM30,%ZMM30,%ZMM1 | 0x457c1a VFMADD231PD %ZMM30,%ZMM30,%ZMM1 |
0x4fc452 VADDPD %ZMM31,%ZMM8,%ZMM2 | 0x4fce53 VADDPD %ZMM31,%ZMM8,%ZMM2 | 0x457c20 VADDPD %ZMM31,%ZMM8,%ZMM2 |
0x4fc458 VADDPD %ZMM30,%ZMM9,%ZMM3 | 0x4fce59 VADDPD %ZMM30,%ZMM9,%ZMM3 | 0x457c26 VADDPD %ZMM30,%ZMM9,%ZMM3 |
0x4fc45e VADDPD %ZMM0,%ZMM10,%ZMM4 | 0x4fce5f VADDPD %ZMM0,%ZMM10,%ZMM4 | 0x457c2c VADDPD %ZMM0,%ZMM10,%ZMM4 |
0x4fc464 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fce65 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457c32 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fc46a VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 0x4fce6b VFMADD231PD %ZMM0,%ZMM0,%ZMM1 | 0x457c38 VFMADD231PD %ZMM0,%ZMM0,%ZMM1 |
0x4fc470 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x4fce71 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x457c3e VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x4fc476 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fce77 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457c44 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fc47c VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 0x4fce7d VCMPPD $0x1,%ZMM1,%ZMM2,%K0 | 0x457c4a VCMPPD $0x1,%ZMM1,%ZMM2,%K0 |
0x4fc483 VADDPD %ZMM31,%ZMM11,%ZMM3 | 0x4fce84 VADDPD %ZMM31,%ZMM11,%ZMM3 | 0x457c51 VADDPD %ZMM31,%ZMM11,%ZMM3 |
0x4fc489 VADDPD %ZMM30,%ZMM12,%ZMM4 | 0x4fce8a VADDPD %ZMM30,%ZMM12,%ZMM4 | 0x457c57 VADDPD %ZMM30,%ZMM12,%ZMM4 |
0x4fc48f VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fce90 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457c5d VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fc495 VADDPD %ZMM0,%ZMM13,%ZMM2 | 0x4fce96 VADDPD %ZMM0,%ZMM13,%ZMM2 | 0x457c63 VADDPD %ZMM0,%ZMM13,%ZMM2 |
0x4fc49b VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x4fce9c VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x457c69 VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x4fc4a1 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x4fcea2 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x457c6f VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x4fc4a7 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x4fcea8 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x457c75 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x4fc4ad VADDPD %ZMM31,%ZMM14,%ZMM2 | 0x4fceae VADDPD %ZMM31,%ZMM14,%ZMM2 | 0x457c7b VADDPD %ZMM31,%ZMM14,%ZMM2 |
0x4fc4b3 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 0x4fceb4 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 | 0x457c81 VCMPPD $0x1,%ZMM1,%ZMM3,%K1 |
0x4fc4ba VADDPD %ZMM30,%ZMM15,%ZMM4 | 0x4fcebb VADDPD %ZMM30,%ZMM15,%ZMM4 | 0x457c88 VADDPD %ZMM30,%ZMM15,%ZMM4 |
0x4fc4c0 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fcec1 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457c8e VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fc4c6 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fcec7 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457c94 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fc4cc VADDPD %ZMM0,%ZMM16,%ZMM4 | 0x4fcecd VADDPD %ZMM0,%ZMM16,%ZMM4 | 0x457c9a VADDPD %ZMM0,%ZMM16,%ZMM4 |
0x4fc4d2 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fced3 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457ca0 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fc4d8 VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x4fced9 VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x457ca6 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x4fc4de VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 0x4fcedf VCMPPD $0x1,%ZMM1,%ZMM2,%K2 | 0x457cac VCMPPD $0x1,%ZMM1,%ZMM2,%K2 |
0x4fc4e5 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fcee6 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457cb3 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fc4eb VADDPD %ZMM31,%ZMM17,%ZMM2 | 0x4fceec VADDPD %ZMM31,%ZMM17,%ZMM2 | 0x457cb9 VADDPD %ZMM31,%ZMM17,%ZMM2 |
0x4fc4f1 VADDPD %ZMM30,%ZMM18,%ZMM3 | 0x4fcef2 VADDPD %ZMM30,%ZMM18,%ZMM3 | 0x457cbf VADDPD %ZMM30,%ZMM18,%ZMM3 |
0x4fc4f7 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fcef8 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457cc5 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fc4fd VADDPD %ZMM0,%ZMM19,%ZMM4 | 0x4fcefe VADDPD %ZMM0,%ZMM19,%ZMM4 | 0x457ccb VADDPD %ZMM0,%ZMM19,%ZMM4 |
0x4fc503 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x4fcf04 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 | 0x457cd1 VFMADD231PD %ZMM3,%ZMM3,%ZMM2 |
0x4fc509 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fcf0a VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457cd7 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fc50f VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 0x4fcf10 VCMPPD $0x1,%ZMM1,%ZMM2,%K3 | 0x457cdd VCMPPD $0x1,%ZMM1,%ZMM2,%K3 |
0x4fc516 VADDPD %ZMM31,%ZMM20,%ZMM3 | 0x4fcf17 VADDPD %ZMM31,%ZMM20,%ZMM3 | 0x457ce4 VADDPD %ZMM31,%ZMM20,%ZMM3 |
0x4fc51c VADDPD %ZMM30,%ZMM21,%ZMM4 | 0x4fcf1d VADDPD %ZMM30,%ZMM21,%ZMM4 | 0x457cea VADDPD %ZMM30,%ZMM21,%ZMM4 |
0x4fc522 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fcf23 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457cf0 VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fc528 VADDPD %ZMM0,%ZMM22,%ZMM2 | 0x4fcf29 VADDPD %ZMM0,%ZMM22,%ZMM2 | 0x457cf6 VADDPD %ZMM0,%ZMM22,%ZMM2 |
0x4fc52e VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x4fcf2f VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x457cfc VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x4fc534 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x4fcf35 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x457d02 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x4fc53a VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x4fcf3b VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x457d08 VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x4fc540 VADDPD %ZMM31,%ZMM23,%ZMM2 | 0x4fcf41 VADDPD %ZMM31,%ZMM23,%ZMM2 | 0x457d0e VADDPD %ZMM31,%ZMM23,%ZMM2 |
0x4fc546 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 0x4fcf47 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 | 0x457d14 VCMPPD $0x1,%ZMM1,%ZMM3,%K4 |
0x4fc54d VADDPD %ZMM30,%ZMM24,%ZMM4 | 0x4fcf4e VADDPD %ZMM30,%ZMM24,%ZMM4 | 0x457d1b VADDPD %ZMM30,%ZMM24,%ZMM4 |
0x4fc553 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x4fcf54 VMULPD %ZMM2,%ZMM2,%ZMM2 | 0x457d21 VMULPD %ZMM2,%ZMM2,%ZMM2 |
0x4fc559 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fcf5a VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457d27 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fc55f VADDPD %ZMM0,%ZMM25,%ZMM4 | 0x4fcf60 VADDPD %ZMM0,%ZMM25,%ZMM4 | 0x457d2d VADDPD %ZMM0,%ZMM25,%ZMM4 |
0x4fc565 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x4fcf66 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 | 0x457d33 VFMADD231PD %ZMM4,%ZMM4,%ZMM2 |
0x4fc56b VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x4fcf6c VMINPD %ZMM1,%ZMM3,%ZMM1 | 0x457d39 VMINPD %ZMM1,%ZMM3,%ZMM1 |
0x4fc571 VADDPD %ZMM31,%ZMM26,%ZMM3 | 0x4fcf72 VADDPD %ZMM31,%ZMM26,%ZMM3 | 0x457d3f VADDPD %ZMM31,%ZMM26,%ZMM3 |
0x4fc577 VADDPD %ZMM30,%ZMM27,%ZMM4 | 0x4fcf78 VADDPD %ZMM30,%ZMM27,%ZMM4 | 0x457d45 VADDPD %ZMM30,%ZMM27,%ZMM4 |
0x4fc57d VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x4fcf7e VMULPD %ZMM3,%ZMM3,%ZMM3 | 0x457d4b VMULPD %ZMM3,%ZMM3,%ZMM3 |
0x4fc583 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x4fcf84 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 | 0x457d51 VFMADD231PD %ZMM4,%ZMM4,%ZMM3 |
0x4fc589 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 0x4fcf8a VCMPPD $0x1,%ZMM1,%ZMM2,%K5 | 0x457d57 VCMPPD $0x1,%ZMM1,%ZMM2,%K5 |
0x4fc590 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x4fcf91 VMINPD %ZMM1,%ZMM2,%ZMM1 | 0x457d5e VMINPD %ZMM1,%ZMM2,%ZMM1 |
0x4fc596 VADDPD %ZMM0,%ZMM28,%ZMM2 | 0x4fcf97 VADDPD %ZMM0,%ZMM28,%ZMM2 | 0x457d64 VADDPD %ZMM0,%ZMM28,%ZMM2 |
0x4fc59c VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x4fcf9d VFMADD231PD %ZMM2,%ZMM2,%ZMM3 | 0x457d6a VFMADD231PD %ZMM2,%ZMM2,%ZMM3 |
0x4fc5a2 VMINPD %ZMM1,%ZMM3,%ZMM2 | 0x4fcfa3 VMINPD %ZMM1,%ZMM3,%ZMM2 | 0x457d70 VMINPD %ZMM1,%ZMM3,%ZMM2 |
0x4fc5a8 VSQRTPD %ZMM2,%ZMM2 | 0x4fcfa9 VSQRTPD -0x75b3(%RIP),%ZMM2 | 0x457d76 VSQRTPD %ZMM2,%ZMM2 |
0x4fc5ae VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 0x4fcfb3 VCMPPD $0x1,%ZMM1,%ZMM3,%K6 | 0x457d7c VCMPPD $0x1,%ZMM1,%ZMM3,%K6 |
0x4fc5b5 VPMOVM2Q %K0,%ZMM1 | 0x4fcfba VPMOVM2Q %K0,%ZMM1 | 0x457d83 VPMOVM2Q %K0,%ZMM1 |
0x4fc5bb VPSRLQ $0x3f,%ZMM1,%ZMM1 | 0x4fcfc0 VPSRLQ $0x3f,%ZMM1,%ZMM1 | 0x457d89 VPSRLQ $0x3f,%ZMM1,%ZMM1 |
0x4fc5c2 VPBROADCASTQ -0x6696c(%RIP),%ZMM1{%K1} | 0x4fcfc7 VPBROADCASTQ -0x67371(%RIP),%ZMM1{%K1} | 0x457d90 VPBROADCASTQ 0x3dec6(%RIP),%ZMM1{%K1} |
0x4fc5cc VPBROADCASTQ -0x6696e(%RIP),%ZMM1{%K2} | 0x4fcfd1 VPBROADCASTQ -0x67373(%RIP),%ZMM1{%K2} | 0x457d9a VPBROADCASTQ 0x3dec4(%RIP),%ZMM1{%K2} |
0x4fc5d6 VPBROADCASTQ -0x66970(%RIP),%ZMM1{%K3} | 0x4fcfdb VPBROADCASTQ -0x67375(%RIP),%ZMM1{%K3} | 0x457da4 VPBROADCASTQ 0x3dec2(%RIP),%ZMM1{%K3} |
0x4fc5e0 VPBROADCASTQ -0x6a8ea(%RIP),%ZMM1{%K4} | 0x4fcfe5 VPBROADCASTQ -0x6b2ef(%RIP),%ZMM1{%K4} | 0x457dae VPBROADCASTQ 0x39f48(%RIP),%ZMM1{%K4} |
0x4fc5ea VPBROADCASTQ -0x6697c(%RIP),%ZMM1{%K5} | 0x4fcfef VPBROADCASTQ -0x67381(%RIP),%ZMM1{%K5} | 0x457db8 VPBROADCASTQ 0x3deb6(%RIP),%ZMM1{%K5} |
0x4fc5f4 VPBROADCASTQ -0x6697e(%RIP),%ZMM1{%K6} | 0x4fcff9 VPBROADCASTQ -0x67383(%RIP),%ZMM1{%K6} | 0x457dc2 VPBROADCASTQ 0x3deb4(%RIP),%ZMM1{%K6} |
0x4fc5fe KXNORW %K0,%K0,%K1 | 0x4fd003 KXNORW %K0,%K0,%K1 | 0x457dcc KXNORW %K0,%K0,%K1 |
0x4fc602 VMOVUPD %ZMM2,(%RCX,%RAX,8) | 0x4fd007 VMOVUPD %ZMM2,-0x7551(%RIP) 0x4fd011 NOP | 0x457dd0 VMOVUPD %ZMM2,(%RCX,%RAX,8) |
0x4fc609 VXORPD %XMM2,%XMM2,%XMM2 | 0x4fd012 VXORPD %XMM2,%XMM2,%XMM2 | 0x457dd7 VXORPD %XMM2,%XMM2,%XMM2 |
0x4fc60d VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} | 0x4fd016 VMOVUPS -0x75a0(%RIP),%ZMM2 | 0x457ddb VGATHERQPD 0x90(%RDI,%ZMM1,8),%ZMM2{%K1} |
0x4fc615 VADDPD %ZMM31,%ZMM2,%ZMM2 | 0x4fd020 VADDPD %ZMM31,%ZMM2,%ZMM2 | 0x457de3 VADDPD %ZMM31,%ZMM2,%ZMM2 |
0x4fc61b VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x4fd026 VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x457de9 VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x4fc621 KXNORW %K0,%K0,%K1 | 0x4fd02c KXNORW %K0,%K0,%K1 | 0x457def KXNORW %K0,%K0,%K1 |
0x4fc625 VMOVUPD %ZMM2,(%R10,%RAX,8) | 0x4fd030 VMOVUPD %ZMM2,-0x753a(%RIP) 0x4fd03a NOP | 0x457df3 VMOVUPD %ZMM2,(%R10,%RAX,8) |
0x4fc62c VXORPD %XMM2,%XMM2,%XMM2 | 0x4fd03b VXORPD %XMM2,%XMM2,%XMM2 | 0x457dfa VXORPD %XMM2,%XMM2,%XMM2 |
0x4fc630 VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} | 0x4fd03f VMOVUPS -0x75c9(%RIP),%ZMM2 | 0x457dfe VGATHERQPD 0xd0(%RDI,%ZMM1,8),%ZMM2{%K1} |
0x4fc638 VADDPD %ZMM30,%ZMM2,%ZMM2 | 0x4fd049 VADDPD %ZMM30,%ZMM2,%ZMM2 | 0x457e06 VADDPD %ZMM30,%ZMM2,%ZMM2 |
0x4fc63e VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x4fd04f VMULPD %ZMM29,%ZMM2,%ZMM2 | 0x457e0c VMULPD %ZMM29,%ZMM2,%ZMM2 |
0x4fc644 KXNORW %K0,%K0,%K1 | 0x4fd055 KXNORW %K0,%K0,%K1 | 0x457e12 KXNORW %K0,%K0,%K1 |
0x4fc648 VMOVUPD %ZMM2,(%R8,%RAX,8) | 0x4fd059 VMOVUPD %ZMM2,-0x7523(%RIP) 0x4fd063 NOP | 0x457e16 VMOVUPD %ZMM2,(%R8,%RAX,8) |
0x4fc64f VXORPD %XMM2,%XMM2,%XMM2 | 0x4fd064 VXORPD %XMM2,%XMM2,%XMM2 | 0x457e1d VXORPD %XMM2,%XMM2,%XMM2 |
0x4fc653 VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} | 0x4fd068 VMOVUPS -0x75f2(%RIP),%ZMM2 | 0x457e21 VGATHERQPD 0x110(%RDI,%ZMM1,8),%ZMM2{%K1} |
0x4fc65b VADDPD %ZMM0,%ZMM2,%ZMM0 | 0x4fd072 VADDPD %ZMM0,%ZMM2,%ZMM0 | 0x457e29 VADDPD %ZMM0,%ZMM2,%ZMM0 |
0x4fc661 VMULPD %ZMM29,%ZMM0,%ZMM0 | 0x4fd078 VMULPD %ZMM29,%ZMM0,%ZMM0 | 0x457e2f VMULPD %ZMM29,%ZMM0,%ZMM0 |
0x4fc667 VMOVUPD %ZMM0,(%R11,%RAX,8) | 0x4fd07e VMOVUPD %ZMM0,-0x7508(%RIP) 0x4fd088 NOP | 0x457e35 VMOVUPD %ZMM0,(%R11,%RAX,8) |
0x4fc66e ADD $0x8,%EDX | 0x4fd089 ADD $0x8,%EDX | 0x457e3c ADD $0x8,%EDX |
0x4fc671 CMP %EBX,%EDX | 0x4fd08c CMP %EBX,%EDX | 0x457e3f CMP %EBX,%EDX |
0x4fc673 JBE 4fc32a <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii+0xa4d1a> | 0x4fd08e JBE 4fcd04 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii+0xa56f4> | 0x457e41 JBE 457b00 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii+0x4f0> |
Path / |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 15.29, 15.29, | 16.15, 16.15, | 15.29, 15.29, |
cycles L1 CQA | 56.00 | 53.00 | 56.00 |
cycles UFS | 119.52 | 68.49 | 119.42 |
bytes loaded | 1704.00 | 1760.00 | 1696.00 |
bytes stored | 264.00 | 256.00 | 256.00 |
nb loads | 35.00 | 35.00 | 34.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 56.00 | 53.00 | 56.00 |
cycles front end | 35.75 | 34.25 | 35.25 |
cycles P0 | 56.00 | 53.00 | 56.00 |
cycles P1 | 37.50 | 37.50 | 37.50 |
cycles P2 | 28.00 | 17.50 | 27.50 |
cycles P3 | 28.00 | 17.50 | 27.50 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 56.00 | 53.00 | 56.00 |
cycles P6 | 3.00 | 2.50 | 2.50 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 84.77 | 34.09 | 85.18 |
LB full | 0.00 | 0.00 | 0.00 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 87.27 | 0.00 | 87.17 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 1.24 | 61.95 | 1.23 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 143.00 | 137.00 | 141.00 |
uops P0 | 56.00 | 53.00 | 56.00 |
uops P1 | 3.00 | 2.50 | 2.50 |
uops P2 | 28.00 | 17.50 | 27.50 |
uops P3 | 28.00 | 17.50 | 27.50 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 56.00 | 53.00 | 56.00 |
uops P6 | 3.00 | 2.50 | 2.50 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 983 | 985 | 971 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 15.29, 15.29, | 16.15, 16.15, | 15.29, 15.29, |
cycles L1 CQA | 56.00 | 53.00 | 56.00 |
cycles UFS | 119.52 | 68.49 | 119.42 |
bytes loaded | 1704.00 | 1760.00 | 1696.00 |
bytes stored | 264.00 | 256.00 | 256.00 |
nb loads | 35.00 | 35.00 | 34.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 56.00 | 53.00 | 56.00 |
cycles front end | 35.75 | 34.25 | 35.25 |
cycles P0 | 56.00 | 53.00 | 56.00 |
cycles P1 | 37.50 | 37.50 | 37.50 |
cycles P2 | 28.00 | 17.50 | 27.50 |
cycles P3 | 28.00 | 17.50 | 27.50 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 56.00 | 53.00 | 56.00 |
cycles P6 | 3.00 | 2.50 | 2.50 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 84.77 | 34.09 | 85.18 |
LB full | 0.00 | 0.00 | 0.00 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 87.27 | 0.00 | 87.17 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 1.24 | 61.95 | 1.23 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 143.00 | 137.00 | 141.00 |
uops P0 | 56.00 | 53.00 | 56.00 |
uops P1 | 3.00 | 2.50 | 2.50 |
uops P2 | 28.00 | 17.50 | 27.50 |
uops P3 | 28.00 | 17.50 | 27.50 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 56.00 | 53.00 | 56.00 |
uops P6 | 3.00 | 2.50 | 2.50 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 983 | 985 | 971 |