Loop Id: 1107 | Module: exec | Source: ParticleBConds.h:188-217 | Coverage: 40.42% |
---|
Loop Id: 1107 | Module: exec | Source: ParticleBConds.h:188-217 | Coverage: 40.42% |
---|
0x491830 VMOVSD %XMM8,%XMM8,%XMM11 |
0x491835 CMP %ECX,0x28(%RBP) [10] |
0x491838 JLE 49183f |
0x49183a VMOVSD %XMM10,%XMM10,%XMM11 |
0x49183f VMOVSD (%R11,%RCX,8),%XMM12 [4] |
0x491845 VMOVSD (%R10,%RCX,8),%XMM1 [6] |
0x49184b VMOVSD (%RSI,%RCX,8),%XMM13 [1] |
0x491850 VSUBSD %XMM6,%XMM12,%XMM2 |
0x491854 VSUBSD %XMM5,%XMM1,%XMM0 |
0x491858 VSUBSD %XMM7,%XMM13,%XMM3 |
0x49185c VMULSD %XMM11,%XMM2,%XMM15 |
0x491861 VMULSD %XMM11,%XMM0,%XMM14 |
0x491866 VMULSD %XMM11,%XMM3,%XMM12 |
0x49186b VMULSD 0x20(%RAX),%XMM15,%XMM1 [2] |
0x491870 VMULSD 0x8(%RAX),%XMM15,%XMM4 [2] |
0x491875 VMULSD 0x38(%RAX),%XMM15,%XMM3 [2] |
0x49187a VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 [2] |
0x491880 VFMADD231SD (%RAX),%XMM14,%XMM4 [2] |
0x491885 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 [2] |
0x49188b VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 [2] |
0x491891 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 [2] |
0x491897 VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 [2] |
0x49189d VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x4918a4 VXORPD %XMM9,%XMM0,%XMM2 |
0x4918a9 VMULSD 0x68(%RAX),%XMM2,%XMM0 [2] |
0x4918ae VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
0x4918b5 VMULSD 0x50(%RAX),%XMM2,%XMM1 [2] |
0x4918ba VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
0x4918c1 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 [2] |
0x4918c7 VMULSD 0x80(%RAX),%XMM2,%XMM2 [2] |
0x4918cf VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 [2] |
0x4918d8 VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 [2] |
0x4918de VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 [2] |
0x4918e4 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 [2] |
0x4918ea VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 [2] |
0x4918f0 VADDSD %XMM15,%XMM0,%XMM15 |
0x4918f5 VADDSD %XMM14,%XMM1,%XMM14 |
0x4918fa VADDSD %XMM12,%XMM13,%XMM13 |
0x4918ff VADDSD 0xd8(%RAX),%XMM15,%XMM12 [2] |
0x491907 VMULSD %XMM15,%XMM15,%XMM1 |
0x49190c VADDSD 0x98(%RAX),%XMM14,%XMM4 [2] |
0x491914 VADDSD 0x118(%RAX),%XMM13,%XMM3 [2] |
0x49191c VMULSD %XMM12,%XMM12,%XMM0 |
0x491921 VFMADD231SD %XMM14,%XMM14,%XMM1 |
0x491926 VFMADD132SD %XMM4,%XMM0,%XMM4 |
0x49192b VFMADD231SD %XMM13,%XMM13,%XMM1 |
0x491930 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x491935 VADDSD 0xe0(%RAX),%XMM15,%XMM4 [2] |
0x49193d VMULSD %XMM4,%XMM4,%XMM12 |
0x491941 VADDSD 0x128(%RAX),%XMM13,%XMM4 [2] |
0x491949 VMINSD %XMM3,%XMM1,%XMM2 |
0x49194d VCOMISD %XMM3,%XMM1 |
0x491951 VADDSD 0xa0(%RAX),%XMM14,%XMM1 [2] |
0x491959 VADDSD 0x120(%RAX),%XMM13,%XMM3 [2] |
0x491961 VFMADD132SD %XMM1,%XMM12,%XMM1 |
0x491966 SETA %R13B |
0x49196a VADDSD 0xf0(%RAX),%XMM15,%XMM12 [2] |
0x491972 MOVZX %R13B,%EDX |
0x491976 MOV $0x4,%R13D |
0x49197c VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x491981 VADDSD 0xe8(%RAX),%XMM15,%XMM1 [2] |
0x491989 VCOMISD %XMM3,%XMM2 |
0x49198d VMINSD %XMM2,%XMM3,%XMM0 |
0x491991 VADDSD 0xa8(%RAX),%XMM14,%XMM2 [2] |
0x491999 VMULSD %XMM1,%XMM1,%XMM3 |
0x49199d VADDSD 0xb0(%RAX),%XMM14,%XMM1 [2] |
0x4919a5 CMOVA %R15,%RDX |
0x4919a9 VFMADD132SD %XMM2,%XMM3,%XMM2 |
0x4919ae VFMADD132SD %XMM4,%XMM2,%XMM4 |
0x4919b3 VADDSD 0x130(%RAX),%XMM13,%XMM2 [2] |
0x4919bb VADDSD 0x138(%RAX),%XMM13,%XMM3 [2] |
0x4919c3 VCOMISD %XMM4,%XMM0 |
0x4919c7 VMINSD %XMM0,%XMM4,%XMM0 |
0x4919cb VMULSD %XMM12,%XMM12,%XMM4 |
0x4919d0 VADDSD 0xf8(%RAX),%XMM15,%XMM12 [2] |
0x4919d8 CMOVA %R14,%RDX |
0x4919dc VFMADD132SD %XMM1,%XMM4,%XMM1 |
0x4919e1 VADDSD 0xc0(%RAX),%XMM14,%XMM4 [2] |
0x4919e9 VFMADD132SD %XMM2,%XMM1,%XMM2 |
0x4919ee VADDSD 0xb8(%RAX),%XMM14,%XMM1 [2] |
0x4919f6 VCOMISD %XMM2,%XMM0 |
0x4919fa VMINSD %XMM0,%XMM2,%XMM0 |
0x4919fe VMULSD %XMM12,%XMM12,%XMM2 |
0x491a03 CMOVA %R13,%RDX |
0x491a07 MOV $0x5,%R13D |
0x491a0d VFMADD132SD %XMM1,%XMM2,%XMM1 |
0x491a12 VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x491a17 VADDSD 0x100(%RAX),%XMM15,%XMM1 [2] |
0x491a1f VMULSD %XMM1,%XMM1,%XMM12 |
0x491a23 VADDSD 0x148(%RAX),%XMM13,%XMM1 [2] |
0x491a2b VCOMISD %XMM3,%XMM0 |
0x491a2f VMINSD %XMM0,%XMM3,%XMM0 |
0x491a33 VADDSD 0x140(%RAX),%XMM13,%XMM3 [2] |
0x491a3b VFMADD132SD %XMM4,%XMM12,%XMM4 |
0x491a40 CMOVA %R13,%RDX |
0x491a44 MOV $0x6,%R13D |
0x491a4a VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x491a4f VADDSD 0x108(%RAX),%XMM15,%XMM4 [2] |
0x491a57 VMINSD %XMM0,%XMM3,%XMM2 |
0x491a5b VCOMISD %XMM3,%XMM0 |
0x491a5f VADDSD 0xc8(%RAX),%XMM14,%XMM0 [2] |
0x491a67 VMULSD %XMM4,%XMM4,%XMM3 |
0x491a6b CMOVA %R13,%RDX |
0x491a6f MOV $0x7,%R13D |
0x491a75 VFMADD132SD %XMM0,%XMM3,%XMM0 |
0x491a7a VFMADD132SD %XMM1,%XMM0,%XMM1 |
0x491a7f VCOMISD %XMM1,%XMM2 |
0x491a83 VMINSD %XMM1,%XMM2,%XMM12 |
0x491a87 CMOVA %R13,%RDX |
0x491a8b VSQRTSD %XMM12,%XMM12,%XMM12 |
0x491a90 LEA (%RAX,%RDX,8),%RDX |
0x491a94 VADDSD 0x90(%RDX),%XMM14,%XMM14 [9] |
0x491a9c VMOVSD %XMM12,(%R8,%RCX,8) [7] |
0x491aa2 VMULSD %XMM11,%XMM14,%XMM2 |
0x491aa7 VMOVSD %XMM2,(%R12,%RCX,8) [3] |
0x491aad VADDSD 0xd0(%RDX),%XMM15,%XMM15 [9] |
0x491ab5 VMULSD %XMM11,%XMM15,%XMM0 |
0x491aba VMOVSD %XMM0,(%RBX,%RCX,8) [8] |
0x491abf VADDSD 0x110(%RDX),%XMM13,%XMM13 [9] |
0x491ac7 VMULSD %XMM11,%XMM13,%XMM11 |
0x491acc VMOVSD %XMM11,(%RDI,%RCX,8) [5] |
0x491ad1 INC %RCX |
0x491ad4 CMP %RCX,%R9 |
0x491ad7 JNE 491830 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds.h: 188 - 217 |
-------------------------------------------------------------------------------- |
188: const T flip = iat < flip_ind ? one : minusone; |
189: const T displ_0 = (px[iat] - x0) * flip; |
190: const T displ_1 = (py[iat] - y0) * flip; |
191: const T displ_2 = (pz[iat] - z0) * flip; |
192: |
193: const T ar_0 = -std::floor(displ_0 * g00 + displ_1 * g10 + displ_2 * g20); |
194: const T ar_1 = -std::floor(displ_0 * g01 + displ_1 * g11 + displ_2 * g21); |
195: const T ar_2 = -std::floor(displ_0 * g02 + displ_1 * g12 + displ_2 * g22); |
196: |
197: const T delx = displ_0 + ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
198: const T dely = displ_1 + ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
199: const T delz = displ_2 + ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
200: |
201: T rmin = delx * delx + dely * dely + delz * delz; |
202: int ic = 0; |
203: #pragma unroll(7) |
204: for (int c = 1; c < 8; ++c) |
205: { |
206: const T x = delx + cellx[c]; |
207: const T y = dely + celly[c]; |
208: const T z = delz + cellz[c]; |
209: const T r2 = x * x + y * y + z * z; |
210: ic = (r2 < rmin) ? c : ic; |
211: rmin = (r2 < rmin) ? r2 : rmin; |
212: } |
213: |
214: temp_r[iat] = std::sqrt(rmin); |
215: dx[iat] = flip * (delx + cellx[ic]); |
216: dy[iat] = flip * (dely + celly[ic]); |
217: dz[iat] = flip * (delz + cellz[ic]); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►32.60+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►26.36+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►16.41+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:77 | exec |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | exec |
○ | main._omp_fn.1 | stl_vector.h:1126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►16.07+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►5.34+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | miniqmc.cpp:484 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.62+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:67 | exec |
○ | qmcplusplus::ParticleSet::upda[...] | ParticleSet.cpp:250 | exec |
○ | main._omp_fn.0 | miniqmc.cpp:390 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.60+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:67 | exec |
○ | qmcplusplus::ParticleSet::upda[...] | ParticleSet.cpp:250 | exec |
○ | main._omp_fn.0 | miniqmc.cpp:390 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.51 |
Bottlenecks | |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | multi-versioned |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 32.88 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 9.50 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
DIV/SQRT cycles | 4.50 - 6.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 66.11 - 66.07 |
Stall cycles (UFS) | 32.79 - 32.74 |
Nb insns | 123.50 |
Nb uops | 131.50 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.90 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.08 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.73 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.50 |
Bottlenecks | P0, P1, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | multi-versioned |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 33.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 10.00 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
DIV/SQRT cycles | 4.50 - 6.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 66.54 - 66.52 |
Stall cycles (UFS) | 33.09 - 33.06 |
Nb insns | 124.00 |
Nb uops | 132.00 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.90 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.00 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.75 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.51 |
Bottlenecks | P0, P1, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | multi-versioned |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 32.75 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 9.00 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
DIV/SQRT cycles | 4.50 - 6.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 65.68 - 65.62 |
Stall cycles (UFS) | 32.49 - 32.41 |
Nb insns | 123.00 |
Nb uops | 131.00 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.91 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.17 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.72 |
Path / |
nb instructions | 123.50 |
nb uops | 131.50 |
loop length | 682.50 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 15.50 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 32.88 cycles |
front end | 32.88 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.50 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.50 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 66.11-66.07 |
Stall cycles | 32.79-32.74 |
RS full (events) | 61.42-56.55 |
LB full (events) | 0.33-3.35 |
Front-end | 32.88 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 0.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
nb instructions | 124 |
nb uops | 132 |
loop length | 685 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 33.00 cycles |
front end | 33.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 10.00 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 10.00 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 66.54-66.52 |
Stall cycles | 33.09-33.06 |
RS full (events) | 61.85-58.18 |
LB full (events) | 0.37-3.00 |
Front-end | 33.00 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 0.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD %XMM8,%XMM8,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %ECX,0x28(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 49183f <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x9f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD %XMM10,%XMM10,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD (%R11,%RCX,8),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R10,%RCX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RSI,%RCX,8),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM6,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM5,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM7,%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM2,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM3,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x20(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x8(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x38(%RAX),%XMM15,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VXORPD %XMM9,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULSD 0x68(%RAX),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMULSD 0x50(%RAX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x80(%RAX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM15,%XMM0,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM14,%XMM1,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM12,%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM15,%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x98(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x118(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM14,%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x128(%RAX),%XMM13,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x120(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SETA %R13B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVZX %R13B,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV $0x4,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM2,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R15,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM2,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM2,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x130(%RAX),%XMM13,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x138(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM4,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R14,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM1,%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM2,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM2,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x5,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x100(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x148(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x140(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x6,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x108(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM0,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x7,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM1,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM1,%XMM2,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VSQRTSD %XMM12,%XMM12,%XMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 4.50-6 |
LEA (%RAX,%RDX,8),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VADDSD 0x90(%RDX),%XMM14,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,(%R8,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM11,%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,(%R12,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM15,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,(%RBX,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0x110(%RDX),%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM11,(%RDI,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %RCX,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 491830 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
nb instructions | 123 |
nb uops | 131 |
loop length | 680 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 15 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 32.75 cycles |
front end | 32.75 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.00 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.00 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 65.68-65.62 |
Stall cycles | 32.49-32.41 |
RS full (events) | 60.99-54.93 |
LB full (events) | 0.28-3.70 |
Front-end | 32.75 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 0.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD %XMM8,%XMM8,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %ECX,0x28(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 49183f <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x9f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD (%R11,%RCX,8),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R10,%RCX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RSI,%RCX,8),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM6,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM5,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM7,%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM2,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM3,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x20(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x8(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x38(%RAX),%XMM15,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VXORPD %XMM9,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULSD 0x68(%RAX),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMULSD 0x50(%RAX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x80(%RAX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM15,%XMM0,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM14,%XMM1,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM12,%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM15,%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x98(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x118(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM14,%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x128(%RAX),%XMM13,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x120(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SETA %R13B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVZX %R13B,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV $0x4,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM2,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R15,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM2,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM2,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x130(%RAX),%XMM13,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x138(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM4,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R14,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM1,%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM2,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM2,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x5,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x100(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x148(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x140(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x6,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x108(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM0,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x7,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM1,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM1,%XMM2,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VSQRTSD %XMM12,%XMM12,%XMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 4.50-6 |
LEA (%RAX,%RDX,8),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VADDSD 0x90(%RDX),%XMM14,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,(%R8,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM11,%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,(%R12,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM15,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,(%RBX,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0x110(%RDX),%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM11,(%RDI,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %RCX,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 491830 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Metric | run_0 |
---|---|
Coverage (% app. time) | 40.42 |
Time (s) | 39.43 |
Instance Count | 190164 |
Iteration Count - min | 6144 |
Iteration Count - avg | 6144 |
Iteration Count - max | 6144 |
Cycles per Iteration - min | 70.73 |
Cycles per Iteration - avg | 71.25 |
Cycles per Iteration - max | 240.15 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 99.91 |
Instance Count | 190164 |
ORIG CPI:min | 70.74 |
ORIG CPI:med | 70.85 |
ORIG CPI:max | 74.04 |
DL1 CPI:min | 70.93 |
DL1 CPI:med | 71.60 |
DL1 CPI:max | 80.88 |
ORIG (min) / DL1 (min) | 1.00 |
ORIG (med) / DL1 (med) | 0.99 |
ORIG (max) / DL1 (max) | 0.92 |
Nb Iteration:min | 6144 |
Nb Iteration:med | 6144.00 |
Nb Iteration:max | 6144 |
ORIG: min (cycles) | 434616 |
ORIG: med (cycles) | 435292.00 |
ORIG: max (cycles) | 454876 |
DL1:min (cycles) | 435812 |
DL1:med (cycles) | 439908.00 |
DL1:max (cycles) | 496942 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 435292.00 | 435292.00 | 435292.00 | 435292.00 | 434616.00 | 435292.00 | 454876.00 | 439908.00 | 439908.00 | 439908.00 | 439908.00 | 435812.00 | 439908.00 | 496942.00 |
CPI MIN | 70.74 | 70.93 | ||||||||||||
CPI MED | 70.85 | 70.85 | 70.85 | 70.85 | 70.74 | 70.85 | 74.04 | 71.60 | 71.60 | 71.60 | 71.60 | 70.93 | 71.60 | 80.88 |
CPI AVG | 71.05 | 72.05 | ||||||||||||
CPI MAX | 74.04 | 80.88 | ||||||||||||
Iteration Count | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 | 6144.00 |
ORIG | DL1 | Original Code |
---|---|---|
0x4e80e4 ADDQ $0x1,-0x4d6c(%RIP) 0x4e80ec VMOVSD %XMM8,%XMM8,%XMM11 | 0x4e8958 VMOVSD %XMM8,%XMM8,%XMM11 | 0x491830 VMOVSD %XMM8,%XMM8,%XMM11 |
0x4e80f1 CMP %ECX,0x28(%RBP) | 0x4e895d CMP %ECX,-0x6463(%RIP) | 0x491835 CMP %ECX,0x28(%RBP) |
0x4e80f4 JLE 4e80fb <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x5695b> | 0x4e8963 JLE 4e896a <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x571ca> | 0x491838 JLE 49183f <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x9f> |
0x4e80f6 VMOVSD %XMM10,%XMM10,%XMM11 | 0x4e8965 VMOVSD %XMM10,%XMM10,%XMM11 | 0x49183a VMOVSD %XMM10,%XMM10,%XMM11 |
0x4e80fb VMOVSD (%R11,%RCX,8),%XMM12 | 0x4e896a VMOVSD -0x63b2(%RIP),%XMM12 | 0x49183f VMOVSD (%R11,%RCX,8),%XMM12 |
0x4e8101 VMOVSD (%R10,%RCX,8),%XMM1 | 0x4e8972 VMOVSD -0x63ba(%RIP),%XMM1 | 0x491845 VMOVSD (%R10,%RCX,8),%XMM1 |
0x4e8107 VMOVSD (%RSI,%RCX,8),%XMM13 | 0x4e897a VMOVSD -0x63c2(%RIP),%XMM13 | 0x49184b VMOVSD (%RSI,%RCX,8),%XMM13 |
0x4e810c VSUBSD %XMM6,%XMM12,%XMM2 | 0x4e8982 VSUBSD %XMM6,%XMM12,%XMM2 | 0x491850 VSUBSD %XMM6,%XMM12,%XMM2 |
0x4e8110 VSUBSD %XMM5,%XMM1,%XMM0 | 0x4e8986 VSUBSD %XMM5,%XMM1,%XMM0 | 0x491854 VSUBSD %XMM5,%XMM1,%XMM0 |
0x4e8114 VSUBSD %XMM7,%XMM13,%XMM3 | 0x4e898a VSUBSD %XMM7,%XMM13,%XMM3 | 0x491858 VSUBSD %XMM7,%XMM13,%XMM3 |
0x4e8118 VMULSD %XMM11,%XMM2,%XMM15 | 0x4e898e VMULSD %XMM11,%XMM2,%XMM15 | 0x49185c VMULSD %XMM11,%XMM2,%XMM15 |
0x4e811d VMULSD %XMM11,%XMM0,%XMM14 | 0x4e8993 VMULSD %XMM11,%XMM0,%XMM14 | 0x491861 VMULSD %XMM11,%XMM0,%XMM14 |
0x4e8122 VMULSD %XMM11,%XMM3,%XMM12 | 0x4e8998 VMULSD %XMM11,%XMM3,%XMM12 | 0x491866 VMULSD %XMM11,%XMM3,%XMM12 |
0x4e8127 VMULSD 0x20(%RAX),%XMM15,%XMM1 | 0x4e899d VMULSD -0x63e5(%RIP),%XMM15,%XMM1 | 0x49186b VMULSD 0x20(%RAX),%XMM15,%XMM1 |
0x4e812c VMULSD 0x8(%RAX),%XMM15,%XMM4 | 0x4e89a5 VMULSD -0x63ed(%RIP),%XMM15,%XMM4 | 0x491870 VMULSD 0x8(%RAX),%XMM15,%XMM4 |
0x4e8131 VMULSD 0x38(%RAX),%XMM15,%XMM3 | 0x4e89ad VMULSD -0x63f5(%RIP),%XMM15,%XMM3 | 0x491875 VMULSD 0x38(%RAX),%XMM15,%XMM3 |
0x4e8136 VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 0x4e89b5 VFMADD231SD -0x63fe(%RIP),%XMM14,%XMM1 | 0x49187a VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 |
0x4e813c VFMADD231SD (%RAX),%XMM14,%XMM4 | 0x4e89be VFMADD231SD -0x6407(%RIP),%XMM14,%XMM4 | 0x491880 VFMADD231SD (%RAX),%XMM14,%XMM4 |
0x4e8141 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 0x4e89c7 VFMADD231SD -0x6410(%RIP),%XMM14,%XMM3 | 0x491885 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 |
0x4e8147 VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 0x4e89d0 VFMADD231SD -0x6419(%RIP),%XMM12,%XMM1 | 0x49188b VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 |
0x4e814d VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 0x4e89d9 VFMADD231SD -0x6422(%RIP),%XMM12,%XMM4 | 0x491891 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 |
0x4e8153 VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 0x4e89e2 VFMADD231SD -0x642b(%RIP),%XMM12,%XMM3 | 0x491897 VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 |
0x4e8159 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 0x4e89eb VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 0x49189d VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x4e8160 VXORPD %XMM9,%XMM0,%XMM2 | 0x4e89f2 VXORPD %XMM9,%XMM0,%XMM2 | 0x4918a4 VXORPD %XMM9,%XMM0,%XMM2 |
0x4e8165 VMULSD 0x68(%RAX),%XMM2,%XMM0 | 0x4e89f7 VMULSD -0x643f(%RIP),%XMM2,%XMM0 | 0x4918a9 VMULSD 0x68(%RAX),%XMM2,%XMM0 |
0x4e816a VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 0x4e89ff VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 0x4918ae VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
0x4e8171 VMULSD 0x50(%RAX),%XMM2,%XMM1 | 0x4e8a06 VMULSD -0x644e(%RIP),%XMM2,%XMM1 | 0x4918b5 VMULSD 0x50(%RAX),%XMM2,%XMM1 |
0x4e8176 VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 0x4e8a0e VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 0x4918ba VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
0x4e817d VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 0x4e8a15 VFNMADD231SD -0x645e(%RIP),%XMM4,%XMM15 | 0x4918c1 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 |
0x4e8183 VMULSD 0x80(%RAX),%XMM2,%XMM2 | 0x4e8a1e VMULSD -0x6466(%RIP),%XMM2,%XMM2 | 0x4918c7 VMULSD 0x80(%RAX),%XMM2,%XMM2 |
0x4e818b VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 0x4e8a26 VFNMADD231SD -0x646f(%RIP),%XMM4,%XMM12 | 0x4918cf VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 |
0x4e8194 VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 0x4e8a2f VFNMADD231SD -0x6478(%RIP),%XMM4,%XMM14 | 0x4918d8 VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 |
0x4e819a VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 0x4e8a38 VFNMADD231SD -0x6481(%RIP),%XMM13,%XMM0 | 0x4918de VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 |
0x4e81a0 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 0x4e8a41 VFNMADD231SD -0x648a(%RIP),%XMM13,%XMM1 | 0x4918e4 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 |
0x4e81a6 VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 0x4e8a4a VFNMADD132SD -0x6493(%RIP),%XMM2,%XMM13 | 0x4918ea VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 |
0x4e81ac VADDSD %XMM15,%XMM0,%XMM15 | 0x4e8a53 VADDSD %XMM15,%XMM0,%XMM15 | 0x4918f0 VADDSD %XMM15,%XMM0,%XMM15 |
0x4e81b1 VADDSD %XMM14,%XMM1,%XMM14 | 0x4e8a58 VADDSD %XMM14,%XMM1,%XMM14 | 0x4918f5 VADDSD %XMM14,%XMM1,%XMM14 |
0x4e81b6 VADDSD %XMM12,%XMM13,%XMM13 | 0x4e8a5d VADDSD %XMM12,%XMM13,%XMM13 | 0x4918fa VADDSD %XMM12,%XMM13,%XMM13 |
0x4e81bb VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 0x4e8a62 VADDSD -0x64aa(%RIP),%XMM15,%XMM12 | 0x4918ff VADDSD 0xd8(%RAX),%XMM15,%XMM12 |
0x4e81c3 VMULSD %XMM15,%XMM15,%XMM1 | 0x4e8a6a VMULSD %XMM15,%XMM15,%XMM1 | 0x491907 VMULSD %XMM15,%XMM15,%XMM1 |
0x4e81c8 VADDSD 0x98(%RAX),%XMM14,%XMM4 | 0x4e8a6f VADDSD -0x64b7(%RIP),%XMM14,%XMM4 | 0x49190c VADDSD 0x98(%RAX),%XMM14,%XMM4 |
0x4e81d0 VADDSD 0x118(%RAX),%XMM13,%XMM3 | 0x4e8a77 VADDSD -0x64bf(%RIP),%XMM13,%XMM3 | 0x491914 VADDSD 0x118(%RAX),%XMM13,%XMM3 |
0x4e81d8 VMULSD %XMM12,%XMM12,%XMM0 | 0x4e8a7f VMULSD %XMM12,%XMM12,%XMM0 | 0x49191c VMULSD %XMM12,%XMM12,%XMM0 |
0x4e81dd VFMADD231SD %XMM14,%XMM14,%XMM1 | 0x4e8a84 VFMADD231SD %XMM14,%XMM14,%XMM1 | 0x491921 VFMADD231SD %XMM14,%XMM14,%XMM1 |
0x4e81e2 VFMADD132SD %XMM4,%XMM0,%XMM4 | 0x4e8a89 VFMADD132SD %XMM4,%XMM0,%XMM4 | 0x491926 VFMADD132SD %XMM4,%XMM0,%XMM4 |
0x4e81e7 VFMADD231SD %XMM13,%XMM13,%XMM1 | 0x4e8a8e VFMADD231SD %XMM13,%XMM13,%XMM1 | 0x49192b VFMADD231SD %XMM13,%XMM13,%XMM1 |
0x4e81ec VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x4e8a93 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x491930 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x4e81f1 VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 0x4e8a98 VADDSD -0x64e0(%RIP),%XMM15,%XMM4 | 0x491935 VADDSD 0xe0(%RAX),%XMM15,%XMM4 |
0x4e81f9 VMULSD %XMM4,%XMM4,%XMM12 | 0x4e8aa0 VMULSD %XMM4,%XMM4,%XMM12 | 0x49193d VMULSD %XMM4,%XMM4,%XMM12 |
0x4e81fd VADDSD 0x128(%RAX),%XMM13,%XMM4 | 0x4e8aa4 VADDSD -0x64ec(%RIP),%XMM13,%XMM4 | 0x491941 VADDSD 0x128(%RAX),%XMM13,%XMM4 |
0x4e8205 VMINSD %XMM3,%XMM1,%XMM2 | 0x4e8aac VMINSD %XMM3,%XMM1,%XMM2 | 0x491949 VMINSD %XMM3,%XMM1,%XMM2 |
0x4e8209 VCOMISD %XMM3,%XMM1 | 0x4e8ab0 VCOMISD %XMM3,%XMM1 | 0x49194d VCOMISD %XMM3,%XMM1 |
0x4e820d VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 0x4e8ab4 VADDSD -0x64fc(%RIP),%XMM14,%XMM1 | 0x491951 VADDSD 0xa0(%RAX),%XMM14,%XMM1 |
0x4e8215 VADDSD 0x120(%RAX),%XMM13,%XMM3 | 0x4e8abc VADDSD -0x6504(%RIP),%XMM13,%XMM3 | 0x491959 VADDSD 0x120(%RAX),%XMM13,%XMM3 |
0x4e821d VFMADD132SD %XMM1,%XMM12,%XMM1 | 0x4e8ac4 VFMADD132SD %XMM1,%XMM12,%XMM1 | 0x491961 VFMADD132SD %XMM1,%XMM12,%XMM1 |
0x4e8222 SETA %R13B | 0x4e8ac9 SETA %R13B | 0x491966 SETA %R13B |
0x4e8226 VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 0x4e8acd VADDSD -0x6515(%RIP),%XMM15,%XMM12 | 0x49196a VADDSD 0xf0(%RAX),%XMM15,%XMM12 |
0x4e822e MOVZX %R13B,%EDX | 0x4e8ad5 MOVZX %R13B,%EDX | 0x491972 MOVZX %R13B,%EDX |
0x4e8232 MOV $0x4,%R13D | 0x4e8ad9 MOV $0x4,%R13D | 0x491976 MOV $0x4,%R13D |
0x4e8238 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x4e8adf VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x49197c VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x4e823d VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 0x4e8ae4 VADDSD -0x652c(%RIP),%XMM15,%XMM1 | 0x491981 VADDSD 0xe8(%RAX),%XMM15,%XMM1 |
0x4e8245 VCOMISD %XMM3,%XMM2 | 0x4e8aec VCOMISD %XMM3,%XMM2 | 0x491989 VCOMISD %XMM3,%XMM2 |
0x4e8249 VMINSD %XMM2,%XMM3,%XMM0 | 0x4e8af0 VMINSD %XMM2,%XMM3,%XMM0 | 0x49198d VMINSD %XMM2,%XMM3,%XMM0 |
0x4e824d VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 0x4e8af4 VADDSD -0x653c(%RIP),%XMM14,%XMM2 | 0x491991 VADDSD 0xa8(%RAX),%XMM14,%XMM2 |
0x4e8255 VMULSD %XMM1,%XMM1,%XMM3 | 0x4e8afc VMULSD %XMM1,%XMM1,%XMM3 | 0x491999 VMULSD %XMM1,%XMM1,%XMM3 |
0x4e8259 VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 0x4e8b00 VADDSD -0x6548(%RIP),%XMM14,%XMM1 | 0x49199d VADDSD 0xb0(%RAX),%XMM14,%XMM1 |
0x4e8261 CMOVA %R15,%RDX | 0x4e8b08 CMOVA %R15,%RDX | 0x4919a5 CMOVA %R15,%RDX |
0x4e8265 VFMADD132SD %XMM2,%XMM3,%XMM2 | 0x4e8b0c VFMADD132SD %XMM2,%XMM3,%XMM2 | 0x4919a9 VFMADD132SD %XMM2,%XMM3,%XMM2 |
0x4e826a VFMADD132SD %XMM4,%XMM2,%XMM4 | 0x4e8b11 VFMADD132SD %XMM4,%XMM2,%XMM4 | 0x4919ae VFMADD132SD %XMM4,%XMM2,%XMM4 |
0x4e826f VADDSD 0x130(%RAX),%XMM13,%XMM2 | 0x4e8b16 VADDSD -0x655e(%RIP),%XMM13,%XMM2 | 0x4919b3 VADDSD 0x130(%RAX),%XMM13,%XMM2 |
0x4e8277 VADDSD 0x138(%RAX),%XMM13,%XMM3 | 0x4e8b1e VADDSD -0x6566(%RIP),%XMM13,%XMM3 | 0x4919bb VADDSD 0x138(%RAX),%XMM13,%XMM3 |
0x4e827f VCOMISD %XMM4,%XMM0 | 0x4e8b26 VCOMISD %XMM4,%XMM0 | 0x4919c3 VCOMISD %XMM4,%XMM0 |
0x4e8283 VMINSD %XMM0,%XMM4,%XMM0 | 0x4e8b2a VMINSD %XMM0,%XMM4,%XMM0 | 0x4919c7 VMINSD %XMM0,%XMM4,%XMM0 |
0x4e8287 VMULSD %XMM12,%XMM12,%XMM4 | 0x4e8b2e VMULSD %XMM12,%XMM12,%XMM4 | 0x4919cb VMULSD %XMM12,%XMM12,%XMM4 |
0x4e828c VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 0x4e8b33 VADDSD -0x657b(%RIP),%XMM15,%XMM12 | 0x4919d0 VADDSD 0xf8(%RAX),%XMM15,%XMM12 |
0x4e8294 CMOVA %R14,%RDX | 0x4e8b3b CMOVA %R14,%RDX | 0x4919d8 CMOVA %R14,%RDX |
0x4e8298 VFMADD132SD %XMM1,%XMM4,%XMM1 | 0x4e8b3f VFMADD132SD %XMM1,%XMM4,%XMM1 | 0x4919dc VFMADD132SD %XMM1,%XMM4,%XMM1 |
0x4e829d VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 0x4e8b44 VADDSD -0x658c(%RIP),%XMM14,%XMM4 | 0x4919e1 VADDSD 0xc0(%RAX),%XMM14,%XMM4 |
0x4e82a5 VFMADD132SD %XMM2,%XMM1,%XMM2 | 0x4e8b4c VFMADD132SD %XMM2,%XMM1,%XMM2 | 0x4919e9 VFMADD132SD %XMM2,%XMM1,%XMM2 |
0x4e82aa VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 0x4e8b51 VADDSD -0x6599(%RIP),%XMM14,%XMM1 | 0x4919ee VADDSD 0xb8(%RAX),%XMM14,%XMM1 |
0x4e82b2 VCOMISD %XMM2,%XMM0 | 0x4e8b59 VCOMISD %XMM2,%XMM0 | 0x4919f6 VCOMISD %XMM2,%XMM0 |
0x4e82b6 VMINSD %XMM0,%XMM2,%XMM0 | 0x4e8b5d VMINSD %XMM0,%XMM2,%XMM0 | 0x4919fa VMINSD %XMM0,%XMM2,%XMM0 |
0x4e82ba VMULSD %XMM12,%XMM12,%XMM2 | 0x4e8b61 VMULSD %XMM12,%XMM12,%XMM2 | 0x4919fe VMULSD %XMM12,%XMM12,%XMM2 |
0x4e82bf CMOVA %R13,%RDX | 0x4e8b66 CMOVA %R13,%RDX | 0x491a03 CMOVA %R13,%RDX |
0x4e82c3 MOV $0x5,%R13D | 0x4e8b6a MOV $0x5,%R13D | 0x491a07 MOV $0x5,%R13D |
0x4e82c9 VFMADD132SD %XMM1,%XMM2,%XMM1 | 0x4e8b70 VFMADD132SD %XMM1,%XMM2,%XMM1 | 0x491a0d VFMADD132SD %XMM1,%XMM2,%XMM1 |
0x4e82ce VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x4e8b75 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x491a12 VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x4e82d3 VADDSD 0x100(%RAX),%XMM15,%XMM1 | 0x4e8b7a VADDSD -0x65c2(%RIP),%XMM15,%XMM1 | 0x491a17 VADDSD 0x100(%RAX),%XMM15,%XMM1 |
0x4e82db VMULSD %XMM1,%XMM1,%XMM12 | 0x4e8b82 VMULSD %XMM1,%XMM1,%XMM12 | 0x491a1f VMULSD %XMM1,%XMM1,%XMM12 |
0x4e82df VADDSD 0x148(%RAX),%XMM13,%XMM1 | 0x4e8b86 VADDSD -0x65ce(%RIP),%XMM13,%XMM1 | 0x491a23 VADDSD 0x148(%RAX),%XMM13,%XMM1 |
0x4e82e7 VCOMISD %XMM3,%XMM0 | 0x4e8b8e VCOMISD %XMM3,%XMM0 | 0x491a2b VCOMISD %XMM3,%XMM0 |
0x4e82eb VMINSD %XMM0,%XMM3,%XMM0 | 0x4e8b92 VMINSD %XMM0,%XMM3,%XMM0 | 0x491a2f VMINSD %XMM0,%XMM3,%XMM0 |
0x4e82ef VADDSD 0x140(%RAX),%XMM13,%XMM3 | 0x4e8b96 VADDSD -0x65de(%RIP),%XMM13,%XMM3 | 0x491a33 VADDSD 0x140(%RAX),%XMM13,%XMM3 |
0x4e82f7 VFMADD132SD %XMM4,%XMM12,%XMM4 | 0x4e8b9e VFMADD132SD %XMM4,%XMM12,%XMM4 | 0x491a3b VFMADD132SD %XMM4,%XMM12,%XMM4 |
0x4e82fc CMOVA %R13,%RDX | 0x4e8ba3 CMOVA %R13,%RDX | 0x491a40 CMOVA %R13,%RDX |
0x4e8300 MOV $0x6,%R13D | 0x4e8ba7 MOV $0x6,%R13D | 0x491a44 MOV $0x6,%R13D |
0x4e8306 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x4e8bad VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x491a4a VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x4e830b VADDSD 0x108(%RAX),%XMM15,%XMM4 | 0x4e8bb2 VADDSD -0x65fa(%RIP),%XMM15,%XMM4 | 0x491a4f VADDSD 0x108(%RAX),%XMM15,%XMM4 |
0x4e8313 VMINSD %XMM0,%XMM3,%XMM2 | 0x4e8bba VMINSD %XMM0,%XMM3,%XMM2 | 0x491a57 VMINSD %XMM0,%XMM3,%XMM2 |
0x4e8317 VCOMISD %XMM3,%XMM0 | 0x4e8bbe VCOMISD %XMM3,%XMM0 | 0x491a5b VCOMISD %XMM3,%XMM0 |
0x4e831b VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 0x4e8bc2 VADDSD -0x660a(%RIP),%XMM14,%XMM0 | 0x491a5f VADDSD 0xc8(%RAX),%XMM14,%XMM0 |
0x4e8323 VMULSD %XMM4,%XMM4,%XMM3 | 0x4e8bca VMULSD %XMM4,%XMM4,%XMM3 | 0x491a67 VMULSD %XMM4,%XMM4,%XMM3 |
0x4e8327 CMOVA %R13,%RDX | 0x4e8bce CMOVA %R13,%RDX | 0x491a6b CMOVA %R13,%RDX |
0x4e832b MOV $0x7,%R13D | 0x4e8bd2 MOV $0x7,%R13D | 0x491a6f MOV $0x7,%R13D |
0x4e8331 VFMADD132SD %XMM0,%XMM3,%XMM0 | 0x4e8bd8 VFMADD132SD %XMM0,%XMM3,%XMM0 | 0x491a75 VFMADD132SD %XMM0,%XMM3,%XMM0 |
0x4e8336 VFMADD132SD %XMM1,%XMM0,%XMM1 | 0x4e8bdd VFMADD132SD %XMM1,%XMM0,%XMM1 | 0x491a7a VFMADD132SD %XMM1,%XMM0,%XMM1 |
0x4e833b VCOMISD %XMM1,%XMM2 | 0x4e8be2 VCOMISD %XMM1,%XMM2 | 0x491a7f VCOMISD %XMM1,%XMM2 |
0x4e833f VMINSD %XMM1,%XMM2,%XMM12 | 0x4e8be6 VMINSD %XMM1,%XMM2,%XMM12 | 0x491a83 VMINSD %XMM1,%XMM2,%XMM12 |
0x4e8343 CMOVA %R13,%RDX | 0x4e8bea CMOVA %R13,%RDX | 0x491a87 CMOVA %R13,%RDX |
0x4e8347 VSQRTSD %XMM12,%XMM12,%XMM12 | 0x4e8bee VSQRTSD -0x6676(%RIP),%XMM12,%XMM12 | 0x491a8b VSQRTSD %XMM12,%XMM12,%XMM12 |
0x4e834c LEA (%RAX,%RDX,8),%RDX | 0x4e8bf6 LEA (%RAX,%RDX,8),%RDX | 0x491a90 LEA (%RAX,%RDX,8),%RDX |
0x4e8350 VADDSD 0x90(%RDX),%XMM14,%XMM14 | 0x4e8bfa VADDSD -0x6642(%RIP),%XMM14,%XMM14 | 0x491a94 VADDSD 0x90(%RDX),%XMM14,%XMM14 |
0x4e8358 VMOVSD %XMM12,(%R8,%RCX,8) | 0x4e8c02 VMOVSD %XMM12,-0x65ca(%RIP) 0x4e8c0a NOP | 0x491a9c VMOVSD %XMM12,(%R8,%RCX,8) |
0x4e835e VMULSD %XMM11,%XMM14,%XMM2 | 0x4e8c0b VMULSD %XMM11,%XMM14,%XMM2 | 0x491aa2 VMULSD %XMM11,%XMM14,%XMM2 |
0x4e8363 VMOVSD %XMM2,(%R12,%RCX,8) | 0x4e8c10 VMOVSD %XMM2,-0x6598(%RIP) 0x4e8c18 NOP | 0x491aa7 VMOVSD %XMM2,(%R12,%RCX,8) |
0x4e8369 VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 0x4e8c19 VADDSD -0x6661(%RIP),%XMM15,%XMM15 | 0x491aad VADDSD 0xd0(%RDX),%XMM15,%XMM15 |
0x4e8371 VMULSD %XMM11,%XMM15,%XMM0 | 0x4e8c21 VMULSD %XMM11,%XMM15,%XMM0 | 0x491ab5 VMULSD %XMM11,%XMM15,%XMM0 |
0x4e8376 VMOVSD %XMM0,(%RBX,%RCX,8) | 0x4e8c26 VMOVSD %XMM0,-0x656e(%RIP) 0x4e8c2e NOP | 0x491aba VMOVSD %XMM0,(%RBX,%RCX,8) |
0x4e837b VADDSD 0x110(%RDX),%XMM13,%XMM13 | 0x4e8c2f VADDSD -0x6677(%RIP),%XMM13,%XMM13 | 0x491abf VADDSD 0x110(%RDX),%XMM13,%XMM13 |
0x4e8383 VMULSD %XMM11,%XMM13,%XMM11 | 0x4e8c37 VMULSD %XMM11,%XMM13,%XMM11 | 0x491ac7 VMULSD %XMM11,%XMM13,%XMM11 |
0x4e8388 VMOVSD %XMM11,(%RDI,%RCX,8) | 0x4e8c3c VMOVSD %XMM11,-0x6544(%RIP) 0x4e8c44 NOP | 0x491acc VMOVSD %XMM11,(%RDI,%RCX,8) |
0x4e838d INC %RCX | 0x4e8c45 INC %RCX | 0x491ad1 INC %RCX |
0x4e8390 CMP %RCX,%R9 | 0x4e8c48 CMP %RCX,%R9 | 0x491ad4 CMP %RCX,%R9 |
0x4e8393 JNE 4e80e4 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x56944> | 0x4e8c4b JNE 4e8958 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x571b8> | 0x491ad7 JNE 491830 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x90> |
Path / |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 66.18 | 65.75 | 66.07 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.38 | 33.88 | 32.88 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 10.50 | 9.50 | 9.50 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 32.36 | 31.42 | 32.74 |
LB full | 5.67 | 7.29 | 3.35 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 54.10 | 49.15 | 56.55 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 133.50 | 135.50 | 131.50 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 10.50 | 9.50 | 9.50 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 1119 | 1121 | 1107 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 66.21 | 65.88 | 66.52 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.50 | 34.00 | 33.00 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 11.00 | 10.00 | 10.00 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 32.26 | 31.43 | 33.06 |
LB full | 5.67 | 7.41 | 3.00 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 53.86 | 49.23 | 58.18 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 134.00 | 136.00 | 132.00 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 11.00 | 10.00 | 10.00 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 1119 | 1121 | 1107 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 66.15 | 65.61 | 65.62 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.25 | 33.75 | 32.75 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 10.00 | 9.00 | 9.00 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 32.45 | 31.41 | 32.41 |
LB full | 5.68 | 7.16 | 3.70 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 54.35 | 49.07 | 54.93 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 133.00 | 135.00 | 131.00 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 10.00 | 9.00 | 9.00 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 1119 | 1121 | 1107 |