Loop Id: 1106 | Module: exec | Source: ParticleBConds.h:188-217 | Coverage: 3.19% |
---|
Loop Id: 1106 | Module: exec | Source: ParticleBConds.h:188-217 | Coverage: 3.19% |
---|
0x4958b0 VMOVSD %XMM8,%XMM8,%XMM11 |
0x4958b5 CMP %ECX,0x28(%RBP) [9] |
0x4958b8 JLE 4958bf |
0x4958ba VMOVSD %XMM10,%XMM10,%XMM11 |
0x4958bf VMOVSD (%R11,%RCX,8),%XMM12 [4] |
0x4958c5 VMOVSD (%R10,%RCX,8),%XMM1 [6] |
0x4958cb VMOVSD (%RSI,%RCX,8),%XMM13 [5] |
0x4958d0 VSUBSD %XMM6,%XMM12,%XMM2 |
0x4958d4 VSUBSD %XMM5,%XMM1,%XMM0 |
0x4958d8 VSUBSD %XMM7,%XMM13,%XMM3 |
0x4958dc VMULSD %XMM11,%XMM2,%XMM15 |
0x4958e1 VMULSD %XMM11,%XMM0,%XMM14 |
0x4958e6 VMULSD %XMM11,%XMM3,%XMM12 |
0x4958eb VMULSD 0x20(%RAX),%XMM15,%XMM1 [1] |
0x4958f0 VMULSD 0x8(%RAX),%XMM15,%XMM4 [1] |
0x4958f5 VMULSD 0x38(%RAX),%XMM15,%XMM3 [1] |
0x4958fa VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 [1] |
0x495900 VFMADD231SD (%RAX),%XMM14,%XMM4 [1] |
0x495905 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 [1] |
0x49590b VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 [1] |
0x495911 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 [1] |
0x495917 VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 [1] |
0x49591d VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x495924 VXORPD %XMM9,%XMM0,%XMM2 |
0x495929 VMULSD 0x68(%RAX),%XMM2,%XMM0 [1] |
0x49592e VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
0x495935 VMULSD 0x50(%RAX),%XMM2,%XMM1 [1] |
0x49593a VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
0x495941 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 [1] |
0x495947 VMULSD 0x80(%RAX),%XMM2,%XMM2 [1] |
0x49594f VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 [1] |
0x495958 VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 [1] |
0x49595e VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 [1] |
0x495964 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 [1] |
0x49596a VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 [1] |
0x495970 VADDSD %XMM15,%XMM0,%XMM15 |
0x495975 VADDSD %XMM14,%XMM1,%XMM14 |
0x49597a VADDSD %XMM12,%XMM13,%XMM13 |
0x49597f VADDSD 0xd8(%RAX),%XMM15,%XMM12 [1] |
0x495987 VMULSD %XMM15,%XMM15,%XMM1 |
0x49598c VADDSD 0x98(%RAX),%XMM14,%XMM4 [1] |
0x495994 VADDSD 0x118(%RAX),%XMM13,%XMM3 [1] |
0x49599c VMULSD %XMM12,%XMM12,%XMM0 |
0x4959a1 VFMADD231SD %XMM14,%XMM14,%XMM1 |
0x4959a6 VFMADD132SD %XMM4,%XMM0,%XMM4 |
0x4959ab VFMADD231SD %XMM13,%XMM13,%XMM1 |
0x4959b0 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x4959b5 VADDSD 0xe0(%RAX),%XMM15,%XMM4 [1] |
0x4959bd VMULSD %XMM4,%XMM4,%XMM12 |
0x4959c1 VADDSD 0x128(%RAX),%XMM13,%XMM4 [1] |
0x4959c9 VMINSD %XMM3,%XMM1,%XMM2 |
0x4959cd VCOMISD %XMM3,%XMM1 |
0x4959d1 VADDSD 0xa0(%RAX),%XMM14,%XMM1 [1] |
0x4959d9 VADDSD 0x120(%RAX),%XMM13,%XMM3 [1] |
0x4959e1 VFMADD132SD %XMM1,%XMM12,%XMM1 |
0x4959e6 SETA %R13B |
0x4959ea VADDSD 0xf0(%RAX),%XMM15,%XMM12 [1] |
0x4959f2 MOVZX %R13B,%EDX |
0x4959f6 MOV $0x4,%R13D |
0x4959fc VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x495a01 VADDSD 0xe8(%RAX),%XMM15,%XMM1 [1] |
0x495a09 VCOMISD %XMM3,%XMM2 |
0x495a0d VMINSD %XMM2,%XMM3,%XMM0 |
0x495a11 VADDSD 0xa8(%RAX),%XMM14,%XMM2 [1] |
0x495a19 VMULSD %XMM1,%XMM1,%XMM3 |
0x495a1d VADDSD 0xb0(%RAX),%XMM14,%XMM1 [1] |
0x495a25 CMOVA %R15,%RDX |
0x495a29 VFMADD132SD %XMM2,%XMM3,%XMM2 |
0x495a2e VFMADD132SD %XMM4,%XMM2,%XMM4 |
0x495a33 VADDSD 0x130(%RAX),%XMM13,%XMM2 [1] |
0x495a3b VADDSD 0x138(%RAX),%XMM13,%XMM3 [1] |
0x495a43 VCOMISD %XMM4,%XMM0 |
0x495a47 VMINSD %XMM0,%XMM4,%XMM0 |
0x495a4b VMULSD %XMM12,%XMM12,%XMM4 |
0x495a50 VADDSD 0xf8(%RAX),%XMM15,%XMM12 [1] |
0x495a58 CMOVA %R14,%RDX |
0x495a5c VFMADD132SD %XMM1,%XMM4,%XMM1 |
0x495a61 VADDSD 0xc0(%RAX),%XMM14,%XMM4 [1] |
0x495a69 VFMADD132SD %XMM2,%XMM1,%XMM2 |
0x495a6e VADDSD 0xb8(%RAX),%XMM14,%XMM1 [1] |
0x495a76 VCOMISD %XMM2,%XMM0 |
0x495a7a VMINSD %XMM0,%XMM2,%XMM0 |
0x495a7e VMULSD %XMM12,%XMM12,%XMM2 |
0x495a83 CMOVA %R13,%RDX |
0x495a87 MOV $0x5,%R13D |
0x495a8d VFMADD132SD %XMM1,%XMM2,%XMM1 |
0x495a92 VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x495a97 VADDSD 0x100(%RAX),%XMM15,%XMM1 [1] |
0x495a9f VMULSD %XMM1,%XMM1,%XMM12 |
0x495aa3 VADDSD 0x148(%RAX),%XMM13,%XMM1 [1] |
0x495aab VCOMISD %XMM3,%XMM0 |
0x495aaf VMINSD %XMM0,%XMM3,%XMM0 |
0x495ab3 VADDSD 0x140(%RAX),%XMM13,%XMM3 [1] |
0x495abb VFMADD132SD %XMM4,%XMM12,%XMM4 |
0x495ac0 CMOVA %R13,%RDX |
0x495ac4 MOV $0x6,%R13D |
0x495aca VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x495acf VADDSD 0x108(%RAX),%XMM15,%XMM4 [1] |
0x495ad7 VMINSD %XMM0,%XMM3,%XMM2 |
0x495adb VCOMISD %XMM3,%XMM0 |
0x495adf VADDSD 0xc8(%RAX),%XMM14,%XMM0 [1] |
0x495ae7 VMULSD %XMM4,%XMM4,%XMM3 |
0x495aeb CMOVA %R13,%RDX |
0x495aef MOV $0x7,%R13D |
0x495af5 VFMADD132SD %XMM0,%XMM3,%XMM0 |
0x495afa VFMADD132SD %XMM1,%XMM0,%XMM1 |
0x495aff VCOMISD %XMM1,%XMM2 |
0x495b03 VMINSD %XMM1,%XMM2,%XMM12 |
0x495b07 CMOVA %R13,%RDX |
0x495b0b VSQRTSD %XMM12,%XMM12,%XMM12 |
0x495b10 LEA (%RAX,%RDX,8),%RDX |
0x495b14 VADDSD 0x90(%RDX),%XMM14,%XMM14 [8] |
0x495b1c VMOVSD %XMM12,(%R8,%RCX,8) [7] |
0x495b22 VMULSD %XMM11,%XMM14,%XMM2 |
0x495b27 VMOVSD %XMM2,(%R12,%RCX,8) [3] |
0x495b2d VADDSD 0xd0(%RDX),%XMM15,%XMM15 [8] |
0x495b35 VMULSD %XMM11,%XMM15,%XMM0 |
0x495b3a VMOVSD %XMM0,(%RBX,%RCX,8) [2] |
0x495b3f VADDSD 0x110(%RDX),%XMM13,%XMM13 [8] |
0x495b47 VMULSD %XMM11,%XMM13,%XMM11 |
0x495b4c VMOVSD %XMM11,(%RDI,%RCX,8) [10] |
0x495b51 INC %RCX |
0x495b54 CMP %RCX,%R9 |
0x495b57 JNE 4958b0 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds.h: 188 - 217 |
-------------------------------------------------------------------------------- |
188: const T flip = iat < flip_ind ? one : minusone; |
189: const T displ_0 = (px[iat] - x0) * flip; |
190: const T displ_1 = (py[iat] - y0) * flip; |
191: const T displ_2 = (pz[iat] - z0) * flip; |
192: |
193: const T ar_0 = -std::floor(displ_0 * g00 + displ_1 * g10 + displ_2 * g20); |
194: const T ar_1 = -std::floor(displ_0 * g01 + displ_1 * g11 + displ_2 * g21); |
195: const T ar_2 = -std::floor(displ_0 * g02 + displ_1 * g12 + displ_2 * g22); |
196: |
197: const T delx = displ_0 + ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
198: const T dely = displ_1 + ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
199: const T delz = displ_2 + ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
200: |
201: T rmin = delx * delx + dely * dely + delz * delz; |
202: int ic = 0; |
203: #pragma unroll(7) |
204: for (int c = 1; c < 8; ++c) |
205: { |
206: const T x = delx + cellx[c]; |
207: const T y = dely + celly[c]; |
208: const T z = delz + cellz[c]; |
209: const T r2 = x * x + y * y + z * z; |
210: ic = (r2 < rmin) ? c : ic; |
211: rmin = (r2 < rmin) ? r2 : rmin; |
212: } |
213: |
214: temp_r[iat] = std::sqrt(rmin); |
215: dx[iat] = flip * (delx + cellx[ic]); |
216: dy[iat] = flip * (dely + celly[ic]); |
217: dz[iat] = flip * (delz + cellz[ic]); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►31.30+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►28.41+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►17.98+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►16.37+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:114 | exec |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | exec |
○ | main._omp_fn.1 | stl_vector.h:1126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►5.94+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | exec |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | exec |
○ | main._omp_fn.1 | miniqmc.cpp:484 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.51 |
Bottlenecks | |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | multi-versioned |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 32.88 |
DIV/SQRT cycles | 4.50 - 6.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 9.50 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 66.11 - 66.07 |
Stall cycles (UFS) | 32.79 - 32.74 |
Nb insns | 123.50 |
Nb uops | 131.50 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.90 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.08 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.73 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.50 |
Bottlenecks | P0, P1, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | multi-versioned |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 33.00 |
DIV/SQRT cycles | 4.50 - 6.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 10.00 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 66.54 - 66.52 |
Stall cycles (UFS) | 33.09 - 33.06 |
Nb insns | 124.00 |
Nb uops | 132.00 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.90 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.00 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.75 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.51 |
Bottlenecks | P0, P1, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | multi-versioned |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 32.75 |
DIV/SQRT cycles | 4.50 - 6.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 9.00 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 65.68 - 65.62 |
Stall cycles (UFS) | 32.49 - 32.41 |
Nb insns | 123.00 |
Nb uops | 131.00 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.91 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.17 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.72 |
Path / |
nb instructions | 123.50 |
nb uops | 131.50 |
loop length | 682.50 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 15.50 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 32.88 cycles |
front end | 32.88 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.50 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.50 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 66.11-66.07 |
Stall cycles | 32.79-32.74 |
RS full (events) | 61.42-56.55 |
LB full (events) | 0.33-3.35 |
Front-end | 32.88 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 0.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
nb instructions | 124 |
nb uops | 132 |
loop length | 685 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 33.00 cycles |
front end | 33.00 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 10.00 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 10.00 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 66.54-66.52 |
Stall cycles | 33.09-33.06 |
RS full (events) | 61.85-58.18 |
LB full (events) | 0.37-3.00 |
Front-end | 33.00 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 0.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD %XMM8,%XMM8,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %ECX,0x28(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 4958bf <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x9f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD %XMM10,%XMM10,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD (%R11,%RCX,8),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R10,%RCX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RSI,%RCX,8),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM6,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM5,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM7,%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM2,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM3,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x20(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x8(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x38(%RAX),%XMM15,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VXORPD %XMM9,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULSD 0x68(%RAX),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMULSD 0x50(%RAX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x80(%RAX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM15,%XMM0,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM14,%XMM1,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM12,%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM15,%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x98(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x118(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM14,%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x128(%RAX),%XMM13,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x120(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SETA %R13B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVZX %R13B,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV $0x4,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM2,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R15,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM2,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM2,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x130(%RAX),%XMM13,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x138(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM4,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R14,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM1,%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM2,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM2,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x5,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x100(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x148(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x140(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x6,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x108(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM0,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x7,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM1,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM1,%XMM2,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VSQRTSD %XMM12,%XMM12,%XMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 4.50-6 |
LEA (%RAX,%RDX,8),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VADDSD 0x90(%RDX),%XMM14,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,(%R8,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM11,%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,(%R12,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM15,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,(%RBX,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0x110(%RDX),%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM11,(%RDI,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %RCX,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 4958b0 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
nb instructions | 123 |
nb uops | 131 |
loop length | 680 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 15 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 32.75 cycles |
front end | 32.75 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.00 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.00 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 65.68-65.62 |
Stall cycles | 32.49-32.41 |
RS full (events) | 60.99-54.93 |
LB full (events) | 0.28-3.70 |
Front-end | 32.75 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 0.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD %XMM8,%XMM8,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %ECX,0x28(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 4958bf <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x9f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD (%R11,%RCX,8),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R10,%RCX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RSI,%RCX,8),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM6,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM5,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM7,%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM2,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM3,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x20(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x8(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x38(%RAX),%XMM15,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VXORPD %XMM9,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULSD 0x68(%RAX),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMULSD 0x50(%RAX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x80(%RAX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM15,%XMM0,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM14,%XMM1,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM12,%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM15,%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x98(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x118(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM14,%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x128(%RAX),%XMM13,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x120(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SETA %R13B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVZX %R13B,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV $0x4,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM2,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R15,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM2,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM2,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x130(%RAX),%XMM13,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x138(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM4,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R14,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM1,%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM2,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM2,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x5,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x100(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x148(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x140(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x6,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x108(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM0,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x7,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM1,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM1,%XMM2,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VSQRTSD %XMM12,%XMM12,%XMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 4.50-6 |
LEA (%RAX,%RDX,8),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VADDSD 0x90(%RDX),%XMM14,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,(%R8,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM11,%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,(%R12,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM15,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,(%RBX,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0x110(%RDX),%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM11,(%RDI,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %RCX,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 4958b0 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x90> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Metric | run_0 |
---|---|
Coverage (% app. time) | 3.19 |
Time (s) | 3.11 |
Instance Count | 184020 |
Iteration Count - min | 512 |
Iteration Count - avg | 512 |
Iteration Count - max | 512 |
Cycles per Iteration - min | 70.68 |
Cycles per Iteration - avg | 71.38 |
Cycles per Iteration - max | 1632.19 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 99.91 |
Instance Count | 184020 |
ORIG CPI:min | 71.00 |
ORIG CPI:med | 72.40 |
ORIG CPI:max | 118.54 |
DL1 CPI:min | 71.00 |
DL1 CPI:med | 71.39 |
DL1 CPI:max | 73.86 |
ORIG (min) / DL1 (min) | 1.00 |
ORIG (med) / DL1 (med) | 1.01 |
ORIG (max) / DL1 (max) | 1.61 |
Nb Iteration:min | 512 |
Nb Iteration:med | 512.00 |
Nb Iteration:max | 512 |
ORIG: min (cycles) | 36350 |
ORIG: med (cycles) | 37070.00 |
ORIG: max (cycles) | 60692 |
DL1:min (cycles) | 36350 |
DL1:med (cycles) | 36554.00 |
DL1:max (cycles) | 37814 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 37070.00 | 37070.00 | 37070.00 | 37070.00 | 36350.00 | 37070.00 | 60692.00 | 36554.00 | 36554.00 | 36554.00 | 36554.00 | 36350.00 | 36554.00 | 37814.00 |
CPI MIN | 71.00 | 71.00 | ||||||||||||
CPI MED | 72.40 | 72.40 | 72.40 | 72.40 | 71.00 | 72.40 | 118.54 | 71.39 | 71.39 | 71.39 | 71.39 | 71.00 | 71.39 | 73.86 |
CPI AVG | 73.76 | 71.98 | ||||||||||||
CPI MAX | 118.54 | 73.86 | ||||||||||||
Iteration Count | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 | 512.00 |
ORIG | DL1 | Original Code |
---|---|---|
0x4e9947 ADDQ $0x1,-0x4ecf(%RIP) 0x4e994f VMOVSD %XMM8,%XMM8,%XMM11 | 0x4ea1bb VMOVSD %XMM8,%XMM8,%XMM11 | 0x4958b0 VMOVSD %XMM8,%XMM8,%XMM11 |
0x4e9954 CMP %ECX,0x28(%RBP) | 0x4ea1c0 CMP %ECX,-0x7cc6(%RIP) | 0x4958b5 CMP %ECX,0x28(%RBP) |
0x4e9957 JLE 4e995e <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x581be> | 0x4ea1c6 JLE 4ea1cd <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x58a2d> | 0x4958b8 JLE 4958bf <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x9f> |
0x4e9959 VMOVSD %XMM10,%XMM10,%XMM11 | 0x4ea1c8 VMOVSD %XMM10,%XMM10,%XMM11 | 0x4958ba VMOVSD %XMM10,%XMM10,%XMM11 |
0x4e995e VMOVSD (%R11,%RCX,8),%XMM12 | 0x4ea1cd VMOVSD -0x7555(%RIP),%XMM12 | 0x4958bf VMOVSD (%R11,%RCX,8),%XMM12 |
0x4e9964 VMOVSD (%R10,%RCX,8),%XMM1 | 0x4ea1d5 VMOVSD -0x755d(%RIP),%XMM1 | 0x4958c5 VMOVSD (%R10,%RCX,8),%XMM1 |
0x4e996a VMOVSD (%RSI,%RCX,8),%XMM13 | 0x4ea1dd VMOVSD -0x7565(%RIP),%XMM13 | 0x4958cb VMOVSD (%RSI,%RCX,8),%XMM13 |
0x4e996f VSUBSD %XMM6,%XMM12,%XMM2 | 0x4ea1e5 VSUBSD %XMM6,%XMM12,%XMM2 | 0x4958d0 VSUBSD %XMM6,%XMM12,%XMM2 |
0x4e9973 VSUBSD %XMM5,%XMM1,%XMM0 | 0x4ea1e9 VSUBSD %XMM5,%XMM1,%XMM0 | 0x4958d4 VSUBSD %XMM5,%XMM1,%XMM0 |
0x4e9977 VSUBSD %XMM7,%XMM13,%XMM3 | 0x4ea1ed VSUBSD %XMM7,%XMM13,%XMM3 | 0x4958d8 VSUBSD %XMM7,%XMM13,%XMM3 |
0x4e997b VMULSD %XMM11,%XMM2,%XMM15 | 0x4ea1f1 VMULSD %XMM11,%XMM2,%XMM15 | 0x4958dc VMULSD %XMM11,%XMM2,%XMM15 |
0x4e9980 VMULSD %XMM11,%XMM0,%XMM14 | 0x4ea1f6 VMULSD %XMM11,%XMM0,%XMM14 | 0x4958e1 VMULSD %XMM11,%XMM0,%XMM14 |
0x4e9985 VMULSD %XMM11,%XMM3,%XMM12 | 0x4ea1fb VMULSD %XMM11,%XMM3,%XMM12 | 0x4958e6 VMULSD %XMM11,%XMM3,%XMM12 |
0x4e998a VMULSD 0x20(%RAX),%XMM15,%XMM1 | 0x4ea200 VMULSD -0x7588(%RIP),%XMM15,%XMM1 | 0x4958eb VMULSD 0x20(%RAX),%XMM15,%XMM1 |
0x4e998f VMULSD 0x8(%RAX),%XMM15,%XMM4 | 0x4ea208 VMULSD -0x7590(%RIP),%XMM15,%XMM4 | 0x4958f0 VMULSD 0x8(%RAX),%XMM15,%XMM4 |
0x4e9994 VMULSD 0x38(%RAX),%XMM15,%XMM3 | 0x4ea210 VMULSD -0x7598(%RIP),%XMM15,%XMM3 | 0x4958f5 VMULSD 0x38(%RAX),%XMM15,%XMM3 |
0x4e9999 VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 0x4ea218 VFMADD231SD -0x75a1(%RIP),%XMM14,%XMM1 | 0x4958fa VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 |
0x4e999f VFMADD231SD (%RAX),%XMM14,%XMM4 | 0x4ea221 VFMADD231SD -0x75aa(%RIP),%XMM14,%XMM4 | 0x495900 VFMADD231SD (%RAX),%XMM14,%XMM4 |
0x4e99a4 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 0x4ea22a VFMADD231SD -0x75b3(%RIP),%XMM14,%XMM3 | 0x495905 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 |
0x4e99aa VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 0x4ea233 VFMADD231SD -0x75bc(%RIP),%XMM12,%XMM1 | 0x49590b VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 |
0x4e99b0 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 0x4ea23c VFMADD231SD -0x75c5(%RIP),%XMM12,%XMM4 | 0x495911 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 |
0x4e99b6 VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 0x4ea245 VFMADD231SD -0x75ce(%RIP),%XMM12,%XMM3 | 0x495917 VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 |
0x4e99bc VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 0x4ea24e VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 0x49591d VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x4e99c3 VXORPD %XMM9,%XMM0,%XMM2 | 0x4ea255 VXORPD %XMM9,%XMM0,%XMM2 | 0x495924 VXORPD %XMM9,%XMM0,%XMM2 |
0x4e99c8 VMULSD 0x68(%RAX),%XMM2,%XMM0 | 0x4ea25a VMULSD -0x75e2(%RIP),%XMM2,%XMM0 | 0x495929 VMULSD 0x68(%RAX),%XMM2,%XMM0 |
0x4e99cd VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 0x4ea262 VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 0x49592e VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
0x4e99d4 VMULSD 0x50(%RAX),%XMM2,%XMM1 | 0x4ea269 VMULSD -0x75f1(%RIP),%XMM2,%XMM1 | 0x495935 VMULSD 0x50(%RAX),%XMM2,%XMM1 |
0x4e99d9 VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 0x4ea271 VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 0x49593a VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
0x4e99e0 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 0x4ea278 VFNMADD231SD -0x7601(%RIP),%XMM4,%XMM15 | 0x495941 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 |
0x4e99e6 VMULSD 0x80(%RAX),%XMM2,%XMM2 | 0x4ea281 VMULSD -0x7609(%RIP),%XMM2,%XMM2 | 0x495947 VMULSD 0x80(%RAX),%XMM2,%XMM2 |
0x4e99ee VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 0x4ea289 VFNMADD231SD -0x7612(%RIP),%XMM4,%XMM12 | 0x49594f VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 |
0x4e99f7 VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 0x4ea292 VFNMADD231SD -0x761b(%RIP),%XMM4,%XMM14 | 0x495958 VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 |
0x4e99fd VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 0x4ea29b VFNMADD231SD -0x7624(%RIP),%XMM13,%XMM0 | 0x49595e VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 |
0x4e9a03 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 0x4ea2a4 VFNMADD231SD -0x762d(%RIP),%XMM13,%XMM1 | 0x495964 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 |
0x4e9a09 VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 0x4ea2ad VFNMADD132SD -0x7636(%RIP),%XMM2,%XMM13 | 0x49596a VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 |
0x4e9a0f VADDSD %XMM15,%XMM0,%XMM15 | 0x4ea2b6 VADDSD %XMM15,%XMM0,%XMM15 | 0x495970 VADDSD %XMM15,%XMM0,%XMM15 |
0x4e9a14 VADDSD %XMM14,%XMM1,%XMM14 | 0x4ea2bb VADDSD %XMM14,%XMM1,%XMM14 | 0x495975 VADDSD %XMM14,%XMM1,%XMM14 |
0x4e9a19 VADDSD %XMM12,%XMM13,%XMM13 | 0x4ea2c0 VADDSD %XMM12,%XMM13,%XMM13 | 0x49597a VADDSD %XMM12,%XMM13,%XMM13 |
0x4e9a1e VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 0x4ea2c5 VADDSD -0x764d(%RIP),%XMM15,%XMM12 | 0x49597f VADDSD 0xd8(%RAX),%XMM15,%XMM12 |
0x4e9a26 VMULSD %XMM15,%XMM15,%XMM1 | 0x4ea2cd VMULSD %XMM15,%XMM15,%XMM1 | 0x495987 VMULSD %XMM15,%XMM15,%XMM1 |
0x4e9a2b VADDSD 0x98(%RAX),%XMM14,%XMM4 | 0x4ea2d2 VADDSD -0x765a(%RIP),%XMM14,%XMM4 | 0x49598c VADDSD 0x98(%RAX),%XMM14,%XMM4 |
0x4e9a33 VADDSD 0x118(%RAX),%XMM13,%XMM3 | 0x4ea2da VADDSD -0x7662(%RIP),%XMM13,%XMM3 | 0x495994 VADDSD 0x118(%RAX),%XMM13,%XMM3 |
0x4e9a3b VMULSD %XMM12,%XMM12,%XMM0 | 0x4ea2e2 VMULSD %XMM12,%XMM12,%XMM0 | 0x49599c VMULSD %XMM12,%XMM12,%XMM0 |
0x4e9a40 VFMADD231SD %XMM14,%XMM14,%XMM1 | 0x4ea2e7 VFMADD231SD %XMM14,%XMM14,%XMM1 | 0x4959a1 VFMADD231SD %XMM14,%XMM14,%XMM1 |
0x4e9a45 VFMADD132SD %XMM4,%XMM0,%XMM4 | 0x4ea2ec VFMADD132SD %XMM4,%XMM0,%XMM4 | 0x4959a6 VFMADD132SD %XMM4,%XMM0,%XMM4 |
0x4e9a4a VFMADD231SD %XMM13,%XMM13,%XMM1 | 0x4ea2f1 VFMADD231SD %XMM13,%XMM13,%XMM1 | 0x4959ab VFMADD231SD %XMM13,%XMM13,%XMM1 |
0x4e9a4f VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x4ea2f6 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x4959b0 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x4e9a54 VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 0x4ea2fb VADDSD -0x7683(%RIP),%XMM15,%XMM4 | 0x4959b5 VADDSD 0xe0(%RAX),%XMM15,%XMM4 |
0x4e9a5c VMULSD %XMM4,%XMM4,%XMM12 | 0x4ea303 VMULSD %XMM4,%XMM4,%XMM12 | 0x4959bd VMULSD %XMM4,%XMM4,%XMM12 |
0x4e9a60 VADDSD 0x128(%RAX),%XMM13,%XMM4 | 0x4ea307 VADDSD -0x768f(%RIP),%XMM13,%XMM4 | 0x4959c1 VADDSD 0x128(%RAX),%XMM13,%XMM4 |
0x4e9a68 VMINSD %XMM3,%XMM1,%XMM2 | 0x4ea30f VMINSD %XMM3,%XMM1,%XMM2 | 0x4959c9 VMINSD %XMM3,%XMM1,%XMM2 |
0x4e9a6c VCOMISD %XMM3,%XMM1 | 0x4ea313 VCOMISD %XMM3,%XMM1 | 0x4959cd VCOMISD %XMM3,%XMM1 |
0x4e9a70 VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 0x4ea317 VADDSD -0x769f(%RIP),%XMM14,%XMM1 | 0x4959d1 VADDSD 0xa0(%RAX),%XMM14,%XMM1 |
0x4e9a78 VADDSD 0x120(%RAX),%XMM13,%XMM3 | 0x4ea31f VADDSD -0x76a7(%RIP),%XMM13,%XMM3 | 0x4959d9 VADDSD 0x120(%RAX),%XMM13,%XMM3 |
0x4e9a80 VFMADD132SD %XMM1,%XMM12,%XMM1 | 0x4ea327 VFMADD132SD %XMM1,%XMM12,%XMM1 | 0x4959e1 VFMADD132SD %XMM1,%XMM12,%XMM1 |
0x4e9a85 SETA %R13B | 0x4ea32c SETA %R13B | 0x4959e6 SETA %R13B |
0x4e9a89 VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 0x4ea330 VADDSD -0x76b8(%RIP),%XMM15,%XMM12 | 0x4959ea VADDSD 0xf0(%RAX),%XMM15,%XMM12 |
0x4e9a91 MOVZX %R13B,%EDX | 0x4ea338 MOVZX %R13B,%EDX | 0x4959f2 MOVZX %R13B,%EDX |
0x4e9a95 MOV $0x4,%R13D | 0x4ea33c MOV $0x4,%R13D | 0x4959f6 MOV $0x4,%R13D |
0x4e9a9b VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x4ea342 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x4959fc VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x4e9aa0 VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 0x4ea347 VADDSD -0x76cf(%RIP),%XMM15,%XMM1 | 0x495a01 VADDSD 0xe8(%RAX),%XMM15,%XMM1 |
0x4e9aa8 VCOMISD %XMM3,%XMM2 | 0x4ea34f VCOMISD %XMM3,%XMM2 | 0x495a09 VCOMISD %XMM3,%XMM2 |
0x4e9aac VMINSD %XMM2,%XMM3,%XMM0 | 0x4ea353 VMINSD %XMM2,%XMM3,%XMM0 | 0x495a0d VMINSD %XMM2,%XMM3,%XMM0 |
0x4e9ab0 VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 0x4ea357 VADDSD -0x76df(%RIP),%XMM14,%XMM2 | 0x495a11 VADDSD 0xa8(%RAX),%XMM14,%XMM2 |
0x4e9ab8 VMULSD %XMM1,%XMM1,%XMM3 | 0x4ea35f VMULSD %XMM1,%XMM1,%XMM3 | 0x495a19 VMULSD %XMM1,%XMM1,%XMM3 |
0x4e9abc VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 0x4ea363 VADDSD -0x76eb(%RIP),%XMM14,%XMM1 | 0x495a1d VADDSD 0xb0(%RAX),%XMM14,%XMM1 |
0x4e9ac4 CMOVA %R15,%RDX | 0x4ea36b CMOVA %R15,%RDX | 0x495a25 CMOVA %R15,%RDX |
0x4e9ac8 VFMADD132SD %XMM2,%XMM3,%XMM2 | 0x4ea36f VFMADD132SD %XMM2,%XMM3,%XMM2 | 0x495a29 VFMADD132SD %XMM2,%XMM3,%XMM2 |
0x4e9acd VFMADD132SD %XMM4,%XMM2,%XMM4 | 0x4ea374 VFMADD132SD %XMM4,%XMM2,%XMM4 | 0x495a2e VFMADD132SD %XMM4,%XMM2,%XMM4 |
0x4e9ad2 VADDSD 0x130(%RAX),%XMM13,%XMM2 | 0x4ea379 VADDSD -0x7701(%RIP),%XMM13,%XMM2 | 0x495a33 VADDSD 0x130(%RAX),%XMM13,%XMM2 |
0x4e9ada VADDSD 0x138(%RAX),%XMM13,%XMM3 | 0x4ea381 VADDSD -0x7709(%RIP),%XMM13,%XMM3 | 0x495a3b VADDSD 0x138(%RAX),%XMM13,%XMM3 |
0x4e9ae2 VCOMISD %XMM4,%XMM0 | 0x4ea389 VCOMISD %XMM4,%XMM0 | 0x495a43 VCOMISD %XMM4,%XMM0 |
0x4e9ae6 VMINSD %XMM0,%XMM4,%XMM0 | 0x4ea38d VMINSD %XMM0,%XMM4,%XMM0 | 0x495a47 VMINSD %XMM0,%XMM4,%XMM0 |
0x4e9aea VMULSD %XMM12,%XMM12,%XMM4 | 0x4ea391 VMULSD %XMM12,%XMM12,%XMM4 | 0x495a4b VMULSD %XMM12,%XMM12,%XMM4 |
0x4e9aef VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 0x4ea396 VADDSD -0x771e(%RIP),%XMM15,%XMM12 | 0x495a50 VADDSD 0xf8(%RAX),%XMM15,%XMM12 |
0x4e9af7 CMOVA %R14,%RDX | 0x4ea39e CMOVA %R14,%RDX | 0x495a58 CMOVA %R14,%RDX |
0x4e9afb VFMADD132SD %XMM1,%XMM4,%XMM1 | 0x4ea3a2 VFMADD132SD %XMM1,%XMM4,%XMM1 | 0x495a5c VFMADD132SD %XMM1,%XMM4,%XMM1 |
0x4e9b00 VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 0x4ea3a7 VADDSD -0x772f(%RIP),%XMM14,%XMM4 | 0x495a61 VADDSD 0xc0(%RAX),%XMM14,%XMM4 |
0x4e9b08 VFMADD132SD %XMM2,%XMM1,%XMM2 | 0x4ea3af VFMADD132SD %XMM2,%XMM1,%XMM2 | 0x495a69 VFMADD132SD %XMM2,%XMM1,%XMM2 |
0x4e9b0d VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 0x4ea3b4 VADDSD -0x773c(%RIP),%XMM14,%XMM1 | 0x495a6e VADDSD 0xb8(%RAX),%XMM14,%XMM1 |
0x4e9b15 VCOMISD %XMM2,%XMM0 | 0x4ea3bc VCOMISD %XMM2,%XMM0 | 0x495a76 VCOMISD %XMM2,%XMM0 |
0x4e9b19 VMINSD %XMM0,%XMM2,%XMM0 | 0x4ea3c0 VMINSD %XMM0,%XMM2,%XMM0 | 0x495a7a VMINSD %XMM0,%XMM2,%XMM0 |
0x4e9b1d VMULSD %XMM12,%XMM12,%XMM2 | 0x4ea3c4 VMULSD %XMM12,%XMM12,%XMM2 | 0x495a7e VMULSD %XMM12,%XMM12,%XMM2 |
0x4e9b22 CMOVA %R13,%RDX | 0x4ea3c9 CMOVA %R13,%RDX | 0x495a83 CMOVA %R13,%RDX |
0x4e9b26 MOV $0x5,%R13D | 0x4ea3cd MOV $0x5,%R13D | 0x495a87 MOV $0x5,%R13D |
0x4e9b2c VFMADD132SD %XMM1,%XMM2,%XMM1 | 0x4ea3d3 VFMADD132SD %XMM1,%XMM2,%XMM1 | 0x495a8d VFMADD132SD %XMM1,%XMM2,%XMM1 |
0x4e9b31 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x4ea3d8 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x495a92 VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x4e9b36 VADDSD 0x100(%RAX),%XMM15,%XMM1 | 0x4ea3dd VADDSD -0x7765(%RIP),%XMM15,%XMM1 | 0x495a97 VADDSD 0x100(%RAX),%XMM15,%XMM1 |
0x4e9b3e VMULSD %XMM1,%XMM1,%XMM12 | 0x4ea3e5 VMULSD %XMM1,%XMM1,%XMM12 | 0x495a9f VMULSD %XMM1,%XMM1,%XMM12 |
0x4e9b42 VADDSD 0x148(%RAX),%XMM13,%XMM1 | 0x4ea3e9 VADDSD -0x7771(%RIP),%XMM13,%XMM1 | 0x495aa3 VADDSD 0x148(%RAX),%XMM13,%XMM1 |
0x4e9b4a VCOMISD %XMM3,%XMM0 | 0x4ea3f1 VCOMISD %XMM3,%XMM0 | 0x495aab VCOMISD %XMM3,%XMM0 |
0x4e9b4e VMINSD %XMM0,%XMM3,%XMM0 | 0x4ea3f5 VMINSD %XMM0,%XMM3,%XMM0 | 0x495aaf VMINSD %XMM0,%XMM3,%XMM0 |
0x4e9b52 VADDSD 0x140(%RAX),%XMM13,%XMM3 | 0x4ea3f9 VADDSD -0x7781(%RIP),%XMM13,%XMM3 | 0x495ab3 VADDSD 0x140(%RAX),%XMM13,%XMM3 |
0x4e9b5a VFMADD132SD %XMM4,%XMM12,%XMM4 | 0x4ea401 VFMADD132SD %XMM4,%XMM12,%XMM4 | 0x495abb VFMADD132SD %XMM4,%XMM12,%XMM4 |
0x4e9b5f CMOVA %R13,%RDX | 0x4ea406 CMOVA %R13,%RDX | 0x495ac0 CMOVA %R13,%RDX |
0x4e9b63 MOV $0x6,%R13D | 0x4ea40a MOV $0x6,%R13D | 0x495ac4 MOV $0x6,%R13D |
0x4e9b69 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x4ea410 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x495aca VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x4e9b6e VADDSD 0x108(%RAX),%XMM15,%XMM4 | 0x4ea415 VADDSD -0x779d(%RIP),%XMM15,%XMM4 | 0x495acf VADDSD 0x108(%RAX),%XMM15,%XMM4 |
0x4e9b76 VMINSD %XMM0,%XMM3,%XMM2 | 0x4ea41d VMINSD %XMM0,%XMM3,%XMM2 | 0x495ad7 VMINSD %XMM0,%XMM3,%XMM2 |
0x4e9b7a VCOMISD %XMM3,%XMM0 | 0x4ea421 VCOMISD %XMM3,%XMM0 | 0x495adb VCOMISD %XMM3,%XMM0 |
0x4e9b7e VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 0x4ea425 VADDSD -0x77ad(%RIP),%XMM14,%XMM0 | 0x495adf VADDSD 0xc8(%RAX),%XMM14,%XMM0 |
0x4e9b86 VMULSD %XMM4,%XMM4,%XMM3 | 0x4ea42d VMULSD %XMM4,%XMM4,%XMM3 | 0x495ae7 VMULSD %XMM4,%XMM4,%XMM3 |
0x4e9b8a CMOVA %R13,%RDX | 0x4ea431 CMOVA %R13,%RDX | 0x495aeb CMOVA %R13,%RDX |
0x4e9b8e MOV $0x7,%R13D | 0x4ea435 MOV $0x7,%R13D | 0x495aef MOV $0x7,%R13D |
0x4e9b94 VFMADD132SD %XMM0,%XMM3,%XMM0 | 0x4ea43b VFMADD132SD %XMM0,%XMM3,%XMM0 | 0x495af5 VFMADD132SD %XMM0,%XMM3,%XMM0 |
0x4e9b99 VFMADD132SD %XMM1,%XMM0,%XMM1 | 0x4ea440 VFMADD132SD %XMM1,%XMM0,%XMM1 | 0x495afa VFMADD132SD %XMM1,%XMM0,%XMM1 |
0x4e9b9e VCOMISD %XMM1,%XMM2 | 0x4ea445 VCOMISD %XMM1,%XMM2 | 0x495aff VCOMISD %XMM1,%XMM2 |
0x4e9ba2 VMINSD %XMM1,%XMM2,%XMM12 | 0x4ea449 VMINSD %XMM1,%XMM2,%XMM12 | 0x495b03 VMINSD %XMM1,%XMM2,%XMM12 |
0x4e9ba6 CMOVA %R13,%RDX | 0x4ea44d CMOVA %R13,%RDX | 0x495b07 CMOVA %R13,%RDX |
0x4e9baa VSQRTSD %XMM12,%XMM12,%XMM12 | 0x4ea451 VSQRTSD -0x7819(%RIP),%XMM12,%XMM12 | 0x495b0b VSQRTSD %XMM12,%XMM12,%XMM12 |
0x4e9baf LEA (%RAX,%RDX,8),%RDX | 0x4ea459 LEA (%RAX,%RDX,8),%RDX | 0x495b10 LEA (%RAX,%RDX,8),%RDX |
0x4e9bb3 VADDSD 0x90(%RDX),%XMM14,%XMM14 | 0x4ea45d VADDSD -0x77e5(%RIP),%XMM14,%XMM14 | 0x495b14 VADDSD 0x90(%RDX),%XMM14,%XMM14 |
0x4e9bbb VMOVSD %XMM12,(%R8,%RCX,8) | 0x4ea465 VMOVSD %XMM12,-0x776d(%RIP) 0x4ea46d NOP | 0x495b1c VMOVSD %XMM12,(%R8,%RCX,8) |
0x4e9bc1 VMULSD %XMM11,%XMM14,%XMM2 | 0x4ea46e VMULSD %XMM11,%XMM14,%XMM2 | 0x495b22 VMULSD %XMM11,%XMM14,%XMM2 |
0x4e9bc6 VMOVSD %XMM2,(%R12,%RCX,8) | 0x4ea473 VMOVSD %XMM2,-0x773b(%RIP) 0x4ea47b NOP | 0x495b27 VMOVSD %XMM2,(%R12,%RCX,8) |
0x4e9bcc VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 0x4ea47c VADDSD -0x7804(%RIP),%XMM15,%XMM15 | 0x495b2d VADDSD 0xd0(%RDX),%XMM15,%XMM15 |
0x4e9bd4 VMULSD %XMM11,%XMM15,%XMM0 | 0x4ea484 VMULSD %XMM11,%XMM15,%XMM0 | 0x495b35 VMULSD %XMM11,%XMM15,%XMM0 |
0x4e9bd9 VMOVSD %XMM0,(%RBX,%RCX,8) | 0x4ea489 VMOVSD %XMM0,-0x7711(%RIP) 0x4ea491 NOP | 0x495b3a VMOVSD %XMM0,(%RBX,%RCX,8) |
0x4e9bde VADDSD 0x110(%RDX),%XMM13,%XMM13 | 0x4ea492 VADDSD -0x781a(%RIP),%XMM13,%XMM13 | 0x495b3f VADDSD 0x110(%RDX),%XMM13,%XMM13 |
0x4e9be6 VMULSD %XMM11,%XMM13,%XMM11 | 0x4ea49a VMULSD %XMM11,%XMM13,%XMM11 | 0x495b47 VMULSD %XMM11,%XMM13,%XMM11 |
0x4e9beb VMOVSD %XMM11,(%RDI,%RCX,8) | 0x4ea49f VMOVSD %XMM11,-0x76e7(%RIP) 0x4ea4a7 NOP | 0x495b4c VMOVSD %XMM11,(%RDI,%RCX,8) |
0x4e9bf0 INC %RCX | 0x4ea4a8 INC %RCX | 0x495b51 INC %RCX |
0x4e9bf3 CMP %RCX,%R9 | 0x4ea4ab CMP %RCX,%R9 | 0x495b54 CMP %RCX,%R9 |
0x4e9bf6 JNE 4e9947 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x581a7> | 0x4ea4ae JNE 4ea1bb <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x58a1b> | 0x495b57 JNE 4958b0 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.isra.0+0x90> |
Path / |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 66.18 | 65.75 | 66.07 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.38 | 33.88 | 32.88 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 10.50 | 9.50 | 9.50 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 32.36 | 31.42 | 32.74 |
LB full | 5.67 | 7.29 | 3.35 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 54.10 | 49.15 | 56.55 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 133.50 | 135.50 | 131.50 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 10.50 | 9.50 | 9.50 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 1114 | 1116 | 1106 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 66.21 | 65.88 | 66.52 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.50 | 34.00 | 33.00 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 11.00 | 10.00 | 10.00 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 32.26 | 31.43 | 33.06 |
LB full | 5.67 | 7.41 | 3.00 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 53.86 | 49.23 | 58.18 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 134.00 | 136.00 | 132.00 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 11.00 | 10.00 | 10.00 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 1114 | 1116 | 1106 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 66.15 | 65.61 | 65.62 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.25 | 33.75 | 32.75 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 10.00 | 9.00 | 9.00 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 32.45 | 31.41 | 32.41 |
LB full | 5.68 | 7.16 | 3.70 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 54.35 | 49.07 | 54.93 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 133.00 | 135.00 | 131.00 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 10.00 | 9.00 | 9.00 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 1114 | 1116 | 1106 |