Loop Id: 167 | Module: libqmcparticle.so | Source: ParticleBConds.h:188-217 | Coverage: 52.75% |
---|
Loop Id: 167 | Module: libqmcparticle.so | Source: ParticleBConds.h:188-217 | Coverage: 52.75% |
---|
0x1b6a0 VMOVSD %XMM8,%XMM8,%XMM11 |
0x1b6a5 CMP %ECX,0x10(%RBP) [7] |
0x1b6a8 JLE 1b6af |
0x1b6aa VMOVSD %XMM10,%XMM10,%XMM11 |
0x1b6af VMOVSD (%RBX,%RCX,8),%XMM12 [2] |
0x1b6b4 VMOVSD (%R11,%RCX,8),%XMM1 [8] |
0x1b6ba MOV $0x4,%R13D |
0x1b6c0 VMOVSD (%RSI,%RCX,8),%XMM13 [3] |
0x1b6c5 VSUBSD %XMM6,%XMM12,%XMM2 |
0x1b6c9 VSUBSD %XMM5,%XMM1,%XMM0 |
0x1b6cd VSUBSD %XMM7,%XMM13,%XMM3 |
0x1b6d1 VMULSD %XMM11,%XMM2,%XMM15 |
0x1b6d6 VMULSD %XMM11,%XMM0,%XMM14 |
0x1b6db VMULSD %XMM11,%XMM3,%XMM12 |
0x1b6e0 VMULSD 0x20(%RAX),%XMM15,%XMM1 [4] |
0x1b6e5 VMULSD 0x8(%RAX),%XMM15,%XMM4 [4] |
0x1b6ea VMULSD 0x38(%RAX),%XMM15,%XMM3 [4] |
0x1b6ef VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 [4] |
0x1b6f5 VFMADD231SD (%RAX),%XMM14,%XMM4 [4] |
0x1b6fa VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 [4] |
0x1b700 VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 [4] |
0x1b706 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 [4] |
0x1b70c VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 [4] |
0x1b712 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x1b719 VXORPD %XMM9,%XMM0,%XMM2 |
0x1b71e VMULSD 0x68(%RAX),%XMM2,%XMM0 [4] |
0x1b723 VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
0x1b72a VMULSD 0x50(%RAX),%XMM2,%XMM1 [4] |
0x1b72f VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
0x1b736 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 [4] |
0x1b73c VMULSD 0x80(%RAX),%XMM2,%XMM2 [4] |
0x1b744 VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 [4] |
0x1b74d VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 [4] |
0x1b753 VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 [4] |
0x1b759 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 [4] |
0x1b75f VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 [4] |
0x1b765 VADDSD %XMM15,%XMM0,%XMM15 |
0x1b76a VADDSD %XMM14,%XMM1,%XMM14 |
0x1b76f VADDSD %XMM12,%XMM13,%XMM13 |
0x1b774 VADDSD 0xd8(%RAX),%XMM15,%XMM12 [4] |
0x1b77c VMULSD %XMM15,%XMM15,%XMM1 |
0x1b781 VADDSD 0x98(%RAX),%XMM14,%XMM4 [4] |
0x1b789 VADDSD 0x118(%RAX),%XMM13,%XMM3 [4] |
0x1b791 VMULSD %XMM12,%XMM12,%XMM0 |
0x1b796 VFMADD231SD %XMM14,%XMM14,%XMM1 |
0x1b79b VFMADD132SD %XMM4,%XMM0,%XMM4 |
0x1b7a0 VFMADD231SD %XMM13,%XMM13,%XMM1 |
0x1b7a5 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x1b7aa VADDSD 0xe0(%RAX),%XMM15,%XMM4 [4] |
0x1b7b2 VMULSD %XMM4,%XMM4,%XMM12 |
0x1b7b6 VADDSD 0x128(%RAX),%XMM13,%XMM4 [4] |
0x1b7be VMINSD %XMM3,%XMM1,%XMM2 |
0x1b7c2 VCOMISD %XMM3,%XMM1 |
0x1b7c6 VADDSD 0xa0(%RAX),%XMM14,%XMM1 [4] |
0x1b7ce VADDSD 0x120(%RAX),%XMM13,%XMM3 [4] |
0x1b7d6 VFMADD132SD %XMM1,%XMM12,%XMM1 |
0x1b7db SETA %DL |
0x1b7de VADDSD 0xf0(%RAX),%XMM15,%XMM12 [4] |
0x1b7e6 MOVZX %DL,%EDX |
0x1b7e9 VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x1b7ee VADDSD 0xe8(%RAX),%XMM15,%XMM1 [4] |
0x1b7f6 VCOMISD %XMM3,%XMM2 |
0x1b7fa VMINSD %XMM2,%XMM3,%XMM0 |
0x1b7fe VADDSD 0xa8(%RAX),%XMM14,%XMM2 [4] |
0x1b806 VMULSD %XMM1,%XMM1,%XMM3 |
0x1b80a VADDSD 0xb0(%RAX),%XMM14,%XMM1 [4] |
0x1b812 CMOVA %R15,%RDX |
0x1b816 VFMADD132SD %XMM2,%XMM3,%XMM2 |
0x1b81b VFMADD132SD %XMM4,%XMM2,%XMM4 |
0x1b820 VADDSD 0x130(%RAX),%XMM13,%XMM2 [4] |
0x1b828 VADDSD 0x138(%RAX),%XMM13,%XMM3 [4] |
0x1b830 VCOMISD %XMM4,%XMM0 |
0x1b834 VMINSD %XMM0,%XMM4,%XMM0 |
0x1b838 VMULSD %XMM12,%XMM12,%XMM4 |
0x1b83d VADDSD 0xf8(%RAX),%XMM15,%XMM12 [4] |
0x1b845 CMOVA %R14,%RDX |
0x1b849 VFMADD132SD %XMM1,%XMM4,%XMM1 |
0x1b84e VADDSD 0xc0(%RAX),%XMM14,%XMM4 [4] |
0x1b856 VFMADD132SD %XMM2,%XMM1,%XMM2 |
0x1b85b VADDSD 0xb8(%RAX),%XMM14,%XMM1 [4] |
0x1b863 VCOMISD %XMM2,%XMM0 |
0x1b867 VMINSD %XMM0,%XMM2,%XMM0 |
0x1b86b VMULSD %XMM12,%XMM12,%XMM2 |
0x1b870 CMOVA %R13,%RDX |
0x1b874 MOV $0x5,%R13D |
0x1b87a VFMADD132SD %XMM1,%XMM2,%XMM1 |
0x1b87f VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x1b884 VADDSD 0x100(%RAX),%XMM15,%XMM1 [4] |
0x1b88c VMULSD %XMM1,%XMM1,%XMM12 |
0x1b890 VADDSD 0x148(%RAX),%XMM13,%XMM1 [4] |
0x1b898 VCOMISD %XMM3,%XMM0 |
0x1b89c VMINSD %XMM0,%XMM3,%XMM0 |
0x1b8a0 VADDSD 0x140(%RAX),%XMM13,%XMM3 [4] |
0x1b8a8 VFMADD132SD %XMM4,%XMM12,%XMM4 |
0x1b8ad CMOVA %R13,%RDX |
0x1b8b1 MOV $0x6,%R13D |
0x1b8b7 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x1b8bc VADDSD 0x108(%RAX),%XMM15,%XMM4 [4] |
0x1b8c4 VMINSD %XMM0,%XMM3,%XMM2 |
0x1b8c8 VCOMISD %XMM3,%XMM0 |
0x1b8cc VADDSD 0xc8(%RAX),%XMM14,%XMM0 [4] |
0x1b8d4 VMULSD %XMM4,%XMM4,%XMM3 |
0x1b8d8 CMOVA %R13,%RDX |
0x1b8dc MOV $0x7,%R13D |
0x1b8e2 VFMADD132SD %XMM0,%XMM3,%XMM0 |
0x1b8e7 VFMADD132SD %XMM1,%XMM0,%XMM1 |
0x1b8ec VCOMISD %XMM1,%XMM2 |
0x1b8f0 VMINSD %XMM1,%XMM2,%XMM12 |
0x1b8f4 CMOVA %R13,%RDX |
0x1b8f8 VSQRTSD %XMM12,%XMM12,%XMM12 |
0x1b8fd LEA (%RAX,%RDX,8),%RDX |
0x1b901 VADDSD 0x90(%RDX),%XMM14,%XMM14 [5] |
0x1b909 VMOVSD %XMM12,(%R10,%RCX,8) [6] |
0x1b90f VMULSD %XMM11,%XMM14,%XMM2 |
0x1b914 VMOVSD %XMM2,(%R12,%RCX,8) [9] |
0x1b91a VADDSD 0xd0(%RDX),%XMM15,%XMM15 [5] |
0x1b922 VMULSD %XMM11,%XMM15,%XMM0 |
0x1b927 VMOVSD %XMM0,(%R8,%RCX,8) [1] |
0x1b92d VADDSD 0x110(%RDX),%XMM13,%XMM13 [5] |
0x1b935 VMULSD %XMM11,%XMM13,%XMM11 |
0x1b93a VMOVSD %XMM11,(%RDI,%RCX,8) [10] |
0x1b93f INC %RCX |
0x1b942 CMP %R9,%RCX |
0x1b945 JNE 1b6a0 |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds.h: 188 - 217 |
-------------------------------------------------------------------------------- |
188: const T flip = iat < flip_ind ? one : minusone; |
189: const T displ_0 = (px[iat] - x0) * flip; |
190: const T displ_1 = (py[iat] - y0) * flip; |
191: const T displ_2 = (pz[iat] - z0) * flip; |
192: |
193: const T ar_0 = -std::floor(displ_0 * g00 + displ_1 * g10 + displ_2 * g20); |
194: const T ar_1 = -std::floor(displ_0 * g01 + displ_1 * g11 + displ_2 * g21); |
195: const T ar_2 = -std::floor(displ_0 * g02 + displ_1 * g12 + displ_2 * g22); |
196: |
197: const T delx = displ_0 + ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
198: const T dely = displ_1 + ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
199: const T delz = displ_2 + ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
200: |
201: T rmin = delx * delx + dely * dely + delz * delz; |
202: int ic = 0; |
203: #pragma unroll(7) |
204: for (int c = 1; c < 8; ++c) |
205: { |
206: const T x = delx + cellx[c]; |
207: const T y = dely + celly[c]; |
208: const T z = delz + cellz[c]; |
209: const T r2 = x * x + y * y + z * z; |
210: ic = (r2 < rmin) ? c : ic; |
211: rmin = (r2 < rmin) ? r2 : rmin; |
212: } |
213: |
214: temp_r[iat] = std::sqrt(rmin); |
215: dx[iat] = flip * (delx + cellx[ic]); |
216: dy[iat] = flip * (dely + celly[ic]); |
217: dz[iat] = flip * (delz + cellz[ic]); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►40.97+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►20.83+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►14.58+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:77 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►11.81+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►5.56+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.39+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.39+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.51 |
Bottlenecks | |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 32.88 |
DIV/SQRT cycles | 4.50 - 6.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 9.50 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 8 |
FE+BE cycles (UFS) | 65.51 - 65.90 |
Stall cycles (UFS) | 32.19 - 32.57 |
Nb insns | 123.50 |
Nb uops | 131.50 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.90 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.08 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.73 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.50 |
Bottlenecks | P0, P1, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 33.00 |
DIV/SQRT cycles | 4.50 - 6.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 10.00 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 8 |
FE+BE cycles (UFS) | 65.50 - 65.64 |
Stall cycles (UFS) | 32.05 - 32.18 |
Nb insns | 124.00 |
Nb uops | 132.00 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.90 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.00 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.75 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 2.15 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.51 |
Bottlenecks | P0, P1, |
Function | void qmcplusplus::DTD_BConds |
Source | ParticleBConds.h:188-217 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 49.50 |
CQA cycles if no scalar integer | 49.50 |
CQA cycles if FP arith vectorized | 23.00 |
CQA cycles if fully vectorized | 6.19 |
Front-end cycles | 32.75 |
DIV/SQRT cycles | 4.50 - 6.00 |
P0 cycles | 49.50 |
P1 cycles | 49.50 |
P2 cycles | 23.00 |
P3 cycles | 23.00 |
P4 cycles | 4.00 |
P5 cycles | 9.00 |
P6 cycles | 15.00 |
P7 cycles | 4.00 |
Inter-iter dependencies cycles | 8 |
FE+BE cycles (UFS) | 65.52 - 66.16 |
Stall cycles (UFS) | 32.32 - 32.95 |
Nb insns | 123.00 |
Nb uops | 131.00 |
Nb loads | 46.00 |
Nb stores | 4.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 30.00 |
Nb FLOP mul | 20.00 |
Nb FLOP fma | 28.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 1.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 364.00 |
Bytes stored | 32.00 |
Stride 0 | 2.00 |
Stride 1 | 7.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.91 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 4.17 |
Vector-efficiency ratio all | 12.33 |
Vector-efficiency ratio load | 12.36 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | 12.50 |
Vector-efficiency ratio other | 11.72 |
Path / |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:188-217 |
Module | libqmcparticle.so |
nb instructions | 123.50 |
nb uops | 131.50 |
loop length | 680.50 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 15.50 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 32.88 cycles |
front end | 32.88 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.50 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.50 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 8.00 |
FE+BE cycles | 65.51-65.90 |
Stall cycles | 32.19-32.57 |
RS full (events) | 60.28-55.23 |
LB full (events) | 0.27-3.92 |
Front-end | 32.88 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 8.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:188-217 |
Module | libqmcparticle.so |
nb instructions | 124 |
nb uops | 132 |
loop length | 683 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 33.00 cycles |
front end | 33.00 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 10.00 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 10.00 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 8.00 |
FE+BE cycles | 65.50-65.64 |
Stall cycles | 32.05-32.18 |
RS full (events) | 60.13-54.69 |
LB full (events) | 0.30-4.01 |
Front-end | 33.00 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 8.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD %XMM8,%XMM8,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %ECX,0x10(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 1b6af <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x8f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD %XMM10,%XMM10,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD (%RBX,%RCX,8),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R11,%RCX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV $0x4,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVSD (%RSI,%RCX,8),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM6,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM5,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM7,%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM2,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM3,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x20(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x8(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x38(%RAX),%XMM15,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VXORPD %XMM9,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULSD 0x68(%RAX),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMULSD 0x50(%RAX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x80(%RAX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM15,%XMM0,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM14,%XMM1,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM12,%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM15,%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x98(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x118(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM14,%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x128(%RAX),%XMM13,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x120(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SETA %DL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVZX %DL,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM2,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R15,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM2,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM2,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x130(%RAX),%XMM13,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x138(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM4,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R14,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM1,%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM2,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM2,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x5,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x100(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x148(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x140(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x6,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x108(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM0,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x7,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM1,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM1,%XMM2,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VSQRTSD %XMM12,%XMM12,%XMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 4.50-6 |
LEA (%RAX,%RDX,8),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VADDSD 0x90(%RDX),%XMM14,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,(%R10,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM11,%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,(%R12,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM15,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,(%R8,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0x110(%RDX),%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM11,(%RDI,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R9,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 1b6a0 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | void qmcplusplus::DTD_BConds |
Source file and lines | ParticleBConds.h:188-217 |
Module | libqmcparticle.so |
nb instructions | 123 |
nb uops | 131 |
loop length | 678 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 15 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 32.75 cycles |
front end | 32.75 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.00 | 15.00 | 4.00 |
cycles | 49.50 | 49.50 | 23.00 | 23.00 | 4.00 | 9.00 | 15.00 | 4.00 |
Cycles executing div or sqrt instructions | 4.50-6.00 |
Longest recurrence chain latency (RecMII) | 8.00 |
FE+BE cycles | 65.52-66.16 |
Stall cycles | 32.32-32.95 |
RS full (events) | 60.44-55.78 |
LB full (events) | 0.23-3.82 |
Front-end | 32.75 |
Dispatch | 49.50 |
DIV/SQRT | 4.50-6.00 |
Data deps. | 8.00 |
Overall L1 | 49.50 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 5% |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 4% |
all | 6% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 13% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | 12% |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD %XMM8,%XMM8,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %ECX,0x10(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 1b6af <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x8f> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVSD (%RBX,%RCX,8),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R11,%RCX,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV $0x4,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVSD (%RSI,%RCX,8),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM6,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM5,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM7,%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM2,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM3,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x20(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x8(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x38(%RAX),%XMM15,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VXORPD %XMM9,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULSD 0x68(%RAX),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMULSD 0x50(%RAX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x80(%RAX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM15,%XMM0,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM14,%XMM1,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM12,%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM15,%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x98(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x118(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM14,%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x128(%RAX),%XMM13,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x120(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SETA %DL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVZX %DL,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM2,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R15,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM2,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM2,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x130(%RAX),%XMM13,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x138(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM4,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R14,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VFMADD132SD %XMM1,%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM2,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM2,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x5,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x100(%RAX),%XMM15,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x148(%RAX),%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x140(%RAX),%XMM13,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM12,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x6,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM3,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD 0x108(%RAX),%XMM15,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINSD %XMM0,%XMM3,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM3,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM4,%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
MOV $0x7,%R13D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM0,%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM1,%XMM0,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD %XMM1,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINSD %XMM1,%XMM2,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMOVA %R13,%RDX | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
VSQRTSD %XMM12,%XMM12,%XMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 4.50-6 |
LEA (%RAX,%RDX,8),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VADDSD 0x90(%RDX),%XMM14,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,(%R10,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM11,%XMM14,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,(%R12,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM15,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,(%R8,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VADDSD 0x110(%RDX),%XMM13,%XMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM11,(%RDI,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R9,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 1b6a0 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Metric | run_0 |
---|---|
Coverage (% app. time) | 52.75 |
Time (s) | 0.72 |
Instance Count | 47712 |
Iteration Count - min | 64 |
Iteration Count - avg | 421.67 |
Iteration Count - max | 768 |
Cycles per Iteration - min | 70.49 |
Cycles per Iteration - avg | 71.32 |
Cycles per Iteration - max | 651.26 |
Metric | Value |
---|---|
Bucket Coverage (% loop time) | 99.92 |
Instance Count | 47712 |
ORIG CPI:min | 70.58 |
ORIG CPI:med | 70.94 |
ORIG CPI:max | 72.09 |
DL1 CPI:min | 71.21 |
DL1 CPI:med | 71.44 |
DL1 CPI:max | 92.93 |
ORIG (min) / DL1 (min) | 0.99 |
ORIG (med) / DL1 (med) | 0.99 |
ORIG (max) / DL1 (max) | 0.78 |
Nb Iteration:min | 768 |
Nb Iteration:med | 768.00 |
Nb Iteration:max | 768 |
ORIG: min (cycles) | 54208 |
ORIG: med (cycles) | 54484.00 |
ORIG: max (cycles) | 55368 |
DL1:min (cycles) | 54686 |
DL1:med (cycles) | 54868.00 |
DL1:max (cycles) | 71372 |
Metric (average per iteration except for Time and Iteration Count) | ORIG | DL1 | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | Min (Thread) | Med (Thread) | Avg (Thread) | Max (Thread) | Min (Instances) | Med (Instances) | Max (Instances) | |
Time | 54484.00 | 54484.00 | 54484.00 | 54484.00 | 54208.00 | 54484.00 | 55368.00 | 54868.00 | 54868.00 | 54868.00 | 54868.00 | 54686.00 | 54868.00 | 71372.00 |
CPI MIN | 70.58 | 71.21 | ||||||||||||
CPI MED | 70.94 | 70.94 | 70.94 | 70.94 | 70.58 | 70.94 | 72.09 | 71.44 | 71.44 | 71.44 | 71.44 | 71.21 | 71.44 | 92.93 |
CPI AVG | 71.02 | 72.15 | ||||||||||||
CPI MAX | 72.09 | 92.93 | ||||||||||||
Iteration Count | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 |
ORIG | DL1 | Original Code |
---|---|---|
0x25807 ADDQ $0x1,-0xf4f(%RIP) 0x2580f VMOVSD %XMM8,%XMM8,%XMM11 | 0x26079 VMOVSD %XMM8,%XMM8,%XMM11 | 0x1b6a0 VMOVSD %XMM8,%XMM8,%XMM11 |
0x25814 CMP %ECX,0x10(%RBP) | 0x2607e CMP %ECX,-0x1d84(%RIP) | 0x1b6a5 CMP %ECX,0x10(%RBP) |
0x25817 JLE 2581e <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0xa1fe> | 0x26084 JLE 2608b <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0xaa6b> | 0x1b6a8 JLE 1b6af <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x8f> |
0x25819 VMOVSD %XMM10,%XMM10,%XMM11 | 0x26086 VMOVSD %XMM10,%XMM10,%XMM11 | 0x1b6aa VMOVSD %XMM10,%XMM10,%XMM11 |
0x2581e VMOVSD (%RBX,%RCX,8),%XMM12 | 0x2608b VMOVSD -0x1cd3(%RIP),%XMM12 | 0x1b6af VMOVSD (%RBX,%RCX,8),%XMM12 |
0x25823 VMOVSD (%R11,%RCX,8),%XMM1 | 0x26093 VMOVSD -0x1cdb(%RIP),%XMM1 | 0x1b6b4 VMOVSD (%R11,%RCX,8),%XMM1 |
0x25829 MOV $0x4,%R13D | 0x2609b MOV $0x4,%R13D | 0x1b6ba MOV $0x4,%R13D |
0x2582f VMOVSD (%RSI,%RCX,8),%XMM13 | 0x260a1 VMOVSD -0x1ce9(%RIP),%XMM13 | 0x1b6c0 VMOVSD (%RSI,%RCX,8),%XMM13 |
0x25834 VSUBSD %XMM6,%XMM12,%XMM2 | 0x260a9 VSUBSD %XMM6,%XMM12,%XMM2 | 0x1b6c5 VSUBSD %XMM6,%XMM12,%XMM2 |
0x25838 VSUBSD %XMM5,%XMM1,%XMM0 | 0x260ad VSUBSD %XMM5,%XMM1,%XMM0 | 0x1b6c9 VSUBSD %XMM5,%XMM1,%XMM0 |
0x2583c VSUBSD %XMM7,%XMM13,%XMM3 | 0x260b1 VSUBSD %XMM7,%XMM13,%XMM3 | 0x1b6cd VSUBSD %XMM7,%XMM13,%XMM3 |
0x25840 VMULSD %XMM11,%XMM2,%XMM15 | 0x260b5 VMULSD %XMM11,%XMM2,%XMM15 | 0x1b6d1 VMULSD %XMM11,%XMM2,%XMM15 |
0x25845 VMULSD %XMM11,%XMM0,%XMM14 | 0x260ba VMULSD %XMM11,%XMM0,%XMM14 | 0x1b6d6 VMULSD %XMM11,%XMM0,%XMM14 |
0x2584a VMULSD %XMM11,%XMM3,%XMM12 | 0x260bf VMULSD %XMM11,%XMM3,%XMM12 | 0x1b6db VMULSD %XMM11,%XMM3,%XMM12 |
0x2584f VMULSD 0x20(%RAX),%XMM15,%XMM1 | 0x260c4 VMULSD -0x1d0c(%RIP),%XMM15,%XMM1 | 0x1b6e0 VMULSD 0x20(%RAX),%XMM15,%XMM1 |
0x25854 VMULSD 0x8(%RAX),%XMM15,%XMM4 | 0x260cc VMULSD -0x1d14(%RIP),%XMM15,%XMM4 | 0x1b6e5 VMULSD 0x8(%RAX),%XMM15,%XMM4 |
0x25859 VMULSD 0x38(%RAX),%XMM15,%XMM3 | 0x260d4 VMULSD -0x1d1c(%RIP),%XMM15,%XMM3 | 0x1b6ea VMULSD 0x38(%RAX),%XMM15,%XMM3 |
0x2585e VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 | 0x260dc VFMADD231SD -0x1d25(%RIP),%XMM14,%XMM1 | 0x1b6ef VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 |
0x25864 VFMADD231SD (%RAX),%XMM14,%XMM4 | 0x260e5 VFMADD231SD -0x1d2e(%RIP),%XMM14,%XMM4 | 0x1b6f5 VFMADD231SD (%RAX),%XMM14,%XMM4 |
0x25869 VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 | 0x260ee VFMADD231SD -0x1d37(%RIP),%XMM14,%XMM3 | 0x1b6fa VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 |
0x2586f VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 | 0x260f7 VFMADD231SD -0x1d40(%RIP),%XMM12,%XMM1 | 0x1b700 VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 |
0x25875 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 | 0x26100 VFMADD231SD -0x1d49(%RIP),%XMM12,%XMM4 | 0x1b706 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 |
0x2587b VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 | 0x26109 VFMADD231SD -0x1d52(%RIP),%XMM12,%XMM3 | 0x1b70c VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 |
0x25881 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 0x26112 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 0x1b712 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x25888 VXORPD %XMM9,%XMM0,%XMM2 | 0x26119 VXORPD %XMM9,%XMM0,%XMM2 | 0x1b719 VXORPD %XMM9,%XMM0,%XMM2 |
0x2588d VMULSD 0x68(%RAX),%XMM2,%XMM0 | 0x2611e VMULSD -0x1d66(%RIP),%XMM2,%XMM0 | 0x1b71e VMULSD 0x68(%RAX),%XMM2,%XMM0 |
0x25892 VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 0x26126 VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 | 0x1b723 VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
0x25899 VMULSD 0x50(%RAX),%XMM2,%XMM1 | 0x2612d VMULSD -0x1d75(%RIP),%XMM2,%XMM1 | 0x1b72a VMULSD 0x50(%RAX),%XMM2,%XMM1 |
0x2589e VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 0x26135 VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 | 0x1b72f VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
0x258a5 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 | 0x2613c VFNMADD231SD -0x1d85(%RIP),%XMM4,%XMM15 | 0x1b736 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 |
0x258ab VMULSD 0x80(%RAX),%XMM2,%XMM2 | 0x26145 VMULSD -0x1d8d(%RIP),%XMM2,%XMM2 | 0x1b73c VMULSD 0x80(%RAX),%XMM2,%XMM2 |
0x258b3 VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 | 0x2614d VFNMADD231SD -0x1d96(%RIP),%XMM4,%XMM12 | 0x1b744 VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 |
0x258bc VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 | 0x26156 VFNMADD231SD -0x1d9f(%RIP),%XMM4,%XMM14 | 0x1b74d VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 |
0x258c2 VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 | 0x2615f VFNMADD231SD -0x1da8(%RIP),%XMM13,%XMM0 | 0x1b753 VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 |
0x258c8 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 | 0x26168 VFNMADD231SD -0x1db1(%RIP),%XMM13,%XMM1 | 0x1b759 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 |
0x258ce VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 | 0x26171 VFNMADD132SD -0x1dba(%RIP),%XMM2,%XMM13 | 0x1b75f VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 |
0x258d4 VADDSD %XMM15,%XMM0,%XMM15 | 0x2617a VADDSD %XMM15,%XMM0,%XMM15 | 0x1b765 VADDSD %XMM15,%XMM0,%XMM15 |
0x258d9 VADDSD %XMM14,%XMM1,%XMM14 | 0x2617f VADDSD %XMM14,%XMM1,%XMM14 | 0x1b76a VADDSD %XMM14,%XMM1,%XMM14 |
0x258de VADDSD %XMM12,%XMM13,%XMM13 | 0x26184 VADDSD %XMM12,%XMM13,%XMM13 | 0x1b76f VADDSD %XMM12,%XMM13,%XMM13 |
0x258e3 VADDSD 0xd8(%RAX),%XMM15,%XMM12 | 0x26189 VADDSD -0x1dd1(%RIP),%XMM15,%XMM12 | 0x1b774 VADDSD 0xd8(%RAX),%XMM15,%XMM12 |
0x258eb VMULSD %XMM15,%XMM15,%XMM1 | 0x26191 VMULSD %XMM15,%XMM15,%XMM1 | 0x1b77c VMULSD %XMM15,%XMM15,%XMM1 |
0x258f0 VADDSD 0x98(%RAX),%XMM14,%XMM4 | 0x26196 VADDSD -0x1dde(%RIP),%XMM14,%XMM4 | 0x1b781 VADDSD 0x98(%RAX),%XMM14,%XMM4 |
0x258f8 VADDSD 0x118(%RAX),%XMM13,%XMM3 | 0x2619e VADDSD -0x1de6(%RIP),%XMM13,%XMM3 | 0x1b789 VADDSD 0x118(%RAX),%XMM13,%XMM3 |
0x25900 VMULSD %XMM12,%XMM12,%XMM0 | 0x261a6 VMULSD %XMM12,%XMM12,%XMM0 | 0x1b791 VMULSD %XMM12,%XMM12,%XMM0 |
0x25905 VFMADD231SD %XMM14,%XMM14,%XMM1 | 0x261ab VFMADD231SD %XMM14,%XMM14,%XMM1 | 0x1b796 VFMADD231SD %XMM14,%XMM14,%XMM1 |
0x2590a VFMADD132SD %XMM4,%XMM0,%XMM4 | 0x261b0 VFMADD132SD %XMM4,%XMM0,%XMM4 | 0x1b79b VFMADD132SD %XMM4,%XMM0,%XMM4 |
0x2590f VFMADD231SD %XMM13,%XMM13,%XMM1 | 0x261b5 VFMADD231SD %XMM13,%XMM13,%XMM1 | 0x1b7a0 VFMADD231SD %XMM13,%XMM13,%XMM1 |
0x25914 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x261ba VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x1b7a5 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x25919 VADDSD 0xe0(%RAX),%XMM15,%XMM4 | 0x261bf VADDSD -0x1e07(%RIP),%XMM15,%XMM4 | 0x1b7aa VADDSD 0xe0(%RAX),%XMM15,%XMM4 |
0x25921 VMULSD %XMM4,%XMM4,%XMM12 | 0x261c7 VMULSD %XMM4,%XMM4,%XMM12 | 0x1b7b2 VMULSD %XMM4,%XMM4,%XMM12 |
0x25925 VADDSD 0x128(%RAX),%XMM13,%XMM4 | 0x261cb VADDSD -0x1e13(%RIP),%XMM13,%XMM4 | 0x1b7b6 VADDSD 0x128(%RAX),%XMM13,%XMM4 |
0x2592d VMINSD %XMM3,%XMM1,%XMM2 | 0x261d3 VMINSD %XMM3,%XMM1,%XMM2 | 0x1b7be VMINSD %XMM3,%XMM1,%XMM2 |
0x25931 VCOMISD %XMM3,%XMM1 | 0x261d7 VCOMISD %XMM3,%XMM1 | 0x1b7c2 VCOMISD %XMM3,%XMM1 |
0x25935 VADDSD 0xa0(%RAX),%XMM14,%XMM1 | 0x261db VADDSD -0x1e23(%RIP),%XMM14,%XMM1 | 0x1b7c6 VADDSD 0xa0(%RAX),%XMM14,%XMM1 |
0x2593d VADDSD 0x120(%RAX),%XMM13,%XMM3 | 0x261e3 VADDSD -0x1e2b(%RIP),%XMM13,%XMM3 | 0x1b7ce VADDSD 0x120(%RAX),%XMM13,%XMM3 |
0x25945 VFMADD132SD %XMM1,%XMM12,%XMM1 | 0x261eb VFMADD132SD %XMM1,%XMM12,%XMM1 | 0x1b7d6 VFMADD132SD %XMM1,%XMM12,%XMM1 |
0x2594a SETA %DL | 0x261f0 SETA %DL | 0x1b7db SETA %DL |
0x2594d VADDSD 0xf0(%RAX),%XMM15,%XMM12 | 0x261f3 VADDSD -0x1e3b(%RIP),%XMM15,%XMM12 | 0x1b7de VADDSD 0xf0(%RAX),%XMM15,%XMM12 |
0x25955 MOVZX %DL,%EDX | 0x261fb MOVZX %DL,%EDX | 0x1b7e6 MOVZX %DL,%EDX |
0x25958 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x261fe VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x1b7e9 VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x2595d VADDSD 0xe8(%RAX),%XMM15,%XMM1 | 0x26203 VADDSD -0x1e4b(%RIP),%XMM15,%XMM1 | 0x1b7ee VADDSD 0xe8(%RAX),%XMM15,%XMM1 |
0x25965 VCOMISD %XMM3,%XMM2 | 0x2620b VCOMISD %XMM3,%XMM2 | 0x1b7f6 VCOMISD %XMM3,%XMM2 |
0x25969 VMINSD %XMM2,%XMM3,%XMM0 | 0x2620f VMINSD %XMM2,%XMM3,%XMM0 | 0x1b7fa VMINSD %XMM2,%XMM3,%XMM0 |
0x2596d VADDSD 0xa8(%RAX),%XMM14,%XMM2 | 0x26213 VADDSD -0x1e5b(%RIP),%XMM14,%XMM2 | 0x1b7fe VADDSD 0xa8(%RAX),%XMM14,%XMM2 |
0x25975 VMULSD %XMM1,%XMM1,%XMM3 | 0x2621b VMULSD %XMM1,%XMM1,%XMM3 | 0x1b806 VMULSD %XMM1,%XMM1,%XMM3 |
0x25979 VADDSD 0xb0(%RAX),%XMM14,%XMM1 | 0x2621f VADDSD -0x1e67(%RIP),%XMM14,%XMM1 | 0x1b80a VADDSD 0xb0(%RAX),%XMM14,%XMM1 |
0x25981 CMOVA %R15,%RDX | 0x26227 CMOVA %R15,%RDX | 0x1b812 CMOVA %R15,%RDX |
0x25985 VFMADD132SD %XMM2,%XMM3,%XMM2 | 0x2622b VFMADD132SD %XMM2,%XMM3,%XMM2 | 0x1b816 VFMADD132SD %XMM2,%XMM3,%XMM2 |
0x2598a VFMADD132SD %XMM4,%XMM2,%XMM4 | 0x26230 VFMADD132SD %XMM4,%XMM2,%XMM4 | 0x1b81b VFMADD132SD %XMM4,%XMM2,%XMM4 |
0x2598f VADDSD 0x130(%RAX),%XMM13,%XMM2 | 0x26235 VADDSD -0x1e7d(%RIP),%XMM13,%XMM2 | 0x1b820 VADDSD 0x130(%RAX),%XMM13,%XMM2 |
0x25997 VADDSD 0x138(%RAX),%XMM13,%XMM3 | 0x2623d VADDSD -0x1e85(%RIP),%XMM13,%XMM3 | 0x1b828 VADDSD 0x138(%RAX),%XMM13,%XMM3 |
0x2599f VCOMISD %XMM4,%XMM0 | 0x26245 VCOMISD %XMM4,%XMM0 | 0x1b830 VCOMISD %XMM4,%XMM0 |
0x259a3 VMINSD %XMM0,%XMM4,%XMM0 | 0x26249 VMINSD %XMM0,%XMM4,%XMM0 | 0x1b834 VMINSD %XMM0,%XMM4,%XMM0 |
0x259a7 VMULSD %XMM12,%XMM12,%XMM4 | 0x2624d VMULSD %XMM12,%XMM12,%XMM4 | 0x1b838 VMULSD %XMM12,%XMM12,%XMM4 |
0x259ac VADDSD 0xf8(%RAX),%XMM15,%XMM12 | 0x26252 VADDSD -0x1e9a(%RIP),%XMM15,%XMM12 | 0x1b83d VADDSD 0xf8(%RAX),%XMM15,%XMM12 |
0x259b4 CMOVA %R14,%RDX | 0x2625a CMOVA %R14,%RDX | 0x1b845 CMOVA %R14,%RDX |
0x259b8 VFMADD132SD %XMM1,%XMM4,%XMM1 | 0x2625e VFMADD132SD %XMM1,%XMM4,%XMM1 | 0x1b849 VFMADD132SD %XMM1,%XMM4,%XMM1 |
0x259bd VADDSD 0xc0(%RAX),%XMM14,%XMM4 | 0x26263 VADDSD -0x1eab(%RIP),%XMM14,%XMM4 | 0x1b84e VADDSD 0xc0(%RAX),%XMM14,%XMM4 |
0x259c5 VFMADD132SD %XMM2,%XMM1,%XMM2 | 0x2626b VFMADD132SD %XMM2,%XMM1,%XMM2 | 0x1b856 VFMADD132SD %XMM2,%XMM1,%XMM2 |
0x259ca VADDSD 0xb8(%RAX),%XMM14,%XMM1 | 0x26270 VADDSD -0x1eb8(%RIP),%XMM14,%XMM1 | 0x1b85b VADDSD 0xb8(%RAX),%XMM14,%XMM1 |
0x259d2 VCOMISD %XMM2,%XMM0 | 0x26278 VCOMISD %XMM2,%XMM0 | 0x1b863 VCOMISD %XMM2,%XMM0 |
0x259d6 VMINSD %XMM0,%XMM2,%XMM0 | 0x2627c VMINSD %XMM0,%XMM2,%XMM0 | 0x1b867 VMINSD %XMM0,%XMM2,%XMM0 |
0x259da VMULSD %XMM12,%XMM12,%XMM2 | 0x26280 VMULSD %XMM12,%XMM12,%XMM2 | 0x1b86b VMULSD %XMM12,%XMM12,%XMM2 |
0x259df CMOVA %R13,%RDX | 0x26285 CMOVA %R13,%RDX | 0x1b870 CMOVA %R13,%RDX |
0x259e3 MOV $0x5,%R13D | 0x26289 MOV $0x5,%R13D | 0x1b874 MOV $0x5,%R13D |
0x259e9 VFMADD132SD %XMM1,%XMM2,%XMM1 | 0x2628f VFMADD132SD %XMM1,%XMM2,%XMM1 | 0x1b87a VFMADD132SD %XMM1,%XMM2,%XMM1 |
0x259ee VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x26294 VFMADD132SD %XMM3,%XMM1,%XMM3 | 0x1b87f VFMADD132SD %XMM3,%XMM1,%XMM3 |
0x259f3 VADDSD 0x100(%RAX),%XMM15,%XMM1 | 0x26299 VADDSD -0x1ee1(%RIP),%XMM15,%XMM1 | 0x1b884 VADDSD 0x100(%RAX),%XMM15,%XMM1 |
0x259fb VMULSD %XMM1,%XMM1,%XMM12 | 0x262a1 VMULSD %XMM1,%XMM1,%XMM12 | 0x1b88c VMULSD %XMM1,%XMM1,%XMM12 |
0x259ff VADDSD 0x148(%RAX),%XMM13,%XMM1 | 0x262a5 VADDSD -0x1eed(%RIP),%XMM13,%XMM1 | 0x1b890 VADDSD 0x148(%RAX),%XMM13,%XMM1 |
0x25a07 VCOMISD %XMM3,%XMM0 | 0x262ad VCOMISD %XMM3,%XMM0 | 0x1b898 VCOMISD %XMM3,%XMM0 |
0x25a0b VMINSD %XMM0,%XMM3,%XMM0 | 0x262b1 VMINSD %XMM0,%XMM3,%XMM0 | 0x1b89c VMINSD %XMM0,%XMM3,%XMM0 |
0x25a0f VADDSD 0x140(%RAX),%XMM13,%XMM3 | 0x262b5 VADDSD -0x1efd(%RIP),%XMM13,%XMM3 | 0x1b8a0 VADDSD 0x140(%RAX),%XMM13,%XMM3 |
0x25a17 VFMADD132SD %XMM4,%XMM12,%XMM4 | 0x262bd VFMADD132SD %XMM4,%XMM12,%XMM4 | 0x1b8a8 VFMADD132SD %XMM4,%XMM12,%XMM4 |
0x25a1c CMOVA %R13,%RDX | 0x262c2 CMOVA %R13,%RDX | 0x1b8ad CMOVA %R13,%RDX |
0x25a20 MOV $0x6,%R13D | 0x262c6 MOV $0x6,%R13D | 0x1b8b1 MOV $0x6,%R13D |
0x25a26 VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x262cc VFMADD132SD %XMM3,%XMM4,%XMM3 | 0x1b8b7 VFMADD132SD %XMM3,%XMM4,%XMM3 |
0x25a2b VADDSD 0x108(%RAX),%XMM15,%XMM4 | 0x262d1 VADDSD -0x1f19(%RIP),%XMM15,%XMM4 | 0x1b8bc VADDSD 0x108(%RAX),%XMM15,%XMM4 |
0x25a33 VMINSD %XMM0,%XMM3,%XMM2 | 0x262d9 VMINSD %XMM0,%XMM3,%XMM2 | 0x1b8c4 VMINSD %XMM0,%XMM3,%XMM2 |
0x25a37 VCOMISD %XMM3,%XMM0 | 0x262dd VCOMISD %XMM3,%XMM0 | 0x1b8c8 VCOMISD %XMM3,%XMM0 |
0x25a3b VADDSD 0xc8(%RAX),%XMM14,%XMM0 | 0x262e1 VADDSD -0x1f29(%RIP),%XMM14,%XMM0 | 0x1b8cc VADDSD 0xc8(%RAX),%XMM14,%XMM0 |
0x25a43 VMULSD %XMM4,%XMM4,%XMM3 | 0x262e9 VMULSD %XMM4,%XMM4,%XMM3 | 0x1b8d4 VMULSD %XMM4,%XMM4,%XMM3 |
0x25a47 CMOVA %R13,%RDX | 0x262ed CMOVA %R13,%RDX | 0x1b8d8 CMOVA %R13,%RDX |
0x25a4b MOV $0x7,%R13D | 0x262f1 MOV $0x7,%R13D | 0x1b8dc MOV $0x7,%R13D |
0x25a51 VFMADD132SD %XMM0,%XMM3,%XMM0 | 0x262f7 VFMADD132SD %XMM0,%XMM3,%XMM0 | 0x1b8e2 VFMADD132SD %XMM0,%XMM3,%XMM0 |
0x25a56 VFMADD132SD %XMM1,%XMM0,%XMM1 | 0x262fc VFMADD132SD %XMM1,%XMM0,%XMM1 | 0x1b8e7 VFMADD132SD %XMM1,%XMM0,%XMM1 |
0x25a5b VCOMISD %XMM1,%XMM2 | 0x26301 VCOMISD %XMM1,%XMM2 | 0x1b8ec VCOMISD %XMM1,%XMM2 |
0x25a5f VMINSD %XMM1,%XMM2,%XMM12 | 0x26305 VMINSD %XMM1,%XMM2,%XMM12 | 0x1b8f0 VMINSD %XMM1,%XMM2,%XMM12 |
0x25a63 CMOVA %R13,%RDX | 0x26309 CMOVA %R13,%RDX | 0x1b8f4 CMOVA %R13,%RDX |
0x25a67 VSQRTSD %XMM12,%XMM12,%XMM12 | 0x2630d VSQRTSD -0x1f95(%RIP),%XMM12,%XMM12 | 0x1b8f8 VSQRTSD %XMM12,%XMM12,%XMM12 |
0x25a6c LEA (%RAX,%RDX,8),%RDX | 0x26315 LEA (%RAX,%RDX,8),%RDX | 0x1b8fd LEA (%RAX,%RDX,8),%RDX |
0x25a70 VADDSD 0x90(%RDX),%XMM14,%XMM14 | 0x26319 VADDSD -0x1f61(%RIP),%XMM14,%XMM14 | 0x1b901 VADDSD 0x90(%RDX),%XMM14,%XMM14 |
0x25a78 VMOVSD %XMM12,(%R10,%RCX,8) | 0x26321 VMOVSD %XMM12,-0x1ee9(%RIP) 0x26329 NOP | 0x1b909 VMOVSD %XMM12,(%R10,%RCX,8) |
0x25a7e VMULSD %XMM11,%XMM14,%XMM2 | 0x2632a VMULSD %XMM11,%XMM14,%XMM2 | 0x1b90f VMULSD %XMM11,%XMM14,%XMM2 |
0x25a83 VMOVSD %XMM2,(%R12,%RCX,8) | 0x2632f VMOVSD %XMM2,-0x1eb7(%RIP) 0x26337 NOP | 0x1b914 VMOVSD %XMM2,(%R12,%RCX,8) |
0x25a89 VADDSD 0xd0(%RDX),%XMM15,%XMM15 | 0x26338 VADDSD -0x1f80(%RIP),%XMM15,%XMM15 | 0x1b91a VADDSD 0xd0(%RDX),%XMM15,%XMM15 |
0x25a91 VMULSD %XMM11,%XMM15,%XMM0 | 0x26340 VMULSD %XMM11,%XMM15,%XMM0 | 0x1b922 VMULSD %XMM11,%XMM15,%XMM0 |
0x25a96 VMOVSD %XMM0,(%R8,%RCX,8) | 0x26345 VMOVSD %XMM0,-0x1e8d(%RIP) 0x2634d NOP | 0x1b927 VMOVSD %XMM0,(%R8,%RCX,8) |
0x25a9c VADDSD 0x110(%RDX),%XMM13,%XMM13 | 0x2634e VADDSD -0x1f96(%RIP),%XMM13,%XMM13 | 0x1b92d VADDSD 0x110(%RDX),%XMM13,%XMM13 |
0x25aa4 VMULSD %XMM11,%XMM13,%XMM11 | 0x26356 VMULSD %XMM11,%XMM13,%XMM11 | 0x1b935 VMULSD %XMM11,%XMM13,%XMM11 |
0x25aa9 VMOVSD %XMM11,(%RDI,%RCX,8) | 0x2635b VMOVSD %XMM11,-0x1e63(%RIP) 0x26363 NOP | 0x1b93a VMOVSD %XMM11,(%RDI,%RCX,8) |
0x25aae INC %RCX | 0x26364 INC %RCX | 0x1b93f INC %RCX |
0x25ab1 CMP %R9,%RCX | 0x26367 CMP %R9,%RCX | 0x1b942 CMP %R9,%RCX |
0x25ab4 JNE 25807 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0xa1e7> | 0x2636a JNE 26079 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0xaa59> | 0x1b945 JNE 1b6a0 <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x80> |
Path / |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 65.51 | 65.56 | 65.90 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.38 | 33.88 | 32.88 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 10.50 | 9.50 | 9.50 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 31.69 | 31.22 | 32.57 |
LB full | 6.62 | 7.95 | 3.92 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 51.72 | 48.37 | 55.23 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 133.50 | 135.50 | 131.50 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 10.50 | 9.50 | 9.50 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 168 | 170 | 167 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 65.65 | 65.38 | 65.64 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.50 | 34.00 | 33.00 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 11.00 | 10.00 | 10.00 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 31.70 | 30.92 | 32.18 |
LB full | 6.26 | 8.15 | 4.01 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 52.71 | 47.81 | 54.69 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 134.00 | 136.00 | 132.00 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 11.00 | 10.00 | 10.00 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 168 | 170 | 167 |
Metric | ORIG | DL1 | Original |
---|---|---|---|
FP operations per cycle L1 | 2.16, 2.16, | 2.16, 2.16, | 2.16, 2.16, |
cycles L1 CQA | 49.50 | 49.50 | 49.50 |
cycles UFS | 65.37 | 65.73 | 66.16 |
bytes loaded | 372.00 | 372.00 | 364.00 |
bytes stored | 40.00 | 32.00 | 32.00 |
nb loads | 47.00 | 47.00 | 46.00 |
nb stores | 5.00 | 4.00 | 4.00 |
cycles dispatch | 49.50 | 49.50 | 49.50 |
cycles front end | 33.25 | 33.75 | 32.75 |
cycles P0 | 49.50 | 49.50 | 49.50 |
cycles P1 | 49.50 | 49.50 | 49.50 |
cycles P2 | 23.50 | 23.50 | 23.00 |
cycles P3 | 23.50 | 23.50 | 23.00 |
cycles P4 | 5.00 | 4.00 | 4.00 |
cycles P5 | 10.00 | 9.00 | 9.00 |
cycles P6 | 15.00 | 15.00 | 15.00 |
cycles P7 | 5.00 | 4.00 | 4.00 |
stall cycles | 31.67 | 31.52 | 32.95 |
LB full | 6.97 | 7.75 | 3.82 |
LM full | 0.00 | 0.00 | 0.00 |
PRF full | 0.00 | 0.00 | 0.00 |
PRF_FLOAT full | 0.00 | 0.00 | 0.00 |
PRF_INT full | 0.00 | 0.00 | 0.00 |
ROB full | 0.00 | 0.00 | 0.00 |
RS full | 50.73 | 48.92 | 55.78 |
SB full | 0.00 | 0.00 | 0.00 |
nb uops | 133.00 | 135.00 | 131.00 |
uops P0 | 49.50 | 49.50 | 49.50 |
uops P1 | 49.50 | 49.50 | 49.50 |
uops P2 | 23.50 | 23.50 | 23.00 |
uops P3 | 23.50 | 23.50 | 23.00 |
uops P4 | 5.00 | 4.00 | 4.00 |
uops P5 | 10.00 | 9.00 | 9.00 |
uops P6 | 15.00 | 15.00 | 15.00 |
uops P7 | 5.00 | 4.00 | 4.00 |
ID | 168 | 170 | 167 |