Function: void qmcplusplus::DTD_BConds<double, 3u, 39>::computeDistances<qmcplusplus::TinyVector<dou ... | Module: libqmcparticle.so | Source: ParticleBConds.h:159-219 [...] | Coverage: 52.75% |
---|
Function: void qmcplusplus::DTD_BConds<double, 3u, 39>::computeDistances<qmcplusplus::TinyVector<dou ... | Module: libqmcparticle.so | Source: ParticleBConds.h:159-219 [...] | Coverage: 52.75% |
---|
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/VectorSoAContainer.h: 241 - 243 |
-------------------------------------------------------------------------------- |
241: T* restrict data(size_t i) { return myData + i * nGhosts; } |
242: ///return the const pointer of the i-th components |
243: const T* restrict data(size_t i) const { return myData + i * nGhosts; } |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Particle/Lattice/ParticleBConds.h: 159 - 219 |
-------------------------------------------------------------------------------- |
159: void computeDistances(const PT& pos, |
[...] |
167: const T x0 = pos[0]; |
168: const T y0 = pos[1]; |
169: const T z0 = pos[2]; |
[...] |
185: #pragma omp simd aligned(temp_r, px, py, pz, dx, dy, dz: QMC_SIMD_ALIGNMENT) |
186: for (int iat = first; iat < last; ++iat) |
187: { |
188: const T flip = iat < flip_ind ? one : minusone; |
189: const T displ_0 = (px[iat] - x0) * flip; |
190: const T displ_1 = (py[iat] - y0) * flip; |
191: const T displ_2 = (pz[iat] - z0) * flip; |
192: |
193: const T ar_0 = -std::floor(displ_0 * g00 + displ_1 * g10 + displ_2 * g20); |
194: const T ar_1 = -std::floor(displ_0 * g01 + displ_1 * g11 + displ_2 * g21); |
195: const T ar_2 = -std::floor(displ_0 * g02 + displ_1 * g12 + displ_2 * g22); |
196: |
197: const T delx = displ_0 + ar_0 * r00 + ar_1 * r10 + ar_2 * r20; |
198: const T dely = displ_1 + ar_0 * r01 + ar_1 * r11 + ar_2 * r21; |
199: const T delz = displ_2 + ar_0 * r02 + ar_1 * r12 + ar_2 * r22; |
200: |
201: T rmin = delx * delx + dely * dely + delz * delz; |
202: int ic = 0; |
203: #pragma unroll(7) |
204: for (int c = 1; c < 8; ++c) |
205: { |
206: const T x = delx + cellx[c]; |
207: const T y = dely + celly[c]; |
208: const T z = delz + cellz[c]; |
209: const T r2 = x * x + y * y + z * z; |
210: ic = (r2 < rmin) ? c : ic; |
211: rmin = (r2 < rmin) ? r2 : rmin; |
212: } |
213: |
214: temp_r[iat] = std::sqrt(rmin); |
215: dx[iat] = flip * (delx + cellx[ic]); |
216: dy[iat] = flip * (dely + celly[ic]); |
217: dz[iat] = flip * (delz + cellz[ic]); |
218: } |
219: } |
0x1b620 PUSH %RBP |
0x1b621 MOV %RDI,%RAX |
0x1b624 MOV %RSP,%RBP |
0x1b627 PUSH %R15 |
0x1b629 PUSH %R14 |
0x1b62b PUSH %R13 |
0x1b62d PUSH %R12 |
0x1b62f PUSH %RBX |
0x1b630 MOV 0x8(%R8),%RDI |
0x1b634 VMOVSD (%RSI),%XMM5 |
0x1b638 VMOVSD 0x8(%RSI),%XMM6 |
0x1b63d VMOVSD 0x10(%RSI),%XMM7 |
0x1b642 MOV 0x8(%RDX),%RSI |
0x1b646 MOV 0x18(%RDX),%R11 |
0x1b64a SAL $0x3,%RDI |
0x1b64e MOV 0x18(%R8),%R12 |
0x1b652 SAL $0x3,%RSI |
0x1b656 LEA (%R11,%RSI,1),%RBX |
0x1b65a LEA (%R12,%RDI,1),%R8 |
0x1b65e ADD %R8,%RDI |
0x1b661 ADD %RBX,%RSI |
0x1b664 TEST %R9D,%R9D |
0x1b667 JLE 1b94b |
0x1b66d VMOVQ 0x310b(%RIP),%XMM9 |
0x1b675 VMOVSD 0x30a3(%RIP),%XMM8 |
0x1b67d MOV %RCX,%R10 |
0x1b680 MOVSXD %R9D,%R9 |
0x1b683 VMOVSD 0x3085(%RIP),%XMM10 |
0x1b68b XOR %ECX,%ECX |
0x1b68d MOV $0x2,%R15D |
0x1b693 MOV $0x3,%R14D |
0x1b699 NOPL (%RAX) |
(167) 0x1b6a0 VMOVSD %XMM8,%XMM8,%XMM11 |
(167) 0x1b6a5 CMP %ECX,0x10(%RBP) |
(167) 0x1b6a8 JLE 1b6af |
(167) 0x1b6aa VMOVSD %XMM10,%XMM10,%XMM11 |
(167) 0x1b6af VMOVSD (%RBX,%RCX,8),%XMM12 |
(167) 0x1b6b4 VMOVSD (%R11,%RCX,8),%XMM1 |
(167) 0x1b6ba MOV $0x4,%R13D |
(167) 0x1b6c0 VMOVSD (%RSI,%RCX,8),%XMM13 |
(167) 0x1b6c5 VSUBSD %XMM6,%XMM12,%XMM2 |
(167) 0x1b6c9 VSUBSD %XMM5,%XMM1,%XMM0 |
(167) 0x1b6cd VSUBSD %XMM7,%XMM13,%XMM3 |
(167) 0x1b6d1 VMULSD %XMM11,%XMM2,%XMM15 |
(167) 0x1b6d6 VMULSD %XMM11,%XMM0,%XMM14 |
(167) 0x1b6db VMULSD %XMM11,%XMM3,%XMM12 |
(167) 0x1b6e0 VMULSD 0x20(%RAX),%XMM15,%XMM1 |
(167) 0x1b6e5 VMULSD 0x8(%RAX),%XMM15,%XMM4 |
(167) 0x1b6ea VMULSD 0x38(%RAX),%XMM15,%XMM3 |
(167) 0x1b6ef VFMADD231SD 0x18(%RAX),%XMM14,%XMM1 |
(167) 0x1b6f5 VFMADD231SD (%RAX),%XMM14,%XMM4 |
(167) 0x1b6fa VFMADD231SD 0x30(%RAX),%XMM14,%XMM3 |
(167) 0x1b700 VFMADD231SD 0x28(%RAX),%XMM12,%XMM1 |
(167) 0x1b706 VFMADD231SD 0x10(%RAX),%XMM12,%XMM4 |
(167) 0x1b70c VFMADD231SD 0x40(%RAX),%XMM12,%XMM3 |
(167) 0x1b712 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
(167) 0x1b719 VXORPD %XMM9,%XMM0,%XMM2 |
(167) 0x1b71e VMULSD 0x68(%RAX),%XMM2,%XMM0 |
(167) 0x1b723 VRNDSCALESD $0x9,%XMM4,%XMM4,%XMM13 |
(167) 0x1b72a VMULSD 0x50(%RAX),%XMM2,%XMM1 |
(167) 0x1b72f VRNDSCALESD $0x9,%XMM3,%XMM3,%XMM4 |
(167) 0x1b736 VFNMADD231SD 0x70(%RAX),%XMM4,%XMM15 |
(167) 0x1b73c VMULSD 0x80(%RAX),%XMM2,%XMM2 |
(167) 0x1b744 VFNMADD231SD 0x88(%RAX),%XMM4,%XMM12 |
(167) 0x1b74d VFNMADD231SD 0x58(%RAX),%XMM4,%XMM14 |
(167) 0x1b753 VFNMADD231SD 0x60(%RAX),%XMM13,%XMM0 |
(167) 0x1b759 VFNMADD231SD 0x48(%RAX),%XMM13,%XMM1 |
(167) 0x1b75f VFNMADD132SD 0x78(%RAX),%XMM2,%XMM13 |
(167) 0x1b765 VADDSD %XMM15,%XMM0,%XMM15 |
(167) 0x1b76a VADDSD %XMM14,%XMM1,%XMM14 |
(167) 0x1b76f VADDSD %XMM12,%XMM13,%XMM13 |
(167) 0x1b774 VADDSD 0xd8(%RAX),%XMM15,%XMM12 |
(167) 0x1b77c VMULSD %XMM15,%XMM15,%XMM1 |
(167) 0x1b781 VADDSD 0x98(%RAX),%XMM14,%XMM4 |
(167) 0x1b789 VADDSD 0x118(%RAX),%XMM13,%XMM3 |
(167) 0x1b791 VMULSD %XMM12,%XMM12,%XMM0 |
(167) 0x1b796 VFMADD231SD %XMM14,%XMM14,%XMM1 |
(167) 0x1b79b VFMADD132SD %XMM4,%XMM0,%XMM4 |
(167) 0x1b7a0 VFMADD231SD %XMM13,%XMM13,%XMM1 |
(167) 0x1b7a5 VFMADD132SD %XMM3,%XMM4,%XMM3 |
(167) 0x1b7aa VADDSD 0xe0(%RAX),%XMM15,%XMM4 |
(167) 0x1b7b2 VMULSD %XMM4,%XMM4,%XMM12 |
(167) 0x1b7b6 VADDSD 0x128(%RAX),%XMM13,%XMM4 |
(167) 0x1b7be VMINSD %XMM3,%XMM1,%XMM2 |
(167) 0x1b7c2 VCOMISD %XMM3,%XMM1 |
(167) 0x1b7c6 VADDSD 0xa0(%RAX),%XMM14,%XMM1 |
(167) 0x1b7ce VADDSD 0x120(%RAX),%XMM13,%XMM3 |
(167) 0x1b7d6 VFMADD132SD %XMM1,%XMM12,%XMM1 |
(167) 0x1b7db SETA %DL |
(167) 0x1b7de VADDSD 0xf0(%RAX),%XMM15,%XMM12 |
(167) 0x1b7e6 MOVZX %DL,%EDX |
(167) 0x1b7e9 VFMADD132SD %XMM3,%XMM1,%XMM3 |
(167) 0x1b7ee VADDSD 0xe8(%RAX),%XMM15,%XMM1 |
(167) 0x1b7f6 VCOMISD %XMM3,%XMM2 |
(167) 0x1b7fa VMINSD %XMM2,%XMM3,%XMM0 |
(167) 0x1b7fe VADDSD 0xa8(%RAX),%XMM14,%XMM2 |
(167) 0x1b806 VMULSD %XMM1,%XMM1,%XMM3 |
(167) 0x1b80a VADDSD 0xb0(%RAX),%XMM14,%XMM1 |
(167) 0x1b812 CMOVA %R15,%RDX |
(167) 0x1b816 VFMADD132SD %XMM2,%XMM3,%XMM2 |
(167) 0x1b81b VFMADD132SD %XMM4,%XMM2,%XMM4 |
(167) 0x1b820 VADDSD 0x130(%RAX),%XMM13,%XMM2 |
(167) 0x1b828 VADDSD 0x138(%RAX),%XMM13,%XMM3 |
(167) 0x1b830 VCOMISD %XMM4,%XMM0 |
(167) 0x1b834 VMINSD %XMM0,%XMM4,%XMM0 |
(167) 0x1b838 VMULSD %XMM12,%XMM12,%XMM4 |
(167) 0x1b83d VADDSD 0xf8(%RAX),%XMM15,%XMM12 |
(167) 0x1b845 CMOVA %R14,%RDX |
(167) 0x1b849 VFMADD132SD %XMM1,%XMM4,%XMM1 |
(167) 0x1b84e VADDSD 0xc0(%RAX),%XMM14,%XMM4 |
(167) 0x1b856 VFMADD132SD %XMM2,%XMM1,%XMM2 |
(167) 0x1b85b VADDSD 0xb8(%RAX),%XMM14,%XMM1 |
(167) 0x1b863 VCOMISD %XMM2,%XMM0 |
(167) 0x1b867 VMINSD %XMM0,%XMM2,%XMM0 |
(167) 0x1b86b VMULSD %XMM12,%XMM12,%XMM2 |
(167) 0x1b870 CMOVA %R13,%RDX |
(167) 0x1b874 MOV $0x5,%R13D |
(167) 0x1b87a VFMADD132SD %XMM1,%XMM2,%XMM1 |
(167) 0x1b87f VFMADD132SD %XMM3,%XMM1,%XMM3 |
(167) 0x1b884 VADDSD 0x100(%RAX),%XMM15,%XMM1 |
(167) 0x1b88c VMULSD %XMM1,%XMM1,%XMM12 |
(167) 0x1b890 VADDSD 0x148(%RAX),%XMM13,%XMM1 |
(167) 0x1b898 VCOMISD %XMM3,%XMM0 |
(167) 0x1b89c VMINSD %XMM0,%XMM3,%XMM0 |
(167) 0x1b8a0 VADDSD 0x140(%RAX),%XMM13,%XMM3 |
(167) 0x1b8a8 VFMADD132SD %XMM4,%XMM12,%XMM4 |
(167) 0x1b8ad CMOVA %R13,%RDX |
(167) 0x1b8b1 MOV $0x6,%R13D |
(167) 0x1b8b7 VFMADD132SD %XMM3,%XMM4,%XMM3 |
(167) 0x1b8bc VADDSD 0x108(%RAX),%XMM15,%XMM4 |
(167) 0x1b8c4 VMINSD %XMM0,%XMM3,%XMM2 |
(167) 0x1b8c8 VCOMISD %XMM3,%XMM0 |
(167) 0x1b8cc VADDSD 0xc8(%RAX),%XMM14,%XMM0 |
(167) 0x1b8d4 VMULSD %XMM4,%XMM4,%XMM3 |
(167) 0x1b8d8 CMOVA %R13,%RDX |
(167) 0x1b8dc MOV $0x7,%R13D |
(167) 0x1b8e2 VFMADD132SD %XMM0,%XMM3,%XMM0 |
(167) 0x1b8e7 VFMADD132SD %XMM1,%XMM0,%XMM1 |
(167) 0x1b8ec VCOMISD %XMM1,%XMM2 |
(167) 0x1b8f0 VMINSD %XMM1,%XMM2,%XMM12 |
(167) 0x1b8f4 CMOVA %R13,%RDX |
(167) 0x1b8f8 VSQRTSD %XMM12,%XMM12,%XMM12 |
(167) 0x1b8fd LEA (%RAX,%RDX,8),%RDX |
(167) 0x1b901 VADDSD 0x90(%RDX),%XMM14,%XMM14 |
(167) 0x1b909 VMOVSD %XMM12,(%R10,%RCX,8) |
(167) 0x1b90f VMULSD %XMM11,%XMM14,%XMM2 |
(167) 0x1b914 VMOVSD %XMM2,(%R12,%RCX,8) |
(167) 0x1b91a VADDSD 0xd0(%RDX),%XMM15,%XMM15 |
(167) 0x1b922 VMULSD %XMM11,%XMM15,%XMM0 |
(167) 0x1b927 VMOVSD %XMM0,(%R8,%RCX,8) |
(167) 0x1b92d VADDSD 0x110(%RDX),%XMM13,%XMM13 |
(167) 0x1b935 VMULSD %XMM11,%XMM13,%XMM11 |
(167) 0x1b93a VMOVSD %XMM11,(%RDI,%RCX,8) |
(167) 0x1b93f INC %RCX |
(167) 0x1b942 CMP %R9,%RCX |
(167) 0x1b945 JNE 1b6a0 |
0x1b94b POP %RBX |
0x1b94c POP %R12 |
0x1b94e POP %R13 |
0x1b950 POP %R14 |
0x1b952 POP %R15 |
0x1b954 POP %RBP |
0x1b955 RET |
0x1b956 NOPW %CS:(%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►40.97+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►20.83+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►14.58+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:77 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::setA[...] | ParticleSet.cpp:259 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►11.81+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►5.56+ | qmcplusplus::DistanceTableAA<d[...] | DistanceTableAA.h:84 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.39+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | refwrap.h:346 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►1.39+ | qmcplusplus::DistanceTableBA<d[...] | DistanceTableBA.h:120 | libqmcparticle.so |
○ | qmcplusplus::ParticleSet::make[...] | stl_vector.h:989 | libqmcparticle.so |
○ | main._omp_fn.1 | stl_vector.h:1126 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Source file and lines | ParticleBConds.h:159-219 |
Module | libqmcparticle.so |
nb instructions | 40 |
nb uops | 40 |
loop length | 149 |
used x86 registers | 16 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.00 cycles |
front end | 10.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 8.00 | 8.00 | 6.00 | 3.00 | 3.00 | 7.00 |
cycles | 3.00 | 3.00 | 8.00 | 8.00 | 6.00 | 3.00 | 3.00 | 7.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 10.12 |
Stall cycles | 0.00 |
Front-end | 10.00 |
Dispatch | 8.00 |
Overall L1 | 10.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 9% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 8% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 10% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 8% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x8(%R8),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RSI),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x8(%RSI),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%RSI),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x8(%RDX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x18(%RDX),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
SAL $0x3,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
MOV 0x18(%R8),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
SAL $0x3,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
LEA (%R11,%RSI,1),%RBX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%R12,%RDI,1),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %R8,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RBX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
TEST %R9D,%R9D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JLE 1b94b <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x32b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVQ 0x310b(%RIP),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x30a3(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RCX,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOVSXD %R9D,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVSD 0x3085(%RIP),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV $0x2,%R15D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV $0x3,%R14D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Source file and lines | ParticleBConds.h:159-219 |
Module | libqmcparticle.so |
nb instructions | 40 |
nb uops | 40 |
loop length | 149 |
used x86 registers | 16 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.00 cycles |
front end | 10.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 8.00 | 8.00 | 6.00 | 3.00 | 3.00 | 7.00 |
cycles | 3.00 | 3.00 | 8.00 | 8.00 | 6.00 | 3.00 | 3.00 | 7.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 10.12 |
Stall cycles | 0.00 |
Front-end | 10.00 |
Dispatch | 8.00 |
Overall L1 | 10.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 9% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 8% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 10% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 8% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x8(%R8),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RSI),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x8(%RSI),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%RSI),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x8(%RDX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x18(%RDX),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
SAL $0x3,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
MOV 0x18(%R8),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
SAL $0x3,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
LEA (%R11,%RSI,1),%RBX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%R12,%RDI,1),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %R8,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD %RBX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
TEST %R9D,%R9D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JLE 1b94b <_ZNK11qmcplusplus10DTD_BCondsIdLj3ELi39EE16computeDistancesINS_10TinyVectorIdLj3EEENS_18VectorSoAContainerIdLj3ELm64ENS_10MallocatorIdLm64EEEEES8_EEvRKT_RKT0_PdRT1_iii.constprop.0+0x32b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVQ 0x310b(%RIP),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x30a3(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RCX,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOVSXD %R9D,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVSD 0x3085(%RIP),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV $0x2,%R15D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV $0x3,%R14D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼void qmcplusplus::DTD_BConds | 52.75 | 0.72 |
○Loop 167 - ParticleBConds.h:188-217 - libqmcparticle.so | 52.75 | 0.72 |