Function: miniqmcreference::einspline_spo_ref<double>::evaluate(qmcplusplus::ParticleSet const&, int ... | Module: exec | Source: einspline_spo_ref.hpp:172-189 [...] | Coverage: 33.09% |
---|
Function: miniqmcreference::einspline_spo_ref<double>::evaluate(qmcplusplus::ParticleSet const&, int ... | Module: exec | Source: einspline_spo_ref.hpp:172-189 [...] | Coverage: 33.09% |
---|
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineEvalHelper.hpp: 47 - 49 |
-------------------------------------------------------------------------------- |
47: T sf = std::floor(x); |
48: T dx2 = x - sf; |
49: int ind2 = std::min(std::max(0, static_cast<int>(sf)), nmax); |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Particle/ParticleSet.h: 217 - 217 |
-------------------------------------------------------------------------------- |
217: inline const PosType& activeR(int iat) const { return (activePtcl == iat) ? activePos : R[iat]; } |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorOps.h: 59 - 59 |
-------------------------------------------------------------------------------- |
59: for (unsigned d = 0; d < D; ++d) |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineData.hpp: 54 - 57 |
-------------------------------------------------------------------------------- |
54: a[0] = ((A00 * tx + A01) * tx + A02) * tx + A03; |
55: a[1] = ((A10 * tx + A11) * tx + A12) * tx + A13; |
56: a[2] = ((A20 * tx + A21) * tx + A22) * tx + A23; |
57: a[3] = ((A30 * tx + A31) * tx + A32) * tx + A33; |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 43 - 74 |
-------------------------------------------------------------------------------- |
43: y -= spline_m->y_grid.start; |
44: z -= spline_m->z_grid.start; |
45: T tx, ty, tz; |
46: int ix, iy, iz; |
47: spline2::getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1); |
48: spline2::getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1); |
49: spline2::getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1); |
[...] |
56: const intptr_t xs = spline_m->x_stride; |
57: const intptr_t ys = spline_m->y_stride; |
58: const intptr_t zs = spline_m->z_stride; |
[...] |
65: for (size_t i = 0; i < 4; i++) |
66: for (size_t j = 0; j < 4; j++) |
67: { |
68: const T pre00 = a[i] * b[j]; |
69: const T* restrict coefs = spline_m->coefs + (ix + i) * xs + (iy + j) * ys + iz * zs; |
70: #pragma omp simd aligned(coefs: QMC_SIMD_ALIGNMENT) simdlen(simdlen_) |
71: for (size_t n = 0; n < num_splines; n++) |
72: vals[n] += pre00 * |
73: (c[0] * coefs[n] + c[1] * coefs[n + zs] + c[2] * coefs[n + 2 * zs] + |
74: c[3] * coefs[n + 3 * zs]); |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/einspline_spo_ref.hpp: 172 - 189 |
-------------------------------------------------------------------------------- |
172: ScopedTimer local_timer(timer); |
173: |
174: auto u = Lattice.toUnit_floor(P.activeR(iat)); |
175: for (int i = 0; i < nBlocks; ++i) |
176: MultiBsplineEvalRef::evaluate_v(einsplines[i], u[0], u[1], u[2], psi[i].data(), nSplinesPerBlock); |
177: } |
178: |
179: inline void evaluate(const ParticleSet& P, int iat, ValueVector_t& psi_v) |
180: { |
181: evaluate_v(P, iat); |
182: |
183: for (int i = 0; i < nBlocks; ++i) |
184: { |
185: // in real simulation, phase needs to be applied. Here just fake computation |
186: const int first = i * nBlocks; |
187: std::copy_n(psi[i].data(), std::min((i + 1) * nSplinesPerBlock, OrbitalSetSize) - first, psi_v.data() + first); |
188: } |
189: } |
/usr/lib64/gcc/x86_64-pc-linux-gnu/13.1.1/../../../../include/c++/13.1.1/bits/stl_algo.h: 731 - 757 |
-------------------------------------------------------------------------------- |
731: { return std::copy(__first, __first + __n, __result); } |
[...] |
757: if (__n2 <= 0) |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Utilities/NewTimer.h: 242 - 249 |
-------------------------------------------------------------------------------- |
242: ScopeGuard(TIMER& t) : timer(t) { timer.start(); } |
[...] |
249: ~ScopeGuard() { timer.stop(); } |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Particle/Lattice/CrystalLattice.h: 191 - 191 |
-------------------------------------------------------------------------------- |
191: if (-std::numeric_limits<T1>::epsilon() < val_dot[i] && val_dot[i] < 0) |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorTensorOps.h: 150 - 152 |
-------------------------------------------------------------------------------- |
150: return TinyVector<Type_t, 3>(lhs[0] * rhs[0] + lhs[1] * rhs[3] + lhs[2] * rhs[6], |
151: lhs[0] * rhs[1] + lhs[1] * rhs[4] + lhs[2] * rhs[7], |
152: lhs[0] * rhs[2] + lhs[1] * rhs[5] + lhs[2] * rhs[8]); |
/usr/lib64/gcc/x86_64-pc-linux-gnu/13.1.1/../../../../include/c++/13.1.1/bits/stl_algobase.h: 238 - 931 |
-------------------------------------------------------------------------------- |
238: if (__b < __a) |
[...] |
398: { *__to = *__from; } |
[...] |
435: const ptrdiff_t _Num = __last - __first; |
436: if (__builtin_expect(_Num > 1, true)) |
437: __builtin_memmove(__result, __first, sizeof(_Tp) * _Num); |
[...] |
930: for (; __first != __last; ++__first) |
931: *__first = __tmp; |
/usr/lib64/gcc/x86_64-pc-linux-gnu/13.1.1/../../../../include/c++/13.1.1/bits/stl_vector.h: 1126 - 1258 |
-------------------------------------------------------------------------------- |
1126: return *(this->_M_impl._M_start + __n); |
[...] |
1258: { return _M_data_ptr(this->_M_impl._M_start); } |
0x437aa0 PUSH %RBP |
0x437aa1 MOV %RSP,%RBP |
0x437aa4 PUSH %R15 |
0x437aa6 PUSH %R14 |
0x437aa8 PUSH %R13 |
0x437aaa PUSH %R12 |
0x437aac PUSH %RBX |
0x437aad AND $-0x40,%RSP |
0x437ab1 SUB $0x2c0,%RSP |
0x437ab8 MOV %RCX,0x98(%RSP) |
0x437ac0 MOV %EDX,%R13D |
0x437ac3 MOV %RSI,%RBX |
0x437ac6 MOV %RDI,%R12 |
0x437ac9 MOV 0x348(%RDI),%RDI |
0x437ad0 MOV %RDI,0xa0(%RSP) |
0x437ad8 CALL 45de10 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> |
0x437add MOVSXD %R13D,%RAX |
0x437ae0 LEA (%RAX,%RAX,2),%RCX |
0x437ae4 SAL $0x3,%RCX |
0x437ae8 ADD 0x5e8(%RBX),%RCX |
0x437aef LEA 0x988(%RBX),%RDX |
0x437af6 CMP %EAX,0x984(%RBX) |
0x437afc CMOVNE %RCX,%RDX |
0x437b00 VMOVDDUP (%RDX),%XMM1 |
0x437b04 VMULPD 0xd0(%R12),%XMM1,%XMM0 |
0x437b0e VMOVDDUP 0x8(%RDX),%XMM2 |
0x437b13 VFMADD231PD 0xe8(%R12),%XMM2,%XMM0 |
0x437b1d VMOVDDUP 0x10(%RDX),%XMM3 |
0x437b22 VFMADD231PD 0x100(%R12),%XMM3,%XMM0 |
0x437b2c VFPCLASSPD $0x50,%XMM0,%K1 |
0x437b33 VCMPPD $0xe,0x5a1a2(%RIP){1to0},%XMM0,%K0{%K1} |
0x437b3e CMPL $0,0x30(%R12) |
0x437b44 JLE 438229 |
0x437b4a VMULSD 0xe0(%R12),%XMM1,%XMM1 |
0x437b54 VFMADD231SD 0xf8(%R12),%XMM2,%XMM1 |
0x437b5e VFMADD231SD 0x110(%R12),%XMM3,%XMM1 |
0x437b68 VROUNDPD $0x9,%XMM0,%XMM2 |
0x437b6e KNOTW %K0,%K1 |
0x437b72 VROUNDSD $0x9,%XMM1,%XMM1,%XMM3 |
0x437b78 VSUBPD %XMM2,%XMM0,%XMM0{%K1}{z} |
0x437b7e VMOVUPD %XMM0,0xe0(%RSP) |
0x437b87 VSUBSD %XMM3,%XMM1,%XMM3 |
0x437b8b VXORPD %XMM0,%XMM0,%XMM0 |
0x437b8f VCMPSD $0x1,%XMM0,%XMM1,%K1 |
0x437b96 VMOVAPD %XMM3,%XMM2 |
0x437b9a VCMPSD $0xe,0x5a13b(%RIP),%XMM1,%K2 |
0x437ba5 VMOVSD %XMM0,%XMM2,%XMM2{%K1} |
0x437bab VMOVSD %XMM2,%XMM3,%XMM3{%K2} |
0x437bb1 VMOVUPD %XMM3,0xd0(%RSP) |
0x437bba VMOVSD 0x5a126(%RIP),%XMM9 |
0x437bc2 VMOVSD 0x56446(%RIP),%XMM10 |
0x437bca XOR %EDX,%EDX |
0x437bcc VMOVDDUP 0x5643c(%RIP),%XMM0 |
0x437bd4 VMOVUPD %XMM0,0xc0(%RSP) |
0x437bdd MOV %R12,0xa8(%RSP) |
0x437be5 JMP 437c19 |
0x437be7 NOPW (%RAX,%RAX,1) |
(669) 0x437bf0 MOV 0xb0(%RSP),%RSP |
(669) 0x437bf8 MOV 0xb8(%RSP),%RDX |
(669) 0x437c00 INC %RDX |
(669) 0x437c03 MOV 0xa8(%RSP),%R12 |
(669) 0x437c0b MOVSXD 0x30(%R12),%RAX |
(669) 0x437c10 CMP %RAX,%RDX |
(669) 0x437c13 JGE 438229 |
(669) 0x437c19 MOV %RSP,0xb0(%RSP) |
(669) 0x437c21 MOV 0x2e8(%R12),%RAX |
(669) 0x437c29 MOV 0x300(%R12),%RCX |
(669) 0x437c31 MOV (%RAX,%RDX,8),%R14 |
(669) 0x437c35 MOV %RDX,0xb8(%RSP) |
(669) 0x437c3d LEA (%RDX,%RDX,2),%RAX |
(669) 0x437c41 MOV (%RCX,%RAX,8),%RBX |
(669) 0x437c45 MOVSXD 0x40(%R12),%R13 |
(669) 0x437c4a VMOVUPD 0xd0(%RSP),%XMM0 |
(669) 0x437c53 VSUBSD 0x78(%R14),%XMM0,%XMM0 |
(669) 0x437c59 VMULSD 0x98(%R14),%XMM0,%XMM0 |
(669) 0x437c62 VROUNDSD $0x9,%XMM0,%XMM0,%XMM1 |
(669) 0x437c68 VCVTTSD2SI %XMM1,%EAX |
(669) 0x437c6c MOV %EAX,%ECX |
(669) 0x437c6e SAR $0x1f,%ECX |
(669) 0x437c71 ANDN %EAX,%ECX,%EAX |
(669) 0x437c76 MOV 0x88(%R14),%R12D |
(669) 0x437c7d DEC %R12D |
(669) 0x437c80 CMP %EAX,%R12D |
(669) 0x437c83 CMOVGE %EAX,%R12D |
(669) 0x437c87 VSUBSD %XMM1,%XMM0,%XMM21 |
(669) 0x437c8d VMULSD %XMM9,%XMM21,%XMM19 |
(669) 0x437c93 VSUBSD %XMM19,%XMM10,%XMM17 |
(669) 0x437c99 VFMADD213SD 0x563d5(%RIP),%XMM21,%XMM17 |
(669) 0x437ca3 VMULSD %XMM10,%XMM21,%XMM22 |
(669) 0x437ca9 VADDSD 0x5637d(%RIP),%XMM22,%XMM18 |
(669) 0x437cb3 VMULSD %XMM21,%XMM21,%XMM20 |
(669) 0x437cb9 VFMADD213SD 0x5a02d(%RIP),%XMM20,%XMM18 |
(669) 0x437cc3 VMOVSD 0x28(%R14),%XMM0 |
(669) 0x437cc9 VMOVSD 0x48(%R14),%XMM1 |
(669) 0x437ccf VMOVHPD 0x50(%R14),%XMM0,%XMM0 |
(669) 0x437cd5 VMOVHPD 0x70(%R14),%XMM1,%XMM1 |
(669) 0x437cdb VMOVUPD 0xe0(%RSP),%XMM2 |
(669) 0x437ce4 VSUBPD %XMM0,%XMM2,%XMM0 |
(669) 0x437ce8 VMULPD %XMM0,%XMM1,%XMM0 |
(669) 0x437cec VRNDSCALEPD $0x9,%XMM0,%XMM23 |
(669) 0x437cf3 VMOVD 0x38(%R14),%XMM1 |
(669) 0x437cf9 VSUBPD %XMM23,%XMM0,%XMM0 |
(669) 0x437cff VPINSRD $0x1,0x60(%R14),%XMM1,%XMM24 |
(669) 0x437d07 VPERMILPD $0x1,%XMM0,%XMM1 |
(669) 0x437d0d VMULSD %XMM0,%XMM9,%XMM2 |
(669) 0x437d11 VSUBSD %XMM2,%XMM10,%XMM3 |
(669) 0x437d15 VMULPD %XMM0,%XMM0,%XMM4 |
(669) 0x437d19 VMOVDDUP %XMM0,%XMM5 |
(669) 0x437d1d VMOVUPD 0xc0(%RSP),%XMM11 |
(669) 0x437d26 VBLENDPD $0x1,%XMM3,%XMM11,%XMM3 |
(669) 0x437d2c VMULPD %XMM3,%XMM5,%XMM3 |
(669) 0x437d30 VMOVUPD 0x5a718(%RIP),%XMM8 |
(669) 0x437d38 VADDPD %XMM3,%XMM8,%XMM5 |
(669) 0x437d3c VUNPCKLPD %XMM4,%XMM0,%XMM6 |
(669) 0x437d40 VMOVUPD 0x5a718(%RIP),%XMM7 |
(669) 0x437d48 VFMADD213PD %XMM7,%XMM5,%XMM6 |
(669) 0x437d4d VMOVUPD %XMM6,0x120(%RSP) |
(669) 0x437d56 VPERMILPD $0x1,%XMM3,%XMM3 |
(669) 0x437d5c VSUBSD %XMM3,%XMM10,%XMM3 |
(669) 0x437d60 VFMADD213SD %XMM10,%XMM0,%XMM3 |
(669) 0x437d65 VFMADD213SD %XMM9,%XMM0,%XMM3 |
(669) 0x437d6a VMOVSD %XMM3,0x130(%RSP) |
(669) 0x437d73 VMULSD %XMM4,%XMM2,%XMM2 |
(669) 0x437d77 VMOVSD %XMM2,0x138(%RSP) |
(669) 0x437d80 VMULSD %XMM1,%XMM9,%XMM2 |
(669) 0x437d84 VSUBSD %XMM2,%XMM10,%XMM3 |
(669) 0x437d88 VMULSD %XMM1,%XMM1,%XMM4 |
(669) 0x437d8c VPERMILPD $0x3,%XMM0,%XMM5 |
(669) 0x437d92 VBLENDPD $0x1,%XMM3,%XMM11,%XMM3 |
(669) 0x437d98 VMULPD %XMM3,%XMM5,%XMM3 |
(669) 0x437d9c VADDPD %XMM3,%XMM8,%XMM5 |
(669) 0x437da0 VSHUFPD $0x1,%XMM4,%XMM0,%XMM0 |
(669) 0x437da5 VFMADD213PD %XMM7,%XMM5,%XMM0 |
(669) 0x437daa VMOVUPD %XMM0,0x100(%RSP) |
(669) 0x437db3 VPERMILPD $0x1,%XMM3,%XMM0 |
(669) 0x437db9 VSUBSD %XMM0,%XMM10,%XMM0 |
(669) 0x437dbd VFMADD213SD %XMM10,%XMM1,%XMM0 |
(669) 0x437dc2 VFMADD213SD %XMM9,%XMM1,%XMM0 |
(669) 0x437dc7 VMOVSD %XMM0,0x110(%RSP) |
(669) 0x437dd0 VMULSD %XMM4,%XMM2,%XMM0 |
(669) 0x437dd4 VMOVSD %XMM0,0x118(%RSP) |
(669) 0x437ddd VMOVDQU 0x10(%R14),%XMM4 |
(669) 0x437de3 MOV 0x20(%R14),%R15 |
(669) 0x437de7 TEST %R13,%R13 |
(669) 0x437dea JE 437ef7 |
(669) 0x437df0 LEA (,%R13,8),%RDX |
(669) 0x437df8 MOV %RBX,%RDI |
(669) 0x437dfb XOR %ESI,%ESI |
(669) 0x437dfd VMOVUPS %ZMM12,0x240(%RSP) |
(669) 0x437e05 VMOVUPS %ZMM13,0x200(%RSP) |
(669) 0x437e0d VMOVUPS %ZMM14,0x1c0(%RSP) |
(669) 0x437e15 VMOVUPS %ZMM15,0x180(%RSP) |
(669) 0x437e1d VMOVUPS %ZMM16,0x140(%RSP) |
(669) 0x437e25 VMOVUPD %XMM17,0x80(%RSP) |
(669) 0x437e2d VMOVUPD %XMM18,0x70(%RSP) |
(669) 0x437e35 VMOVSD %XMM19,0x48(%RSP) |
(669) 0x437e3d VMOVSD %XMM20,0x40(%RSP) |
(669) 0x437e45 VMOVSD %XMM21,0x38(%RSP) |
(669) 0x437e4d VMOVSD %XMM22,0x30(%RSP) |
(669) 0x437e55 VMOVUPD %XMM23,0x50(%RSP) |
(669) 0x437e5d VMOVDQU64 %XMM24,0x60(%RSP) |
(669) 0x437e65 VMOVDQU %XMM4,0xf0(%RSP) |
(669) 0x437e6e VZEROUPPER |
(669) 0x437e71 CALL 47ebf0 <_intel_fast_memset> |
(669) 0x437e76 VMOVDQU 0xf0(%RSP),%XMM4 |
(669) 0x437e7f VMOVDQU64 0x60(%RSP),%XMM24 |
(669) 0x437e87 VMOVUPD 0x50(%RSP),%XMM23 |
(669) 0x437e8f VMOVSD 0x30(%RSP),%XMM22 |
(669) 0x437e97 VMOVSD 0x38(%RSP),%XMM21 |
(669) 0x437e9f VMOVSD 0x40(%RSP),%XMM20 |
(669) 0x437ea7 VMOVSD 0x48(%RSP),%XMM19 |
(669) 0x437eaf VMOVUPD 0x70(%RSP),%XMM18 |
(669) 0x437eb7 VMOVUPD 0x80(%RSP),%XMM17 |
(669) 0x437ebf VMOVUPD 0x140(%RSP),%ZMM16 |
(669) 0x437ec7 VMOVUPD 0x180(%RSP),%ZMM15 |
(669) 0x437ecf VMOVUPD 0x1c0(%RSP),%ZMM14 |
(669) 0x437ed7 VMOVUPD 0x200(%RSP),%ZMM13 |
(669) 0x437edf VMOVUPD 0x240(%RSP),%ZMM12 |
(669) 0x437ee7 VMOVSD 0x56121(%RIP),%XMM10 |
(669) 0x437eef VMOVSD 0x59df1(%RIP),%XMM9 |
(669) 0x437ef7 MOV %R13D,%EAX |
(669) 0x437efa CMP $0x2,%R13 |
(669) 0x437efe MOV $0x1,%ECX |
(669) 0x437f03 CMOVB %RCX,%R13 |
(669) 0x437f07 TEST %EAX,%EAX |
(669) 0x437f09 JE 437bf0 |
(669) 0x437f0f VFMADD213SD %XMM9,%XMM21,%XMM17 |
(669) 0x437f15 VSUBSD %XMM22,%XMM10,%XMM2 |
(669) 0x437f1b VFMADD213SD %XMM10,%XMM21,%XMM2 |
(669) 0x437f21 VFMADD213SD %XMM9,%XMM21,%XMM2 |
(669) 0x437f27 VPCMPEQD %XMM0,%XMM0,%XMM0 |
(669) 0x437f2b VPADDD %XMM0,%XMM24,%XMM0 |
(669) 0x437f31 VCVTTPD2DQ %XMM23,%XMM1 |
(669) 0x437f37 VPMAXSD 0x568c0(%RIP),%XMM1,%XMM1 |
(669) 0x437f40 VPMINSD %XMM1,%XMM0,%XMM0 |
(669) 0x437f45 MOVSXD %R12D,%R8 |
(669) 0x437f48 MOV %R15,%RSI |
(669) 0x437f4b IMUL %R8,%RSI |
(669) 0x437f4f LEA (%R15,%R15,1),%R12 |
(669) 0x437f53 MOV 0x8(%R14),%RCX |
(669) 0x437f57 VPMOVSXDQ %XMM0,%XMM0 |
(669) 0x437f5c LEA (%R15,%R15,2),%R10 |
(669) 0x437f60 VMULSD %XMM20,%XMM19,%XMM3 |
(669) 0x437f66 VPMULLQ %XMM0,%XMM4,%XMM0 |
(669) 0x437f6c VPEXTRQ $0x1,%XMM0,%R9 |
(669) 0x437f72 LEA (%RSI,%R9,1),%RDI |
(669) 0x437f76 VMOVQ %XMM0,%RAX |
(669) 0x437f7b ADD %RAX,%RDI |
(669) 0x437f7e MOV %RDI,0x1c0(%RSP) |
(669) 0x437f86 VMOVQ %XMM4,0x50(%RSP) |
(669) 0x437f8c VPEXTRQ $0x1,%XMM4,%R14 |
(669) 0x437f92 MOV %R13,%R11 |
(669) 0x437f95 AND $-0x8,%R11 |
(669) 0x437f99 ADD %RAX,%R9 |
(669) 0x437f9c LEA 0x3(%R8),%RAX |
(669) 0x437fa0 IMUL %R15,%RAX |
(669) 0x437fa4 LEA 0x2(%R8),%RDI |
(669) 0x437fa8 IMUL %R15,%RDI |
(669) 0x437fac INC %R8 |
(669) 0x437faf IMUL %R15,%R8 |
(669) 0x437fb3 LEA (%R15,%RSI,1),%RDX |
(669) 0x437fb7 ADD %R9,%RDX |
(669) 0x437fba MOV %RDX,0x140(%RSP) |
(669) 0x437fc2 ADD %RSI,%R12 |
(669) 0x437fc5 ADD %R9,%R12 |
(669) 0x437fc8 MOV %R12,0x240(%RSP) |
(669) 0x437fd0 ADD %RSI,%R10 |
(669) 0x437fd3 ADD %R9,%R10 |
(669) 0x437fd6 MOV %R10,0x200(%RSP) |
(669) 0x437fde ADD %R9,%RAX |
(669) 0x437fe1 ADD %R9,%RDI |
(669) 0x437fe4 ADD %R9,%R8 |
(669) 0x437fe7 ADD %RSI,%R9 |
(669) 0x437fea LEA -0x1(%R11),%RSI |
(669) 0x437fee VBROADCASTSD %XMM17,%ZMM0 |
(669) 0x437ff4 VBROADCASTSD %XMM18,%ZMM1 |
(669) 0x437ffa VBROADCASTSD %XMM2,%ZMM2 |
(669) 0x438000 VBROADCASTSD %XMM3,%ZMM3 |
(669) 0x438006 MOV %R13,0x80(%RSP) |
(669) 0x43800e VPBROADCASTQ %R13,%ZMM4 |
(669) 0x438014 LEA (%RCX,%RAX,8),%RDX |
(669) 0x438018 LEA (%RCX,%RDI,8),%R15 |
(669) 0x43801c LEA (%RCX,%R8,8),%R12 |
(669) 0x438020 LEA (%RCX,%R9,8),%R8 |
(669) 0x438024 MOV 0x50(%RSP),%RAX |
(669) 0x438029 LEA (,%RAX,8),%RAX |
(669) 0x438031 MOV %RAX,0x60(%RSP) |
(669) 0x438036 MOV %R14,0x180(%RSP) |
(669) 0x43803e LEA (,%R14,8),%R14 |
(669) 0x438046 XOR %EAX,%EAX |
(669) 0x438048 VMOVDQU64 0x59dae(%RIP),%ZMM11 |
(669) 0x438052 JMP 43809b |
0x438054 NOPW %CS:(%RAX,%RAX,1) |
(670) 0x438060 MOV 0x30(%RSP),%RAX |
(670) 0x438065 LEA 0x1(%RAX),%R9 |
(670) 0x438069 MOV 0x70(%RSP),%RDX |
(670) 0x43806e MOV 0x60(%RSP),%RDI |
(670) 0x438073 ADD %RDI,%RDX |
(670) 0x438076 MOV 0x48(%RSP),%R15 |
(670) 0x43807b ADD %RDI,%R15 |
(670) 0x43807e MOV 0x40(%RSP),%R12 |
(670) 0x438083 ADD %RDI,%R12 |
(670) 0x438086 MOV 0x38(%RSP),%R8 |
(670) 0x43808b ADD %RDI,%R8 |
(670) 0x43808e CMP $0x3,%RAX |
(670) 0x438092 MOV %R9,%RAX |
(670) 0x438095 JE 437bf0 |
(670) 0x43809b VMOVSD 0x120(%RSP,%RAX,8),%XMM5 |
(670) 0x4380a4 MOV 0x50(%RSP),%R13 |
(670) 0x4380a9 MOV %RAX,0x30(%RSP) |
(670) 0x4380ae IMUL %RAX,%R13 |
(670) 0x4380b2 MOV %R8,0x38(%RSP) |
(670) 0x4380b7 MOV %R12,0x40(%RSP) |
(670) 0x4380bc MOV %R15,0x48(%RSP) |
(670) 0x4380c1 MOV %RDX,0x70(%RSP) |
(670) 0x4380c6 MOV %RDX,%R9 |
(670) 0x4380c9 XOR %R10D,%R10D |
(670) 0x4380cc JMP 4381bd |
0x4380d1 NOPW %CS:(%RAX,%RAX,1) |
(671) 0x4380e0 MOV 0x180(%RSP),%RDI |
(671) 0x4380e8 IMUL %R10,%RDI |
(671) 0x4380ec ADD %R13,%RDI |
(671) 0x4380ef VPBROADCASTQ %RAX,%ZMM7 |
(671) 0x4380f5 VPORQ %ZMM11,%ZMM7,%ZMM7 |
(671) 0x4380fb VPCMPLTUQ %ZMM4,%ZMM7,%K1 |
(671) 0x438102 MOV 0x1c0(%RSP),%RDX |
(671) 0x43810a ADD %RAX,%RDX |
(671) 0x43810d ADD %RDI,%RDX |
(671) 0x438110 VMOVUPD (%RCX,%RDX,8),%ZMM7{%K1}{z} |
(671) 0x438117 VMOVAPD %ZMM7,%ZMM16{%K1} |
(671) 0x43811d VMULPD %ZMM0,%ZMM16,%ZMM7 |
(671) 0x438123 MOV 0x140(%RSP),%RDX |
(671) 0x43812b ADD %RAX,%RDX |
(671) 0x43812e ADD %RDI,%RDX |
(671) 0x438131 VMOVUPD (%RCX,%RDX,8),%ZMM8{%K1}{z} |
(671) 0x438138 VMOVAPD %ZMM8,%ZMM15{%K1} |
(671) 0x43813e VFMADD231PD %ZMM1,%ZMM15,%ZMM7 |
(671) 0x438144 MOV 0x240(%RSP),%RDX |
(671) 0x43814c ADD %RAX,%RDX |
(671) 0x43814f ADD %RDI,%RDX |
(671) 0x438152 VMOVUPD (%RCX,%RDX,8),%ZMM8{%K1}{z} |
(671) 0x438159 VMOVAPD %ZMM8,%ZMM14{%K1} |
(671) 0x43815f VFMADD231PD %ZMM2,%ZMM14,%ZMM7 |
(671) 0x438165 MOV 0x200(%RSP),%RDX |
(671) 0x43816d ADD %RAX,%RDX |
(671) 0x438170 ADD %RDX,%RDI |
(671) 0x438173 VMOVUPD (%RCX,%RDI,8),%ZMM8{%K1}{z} |
(671) 0x43817a VMOVAPD %ZMM8,%ZMM13{%K1} |
(671) 0x438180 VFMADD231PD %ZMM13,%ZMM3,%ZMM7 |
(671) 0x438186 VMOVUPD (%RBX,%RAX,8),%ZMM8{%K1}{z} |
(671) 0x43818d VMOVAPD %ZMM8,%ZMM12{%K1} |
(671) 0x438193 VFMADD213PD %ZMM12,%ZMM6,%ZMM7 |
(671) 0x438199 VMOVUPD %ZMM7,(%RBX,%RAX,8){%K1} |
(671) 0x4381a0 LEA 0x1(%R10),%RAX |
(671) 0x4381a4 ADD %R14,%R9 |
(671) 0x4381a7 ADD %R14,%R15 |
(671) 0x4381aa ADD %R14,%R12 |
(671) 0x4381ad ADD %R14,%R8 |
(671) 0x4381b0 CMP $0x3,%R10 |
(671) 0x4381b4 MOV %RAX,%R10 |
(671) 0x4381b7 JE 438060 |
(671) 0x4381bd VMULSD 0x100(%RSP,%R10,8),%XMM5,%XMM6 |
(671) 0x4381c7 VBROADCASTSD %XMM6,%ZMM6 |
(671) 0x4381cd XOR %EAX,%EAX |
(671) 0x4381cf TEST %R11,%R11 |
(671) 0x4381d2 JE 4380e0 |
(671) 0x4381d8 NOPL (%RAX,%RAX,1) |
(672) 0x4381e0 VMULPD (%R8,%RAX,8),%ZMM0,%ZMM7 |
(672) 0x4381e7 VFMADD231PD (%R12,%RAX,8),%ZMM1,%ZMM7 |
(672) 0x4381ee VFMADD231PD (%R15,%RAX,8),%ZMM2,%ZMM7 |
(672) 0x4381f5 VFMADD231PD (%R9,%RAX,8),%ZMM3,%ZMM7 |
(672) 0x4381fc VFMADD213PD (%RBX,%RAX,8),%ZMM6,%ZMM7 |
(672) 0x438203 VMOVUPD %ZMM7,(%RBX,%RAX,8) |
(672) 0x43820a ADD $0x8,%RAX |
(672) 0x43820e CMP %RSI,%RAX |
(672) 0x438211 JBE 4381e0 |
(671) 0x438213 MOV %R11,%RAX |
(671) 0x438216 CMP %R11,0x80(%RSP) |
(671) 0x43821e JNE 4380e0 |
(671) 0x438224 JMP 4381a0 |
0x438229 MOV 0xa0(%RSP),%RDI |
0x438231 VZEROUPPER |
0x438234 CALL 45e030 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> |
0x438239 MOV 0x30(%R12),%EAX |
0x43823e TEST %EAX,%EAX |
0x438240 MOV 0x98(%RSP),%R15 |
0x438248 JLE 4382c2 |
0x43824a XOR %EBX,%EBX |
0x43824c XOR %R14D,%R14D |
0x43824f JMP 43827c |
0x438251 NOPW %CS:(%RAX,%RAX,1) |
(668) 0x438260 MOV %ECX,%EDX |
(668) 0x438262 SAL $0x3,%RDX |
(668) 0x438266 CALL 404090 <memmove@plt> |
(668) 0x43826b MOV 0x30(%R12),%EAX |
(668) 0x438270 MOVSXD %EAX,%RCX |
(668) 0x438273 ADD $0x18,%RBX |
(668) 0x438277 CMP %RCX,%R14 |
(668) 0x43827a JGE 4382c2 |
(668) 0x43827c MOV %R14D,%EDX |
(668) 0x43827f IMUL %EAX,%EDX |
(668) 0x438282 INC %R14 |
(668) 0x438285 MOV 0x40(%R12),%ECX |
(668) 0x43828a IMUL %R14D,%ECX |
(668) 0x43828e MOV 0x8(%R12),%ESI |
(668) 0x438293 CMP %ECX,%ESI |
(668) 0x438295 CMOVL %ESI,%ECX |
(668) 0x438298 SUB %EDX,%ECX |
(668) 0x43829a JLE 438270 |
(668) 0x43829c MOV 0x300(%R12),%RSI |
(668) 0x4382a4 MOV (%RSI,%RBX,1),%RSI |
(668) 0x4382a8 MOVSXD %EDX,%RDI |
(668) 0x4382ab SAL $0x3,%RDI |
(668) 0x4382af ADD 0x18(%R15),%RDI |
(668) 0x4382b3 CMP $0x1,%ECX |
(668) 0x4382b6 JNE 438260 |
(668) 0x4382b8 VMOVSD (%RSI),%XMM0 |
(668) 0x4382bc VMOVSD %XMM0,(%RDI) |
(668) 0x4382c0 JMP 438270 |
0x4382c2 LEA -0x28(%RBP),%RSP |
0x4382c6 POP %RBX |
0x4382c7 POP %R12 |
0x4382c9 POP %R13 |
0x4382cb POP %R14 |
0x4382cd POP %R15 |
0x4382cf POP %RBP |
0x4382d0 RET |
0x4382d1 MOV %RAX,%RDI |
0x4382d4 CALL 40b670 <__clang_call_terminate> |
0x4382d9 NOPL (%RAX) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:194 | exec |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:214 | exec |
○ | main.extracted.104 | stl_vector.h:1126 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Source file and lines | einspline_spo_ref.hpp:172-189 |
Module | exec |
nb instructions | 81 |
nb uops | 91 |
loop length | 448 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 22.75 cycles |
front end | 22.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 9.75 | 9.75 | 14.50 | 14.17 | 15.00 | 9.75 | 9.75 | 14.33 |
cycles | 9.75 | 9.75 | 14.50 | 14.17 | 15.00 | 9.75 | 9.75 | 14.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 21.73 |
Stall cycles | 0.00 |
Front-end | 22.75 |
Dispatch | 15.00 |
Overall L1 | 22.75 |
all | 9% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 16% |
all | 44% |
load | 28% |
store | 100% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 50% |
all | 34% |
load | 22% |
store | 50% |
mul | 50% |
add-sub | 33% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 37% |
all | 11% |
load | 9% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 18% |
load | 16% |
store | 25% |
mul | 18% |
add-sub | 18% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
all | 16% |
load | 14% |
store | 18% |
mul | 18% |
add-sub | 16% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 15% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
AND $-0x40,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
SUB $0x2c0,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %RCX,0x98(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %EDX,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RDI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x348(%RDI),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RDI,0xa0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
CALL 45de10 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOVSXD %R13D,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%RAX,%RAX,2),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
SAL $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
ADD 0x5e8(%RBX),%RCX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
LEA 0x988(%RBX),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %EAX,0x984(%RBX) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
CMOVNE %RCX,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VMOVDDUP (%RDX),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD 0xd0(%R12),%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP 0x8(%RDX),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0xe8(%R12),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP 0x10(%RDX),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x100(%R12),%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFPCLASSPD $0x50,%XMM0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCMPPD $0xe,0x5a1a2(%RIP){1to0},%XMM0,%K0{%K1} | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
CMPL $0,0x30(%R12) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 438229 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMULSD 0xe0(%R12),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xf8(%R12),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x110(%R12),%XMM3,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VROUNDPD $0x9,%XMM0,%XMM2 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
KNOTW %K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VROUNDSD $0x9,%XMM1,%XMM1,%XMM3 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBPD %XMM2,%XMM0,%XMM0{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM0,0xe0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VSUBSD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VXORPD %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VCMPSD $0x1,%XMM0,%XMM1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVAPD %XMM3,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VCMPSD $0xe,0x5a13b(%RIP),%XMM1,%K2 | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVSD %XMM0,%XMM2,%XMM2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM2,%XMM3,%XMM3{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVUPD %XMM3,0xd0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD 0x5a126(%RIP),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x56446(%RIP),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVDDUP 0x5643c(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM0,0xc0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R12,0xa8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
JMP 437c19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0xa0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CALL 45e030 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0x30(%R12),%EAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
TEST %EAX,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x98(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
JLE 4382c2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %R14D,%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 43827c | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
LEA -0x28(%RBP),%RSP | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
MOV %RAX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 40b670 <__clang_call_terminate> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Source file and lines | einspline_spo_ref.hpp:172-189 |
Module | exec |
nb instructions | 81 |
nb uops | 91 |
loop length | 448 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
ADD-SUB / MUL ratio | 1.00 |
micro-operation queue | 22.75 cycles |
front end | 22.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 9.75 | 9.75 | 14.50 | 14.17 | 15.00 | 9.75 | 9.75 | 14.33 |
cycles | 9.75 | 9.75 | 14.50 | 14.17 | 15.00 | 9.75 | 9.75 | 14.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 21.73 |
Stall cycles | 0.00 |
Front-end | 22.75 |
Dispatch | 15.00 |
Overall L1 | 22.75 |
all | 9% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 16% |
all | 44% |
load | 28% |
store | 100% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 50% |
all | 34% |
load | 22% |
store | 50% |
mul | 50% |
add-sub | 33% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 37% |
all | 11% |
load | 9% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 18% |
load | 16% |
store | 25% |
mul | 18% |
add-sub | 18% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
all | 16% |
load | 14% |
store | 18% |
mul | 18% |
add-sub | 16% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 15% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
AND $-0x40,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
SUB $0x2c0,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %RCX,0x98(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %EDX,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RDI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x348(%RDI),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RDI,0xa0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
CALL 45de10 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOVSXD %R13D,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%RAX,%RAX,2),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
SAL $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
ADD 0x5e8(%RBX),%RCX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
LEA 0x988(%RBX),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %EAX,0x984(%RBX) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
CMOVNE %RCX,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VMOVDDUP (%RDX),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD 0xd0(%R12),%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP 0x8(%RDX),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0xe8(%R12),%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP 0x10(%RDX),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PD 0x100(%R12),%XMM3,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFPCLASSPD $0x50,%XMM0,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCMPPD $0xe,0x5a1a2(%RIP){1to0},%XMM0,%K0{%K1} | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
CMPL $0,0x30(%R12) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JLE 438229 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMULSD 0xe0(%R12),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xf8(%R12),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x110(%R12),%XMM3,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VROUNDPD $0x9,%XMM0,%XMM2 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
KNOTW %K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VROUNDSD $0x9,%XMM1,%XMM1,%XMM3 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBPD %XMM2,%XMM0,%XMM0{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM0,0xe0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VSUBSD %XMM3,%XMM1,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VXORPD %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VCMPSD $0x1,%XMM0,%XMM1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVAPD %XMM3,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VCMPSD $0xe,0x5a13b(%RIP),%XMM1,%K2 | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVSD %XMM0,%XMM2,%XMM2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM2,%XMM3,%XMM3{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVUPD %XMM3,0xd0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD 0x5a126(%RIP),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x56446(%RIP),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVDDUP 0x5643c(%RIP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM0,0xc0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R12,0xa8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
JMP 437c19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0xa0(%RSP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CALL 45e030 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0x30(%R12),%EAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
TEST %EAX,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x98(%RSP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
JLE 4382c2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %R14D,%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 43827c | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
LEA -0x28(%RBP),%RSP | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
MOV %RAX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 40b670 <__clang_call_terminate> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼miniqmcreference::einspline_spo_ref | 33.09 | 0.22 |
○Loop 668 - stl_algobase.h:238-437 - exec | 0 | 0 |
▼Loop 669 - MultiBsplineRef.hpp:43-74 - exec– | 0 | 0 |
▼Loop 670 - MultiBsplineRef.hpp:65-74 - exec– | 0 | 0 |
▼Loop 671 - MultiBsplineRef.hpp:66-74 - exec– | 0 | 0 |
○Loop 672 - MultiBsplineRef.hpp:70-73 - exec | 33.09 | 0.22 |