Loop Id: 731 | Module: libqmcwfs.so | Source: OneBodyJastrowRef.h:134-155 [...] | Coverage: 0.02% |
---|
Loop Id: 731 | Module: libqmcwfs.so | Source: OneBodyJastrowRef.h:134-155 [...] | Coverage: 0.02% |
---|
0x3e3c0 VMOVSD -0x48(%RBP),%XMM0 |
0x3e3c5 VSUBSD %XMM3,%XMM0,%XMM0 |
0x3e3c9 CALL 4ecf0 <@plt_start@+0x640> |
0x3e3ce MOV -0x40(%RBP),%RCX |
0x3e3d2 MOV (%RCX),%RAX |
0x3e3d5 VMOVSD %XMM0,(%RAX,%R12,8) |
0x3e3db INC %R12 |
0x3e3de MOV 0x8(%RCX),%RCX |
0x3e3e2 SUB %RAX,%RCX |
0x3e3e5 SAR $0x3,%RCX |
0x3e3e9 CMP %R12,%RCX |
0x3e3ec JBE 3e5f0 |
0x3e3f2 MOV -0x38(%RBP),%RDI |
0x3e3f6 MOV 0xf0(%R15),%RCX |
0x3e3fd MOV 0xa8(%R15),%ESI |
0x3e404 MOVSXD 0x2a0(%RDI),%RAX |
0x3e40b VMOVSD (%RCX,%RAX,8),%XMM0 |
0x3e410 VMOVSD %XMM0,-0x48(%RBP) |
0x3e415 CALL 4ea70 <@plt_start@+0x3c0> |
0x3e41a MOV 0x48(%RAX),%RAX |
0x3e41e LEA (%R12,%R12,4),%RCX |
0x3e422 MOV 0x18(%RAX,%RCX,8),%R13 |
0x3e427 MOV 0x98(%R15),%EAX |
0x3e42e TEST %EAX,%EAX |
0x3e430 JLE 3e4b0 |
0x3e432 VXORPD %XMM3,%XMM3,%XMM3 |
0x3e436 XOR %EBX,%EBX |
0x3e438 JMP 3e44f |
(733) 0x3e440 MOVSXD %EAX,%RCX |
(733) 0x3e443 INC %RBX |
(733) 0x3e446 CMP %RCX,%RBX |
(733) 0x3e449 JGE 3e3c0 |
(733) 0x3e44f MOV 0x1c8(%R15),%RCX |
(733) 0x3e456 MOV (%RCX,%RBX,8),%RDI |
(733) 0x3e45a TEST %RDI,%RDI |
(733) 0x3e45d JE 3e440 |
(733) 0x3e45f MOV 0xa0(%R15),%RAX |
(733) 0x3e466 MOV 0x148(%R15),%R9 |
(733) 0x3e46d MOV $-0x1,%ESI |
(733) 0x3e472 MOV %R13,%R8 |
(733) 0x3e475 VMOVSD %XMM3,-0x30(%RBP) |
(733) 0x3e47a MOV 0x268(%RAX),%RAX |
(733) 0x3e481 MOV 0x18(%RAX),%RAX |
(733) 0x3e485 MOV (%RAX,%RBX,4),%EDX |
(733) 0x3e488 MOV 0x4(%RAX,%RBX,4),%ECX |
(733) 0x3e48c CALL 307d0 <_ZNK11qmcplusplus14BsplineFunctorIdE9evaluateVEiiiPKdPd> |
(733) 0x3e491 VMOVSD -0x30(%RBP),%XMM3 |
(733) 0x3e496 MOV 0x98(%R15),%EAX |
(733) 0x3e49d VADDSD %XMM3,%XMM0,%XMM3 |
(733) 0x3e4a1 JMP 3e440 |
0x3e4b0 MOV 0x90(%R15),%EAX |
0x3e4b7 VXORPD %XMM3,%XMM3,%XMM3 |
0x3e4bb TEST %EAX,%EAX |
0x3e4bd JLE 3e3c0 |
0x3e4c3 XOR %R14D,%R14D |
0x3e4c6 JMP 3e4e3 |
(732) 0x3e4d0 VADDSD %XMM3,%XMM0,%XMM3 |
(732) 0x3e4d4 INC %R14 |
(732) 0x3e4d7 MOVSXD %EAX,%RCX |
(732) 0x3e4da CMP %RCX,%R14 |
(732) 0x3e4dd JGE 3e3c0 |
(732) 0x3e4e3 MOV 0xa0(%R15),%RCX |
(732) 0x3e4ea MOV 0x1c8(%R15),%RDX |
(732) 0x3e4f1 MOV 0x18(%RCX),%RCX |
(732) 0x3e4f5 MOVSXD (%RCX,%R14,4),%RCX |
(732) 0x3e4f9 MOV (%RDX,%RCX,8),%RBX |
(732) 0x3e4fd TEST %RBX,%RBX |
(732) 0x3e500 JE 3e4d4 |
(732) 0x3e502 VMOVSD (%R13,%R14,8),%XMM1 |
(732) 0x3e509 VMOVSD 0x8(%RBX),%XMM2 |
(732) 0x3e50e VXORPD %XMM0,%XMM0,%XMM0 |
(732) 0x3e512 VUCOMISD %XMM1,%XMM2 |
(732) 0x3e516 JBE 3e4d0 |
(732) 0x3e518 VMULSD 0x238(%RBX),%XMM1,%XMM0 |
(732) 0x3e520 LEA -0x50(%RBP),%RDI |
(732) 0x3e524 VMOVSD %XMM3,-0x30(%RBP) |
(732) 0x3e529 CALL 4ea80 <@plt_start@+0x3d0> |
(732) 0x3e52e VMOVUPD 0x18(%RBX),%XMM3 |
(732) 0x3e533 VMOVUPD 0x30(%RBX),%XMM5 |
(732) 0x3e538 VMULSD %XMM0,%XMM0,%XMM1 |
(732) 0x3e53c VCVTTSD2SI -0x50(%RBP),%EAX |
(732) 0x3e541 VMOVUPD 0x28(%RBX),%XMM4 |
(732) 0x3e546 MOV 0x218(%RBX),%RCX |
(732) 0x3e54d VBLENDPD $0x1,0x20(%RBX),%XMM5,%XMM6 |
(732) 0x3e554 VMOVHPD 0x40(%RBX),%XMM3,%XMM3 |
(732) 0x3e559 VMOVHPD 0x50(%RBX),%XMM5,%XMM5 |
(732) 0x3e55e VMULSD %XMM0,%XMM1,%XMM2 |
(732) 0x3e562 VMOVDDUP %XMM0,%XMM0 |
(732) 0x3e566 VUNPCKLPD %XMM1,%XMM2,%XMM8 |
(732) 0x3e56a VUNPCKLPD %XMM2,%XMM1,%XMM7 |
(732) 0x3e56e CLTQ |
(732) 0x3e570 VFMADD231PD %XMM8,%XMM3,%XMM5 |
(732) 0x3e575 VMOVHPD 0x48(%RBX),%XMM4,%XMM3 |
(732) 0x3e57a VFMADD231PD %XMM7,%XMM6,%XMM5 |
(732) 0x3e57f VFMADD213PD %XMM5,%XMM0,%XMM3 |
(732) 0x3e584 VMULPD (%RCX,%RAX,8),%XMM3,%XMM3 |
(732) 0x3e589 VUNPCKLPD %XMM0,%XMM1,%XMM0 |
(732) 0x3e58d VMULPD 0x60(%RBX),%XMM0,%XMM1 |
(732) 0x3e592 VMULPD 0x80(%RBX),%XMM0,%XMM0 |
(732) 0x3e59a VPERMILPD $0x1,%XMM3,%XMM4 |
(732) 0x3e5a0 VADDSD %XMM3,%XMM4,%XMM3 |
(732) 0x3e5a4 VPERMILPD $0x1,%XMM1,%XMM4 |
(732) 0x3e5aa VFMADD231SD 0x58(%RBX),%XMM2,%XMM1 |
(732) 0x3e5b0 VADDSD %XMM4,%XMM1,%XMM1 |
(732) 0x3e5b4 VADDSD 0x70(%RBX),%XMM1,%XMM1 |
(732) 0x3e5b9 VFMADD132SD 0x10(%RCX,%RAX,8),%XMM3,%XMM1 |
(732) 0x3e5c0 VPERMILPD $0x1,%XMM0,%XMM3 |
(732) 0x3e5c6 VFMADD231SD 0x78(%RBX),%XMM2,%XMM0 |
(732) 0x3e5cc VADDSD %XMM3,%XMM0,%XMM0 |
(732) 0x3e5d0 VADDSD 0x90(%RBX),%XMM0,%XMM0 |
(732) 0x3e5d8 VMOVSD -0x30(%RBP),%XMM3 |
(732) 0x3e5dd VFMADD132SD 0x18(%RCX,%RAX,8),%XMM1,%XMM0 |
(732) 0x3e5e4 MOV 0x90(%R15),%EAX |
(732) 0x3e5eb JMP 3e4d0 |
/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h: 1296 - 1296 |
-------------------------------------------------------------------------------- |
1296: { return _M_ptr; } |
/home/kcamus/qaas_runs/170-254-9426/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h: 166 - 182 |
-------------------------------------------------------------------------------- |
166: if (r >= cutoff_radius) |
167: return 0.0; |
168: r *= DeltaRInv; |
169: real_type ipart, t; |
170: t = std::modf(r, &ipart); |
171: int i = (int)ipart; |
172: real_type tp[4]; |
173: tp[0] = t * t * t; |
[...] |
179: (SplineCoefs[i+0]*(A[ 0]*tp[0] + A[ 1]*tp[1] + A[ 2]*tp[2] + A[ 3]*tp[3])+ |
180: SplineCoefs[i+1]*(A[ 4]*tp[0] + A[ 5]*tp[1] + A[ 6]*tp[2] + A[ 7]*tp[3])+ |
181: SplineCoefs[i+2]*(A[ 8]*tp[0] + A[ 9]*tp[1] + A[10]*tp[2] + A[11]*tp[3])+ |
182: SplineCoefs[i+3]*(A[12]*tp[0] + A[13]*tp[1] + A[14]*tp[2] + A[15]*tp[3])); |
/home/kcamus/qaas_runs/170-254-9426/intel/miniqmc/build/miniqmc/src/Particle/ParticleSet.h: 313 - 316 |
-------------------------------------------------------------------------------- |
313: inline int first(int igroup) const { return (*group_offsets_)[igroup]; } |
314: |
315: ///return the last index of a group i |
316: inline int last(int igroup) const { return (*group_offsets_)[igroup + 1]; } |
/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h: 919 - 1169 |
-------------------------------------------------------------------------------- |
919: { return size_type(this->_M_impl._M_finish - this->_M_impl._M_start); } |
[...] |
1046: return *(this->_M_impl._M_start + __n); |
[...] |
1064: return *(this->_M_impl._M_start + __n); |
[...] |
1169: { return _M_data_ptr(this->_M_impl._M_start); } |
/home/kcamus/qaas_runs/170-254-9426/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/OhmmsVector.h: 223 - 249 |
-------------------------------------------------------------------------------- |
223: return X[i]; |
[...] |
229: return X[i]; |
[...] |
249: inline const_pointer data() const { return X; } |
/home/kcamus/qaas_runs/170-254-9426/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/OneBodyJastrowRef.h: 134 - 155 |
-------------------------------------------------------------------------------- |
134: for (int k = 0; k < ratios.size(); ++k) |
135: ratios[k] = std::exp(Vat[VP.refPtcl] - computeU(VP.getDistTableAB(myTableID).getDistRow(k).data())); |
[...] |
141: if (NumGroups > 0) |
142: { |
143: for (int jg = 0; jg < NumGroups; ++jg) |
144: { |
145: if (F[jg] != nullptr) |
146: curVat += F[jg]->evaluateV(-1, Ions.first(jg), Ions.last(jg), dist, DistCompressed.data()); |
147: } |
148: } |
149: else |
150: { |
151: for (int c = 0; c < Nions; ++c) |
152: { |
153: int gid = Ions.GroupID[c]; |
154: if (F[gid] != nullptr) |
155: curVat += F[gid]->evaluate(dist[c]); |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.57 |
CQA speedup if FP arith vectorized | 2.18 |
CQA speedup if fully vectorized | 5.54 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.06 |
Bottlenecks | micro-operation queue |
Function | miniqmcreference::OneBodyJastrowRef |
Source | stl_vector.h:919-919,stl_vector.h:1046-1046,stl_vector.h:1064-1064,OhmmsVector.h:223-223,OhmmsVector.h:249-249,OneBodyJastrowRef.h:134-135,OneBodyJastrowRef.h:141-141,OneBodyJastrowRef.h:151-151 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 6.00 |
CQA cycles if no scalar integer | 2.33 |
CQA cycles if FP arith vectorized | 2.75 |
CQA cycles if fully vectorized | 1.08 |
Front-end cycles | 6.00 |
DIV/SQRT cycles | 3.50 |
P0 cycles | 2.75 |
P1 cycles | 2.75 |
P2 cycles | 2.50 |
P3 cycles | 3.50 |
P4 cycles | 5.67 |
P5 cycles | 5.67 |
P6 cycles | 5.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.50 |
P10 cycles | 0.50 |
P11 cycles | 1.00 |
P12 cycles | 1.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 34.00 |
Nb uops | 36.00 |
Nb loads | 13.00 |
Nb stores | 2.00 |
Nb stack references | 3.00 |
FLOP/cycle | 0.17 |
Nb FLOP add-sub | 1.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 17.33 |
Bytes prefetched | 0.00 |
Bytes loaded | 88.00 |
Bytes stored | 16.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 15.38 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 33.33 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 10.94 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 13.54 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.57 |
CQA speedup if FP arith vectorized | 2.18 |
CQA speedup if fully vectorized | 5.54 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.06 |
Bottlenecks | micro-operation queue |
Function | miniqmcreference::OneBodyJastrowRef |
Source | stl_vector.h:919-919,stl_vector.h:1046-1046,stl_vector.h:1064-1064,OhmmsVector.h:223-223,OhmmsVector.h:249-249,OneBodyJastrowRef.h:134-135,OneBodyJastrowRef.h:141-141,OneBodyJastrowRef.h:151-151 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 6.00 |
CQA cycles if no scalar integer | 2.33 |
CQA cycles if FP arith vectorized | 2.75 |
CQA cycles if fully vectorized | 1.08 |
Front-end cycles | 6.00 |
DIV/SQRT cycles | 3.50 |
P0 cycles | 2.75 |
P1 cycles | 2.75 |
P2 cycles | 2.50 |
P3 cycles | 3.50 |
P4 cycles | 5.67 |
P5 cycles | 5.67 |
P6 cycles | 5.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.50 |
P10 cycles | 0.50 |
P11 cycles | 1.00 |
P12 cycles | 1.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 34.00 |
Nb uops | 36.00 |
Nb loads | 13.00 |
Nb stores | 2.00 |
Nb stack references | 3.00 |
FLOP/cycle | 0.17 |
Nb FLOP add-sub | 1.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 17.33 |
Bytes prefetched | 0.00 |
Bytes loaded | 88.00 |
Bytes stored | 16.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 15.38 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 33.33 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 10.94 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 13.54 |
Path / |
Function | miniqmcreference::OneBodyJastrowRef |
Source file and lines | OneBodyJastrowRef.h:134-155 |
Module | libqmcwfs.so |
nb instructions | 34 |
nb uops | 36 |
loop length | 146 |
used x86 registers | 10 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 3 |
micro-operation queue | 6.00 cycles |
front end | 6.00 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.50 | 2.75 | 2.75 | 2.50 | 3.50 | 5.67 | 5.67 | 5.67 | 0.00 | 0.00 | 0.50 | 0.50 | 1.00 | 1.00 |
cycles | 3.50 | 2.75 | 2.75 | 2.50 | 3.50 | 5.67 | 5.67 | 5.67 | 0.00 | 0.00 | 0.50 | 0.50 | 1.00 | 1.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 6.00 |
Dispatch | 5.67 |
Overall L1 | 6.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 28% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 15% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 33% |
all | 8% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 7% |
all | 16% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 12% |
load | 10% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD -0x48(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSUBSD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
CALL 4ecf0 <@plt_start@+0x640> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RCX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVSD %XMM0,(%RAX,%R12,8) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
INC %R12 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV 0x8(%RCX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
SUB %RAX,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SAR $0x3,%RCX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
CMP %R12,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JBE 3e5f0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x260> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV -0x38(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0xf0(%R15),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0xa8(%R15),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOVSXD 0x2a0(%RDI),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,8),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CALL 4ea70 <@plt_start@+0x3c0> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x48(%RAX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%R12,%R12,4),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV 0x18(%RAX,%RCX,8),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x98(%R15),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
TEST %EAX,%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JLE 3e4b0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x120> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 3e44f <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0xbf> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
MOV 0x90(%R15),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
TEST %EAX,%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JLE 3e3c0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x30> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
XOR %R14D,%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 3e4e3 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x153> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Function | miniqmcreference::OneBodyJastrowRef |
Source file and lines | OneBodyJastrowRef.h:134-155 |
Module | libqmcwfs.so |
nb instructions | 34 |
nb uops | 36 |
loop length | 146 |
used x86 registers | 10 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 3 |
micro-operation queue | 6.00 cycles |
front end | 6.00 cycles |
ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.50 | 2.75 | 2.75 | 2.50 | 3.50 | 5.67 | 5.67 | 5.67 | 0.00 | 0.00 | 0.50 | 0.50 | 1.00 | 1.00 |
cycles | 3.50 | 2.75 | 2.75 | 2.50 | 3.50 | 5.67 | 5.67 | 5.67 | 0.00 | 0.00 | 0.50 | 0.50 | 1.00 | 1.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 6.00 |
Dispatch | 5.67 |
Overall L1 | 6.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 28% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 15% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 33% |
all | 8% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 7% |
all | 16% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 12% |
load | 10% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVSD -0x48(%RBP),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VSUBSD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 3 | 0.50 |
CALL 4ecf0 <@plt_start@+0x640> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV (%RCX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVSD %XMM0,(%RAX,%R12,8) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
INC %R12 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV 0x8(%RCX),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
SUB %RAX,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SAR $0x3,%RCX | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
CMP %R12,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JBE 3e5f0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x260> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
MOV -0x38(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0xf0(%R15),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0xa8(%R15),%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOVSXD 0x2a0(%RDI),%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,8),%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
CALL 4ea70 <@plt_start@+0x3c0> | 2 | 0.50 | 0 | 0 | 0 | 0.50 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x48(%RAX),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%R12,%R12,4),%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV 0x18(%RAX,%RCX,8),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x98(%R15),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
TEST %EAX,%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JLE 3e4b0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x120> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 3e44f <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0xbf> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
MOV 0x90(%R15),%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
TEST %EAX,%EAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JLE 3e3c0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x30> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
XOR %R14D,%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 3e4e3 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x153> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |