Loop Id: 116 | Module: exec | Source: ParticleSet.h:277-277 [...] | Coverage: 0.01% |
---|
Loop Id: 116 | Module: exec | Source: ParticleSet.h:277-277 [...] | Coverage: 0.01% |
---|
0x4110fd MOV -0x128(%RBP),%RSI |
0x411104 MOV -0xb0(%RBP),%RDI |
0x41110b MOV 0x48(%RSI),%RCX |
0x41110f LEA (%RDI,%RDI,4),%R8 |
0x411113 MOV %EDI,-0x108(%RBP) |
0x411119 SAL $0x3,%R8 |
0x41111d ADD %R8,%RCX |
0x411120 ADD 0x60(%RSI),%R8 |
0x411124 MOV %RCX,-0xf0(%RBP) |
0x41112b MOV %R8,-0x100(%RBP) |
0x411132 TEST %RAX,%RAX |
0x411135 JE 41146c |
0x41113b LEA (%RDI,%RDI,2),%R10 |
0x41113f XOR %R12D,%R12D |
0x411142 LEA (,%R10,8),%R9 |
0x41114a MOV %R9,-0x118(%RBP) |
0x411151 JMP 41115f |
(117) 0x411153 INC %R12 |
(117) 0x411156 CMP %RAX,%R12 |
(117) 0x411159 JAE 411451 |
(117) 0x41115f MOV -0xf0(%RBP),%RDX |
(117) 0x411166 LEA (,%R12,8),%R8 |
(117) 0x41116e MOV 0x18(%RDX),%RSI |
(117) 0x411172 ADD %R8,%RSI |
(117) 0x411175 VMOVSD (%RSI),%XMM0 |
(117) 0x411179 VCOMISD 0x1440(%RBX),%XMM0 |
(117) 0x411181 JAE 411153 |
(117) 0x411183 MOV 0x2830(%RBX),%RCX |
(117) 0x41118a MOV $-0x5555555555555555,%RAX |
(117) 0x411194 SUB 0x2828(%RBX),%RCX |
(117) 0x41119b SAR $0x3,%RCX |
(117) 0x41119f IMUL %RAX,%RCX |
(117) 0x4111a3 TEST %ECX,%ECX |
(117) 0x4111a5 JLE 4113d2 |
(117) 0x4111ab MOV -0x100(%RBP),%R10 |
(117) 0x4111b2 DEC %ECX |
(117) 0x4111b4 MOV -0x118(%RBP),%RDI |
(117) 0x4111bb ADD 0x40(%R14),%RDI |
(117) 0x4111bf MOVSXD 0x8(%R10),%RAX |
(117) 0x4111c3 MOV 0x18(%R10),%RDX |
(117) 0x4111c7 LEA (%RCX,%RCX,2),%R10 |
(117) 0x4111cb LEA (%R13,%R10,8),%RCX |
(117) 0x4111d0 LEA (%R10,%R10,2),%R10 |
(117) 0x4111d4 VMOVQ %RCX,%XMM4 |
(117) 0x4111d9 LEA (%RAX,%R12,1),%R9 |
(117) 0x4111dd ADD %RDX,%R8 |
(117) 0x4111e0 ADD %R9,%RAX |
(117) 0x4111e3 AND $0x3,%R10D |
(117) 0x4111e7 MOV %R8,-0x90(%RBP) |
(117) 0x4111ee LEA (%RDX,%R9,8),%R8 |
(117) 0x4111f2 LEA (%RDX,%RAX,8),%R9 |
(117) 0x4111f6 MOV -0x70(%RBP),%RAX |
(117) 0x4111fa MOV %R13,%RDX |
(117) 0x4111fd JE 411545 |
(117) 0x411203 MOV -0x90(%RBP),%RCX |
(117) 0x41120a VMOVSD (%R9),%XMM5 |
(117) 0x41120f VMOVDDUP %XMM0,%XMM6 |
(117) 0x411213 VADDSD 0x10(%RDI),%XMM5,%XMM14 |
(117) 0x411218 ADD $0x18,%RAX |
(117) 0x41121c VFMADD231SD 0x10(%R13),%XMM0,%XMM14 |
(117) 0x411222 LEA 0x18(%R13),%RDX |
(117) 0x411226 VMOVSD (%RCX),%XMM7 |
(117) 0x41122a VMOVHPD (%R8),%XMM7,%XMM1 |
(117) 0x41122f VADDPD (%RDI),%XMM1,%XMM8 |
(117) 0x411233 VFMADD231PD (%R13),%XMM6,%XMM8 |
(117) 0x411239 VMOVSD %XMM14,-0x8(%RAX) |
(117) 0x41123e VMOVUPD %XMM8,-0x18(%RAX) |
(117) 0x411243 VMOVSD (%RSI),%XMM0 |
(117) 0x411247 CMP $0x1,%R10 |
(117) 0x41124b JE 411534 |
(117) 0x411251 CMP $0x2,%R10 |
(117) 0x411255 JE 411293 |
(117) 0x411257 VMOVSD (%RCX),%XMM2 |
(117) 0x41125b VMOVHPD (%R8),%XMM2,%XMM3 |
(117) 0x411260 VADDPD (%RDI),%XMM3,%XMM13 |
(117) 0x411264 VMOVSD (%R9),%XMM12 |
(117) 0x411269 VMOVDDUP %XMM0,%XMM11 |
(117) 0x41126d VADDSD 0x10(%RDI),%XMM12,%XMM9 |
(117) 0x411272 VFMADD231PD (%RDX),%XMM11,%XMM13 |
(117) 0x411277 VFMADD231SD 0x10(%RDX),%XMM0,%XMM9 |
(117) 0x41127d ADD $0x18,%RAX |
(117) 0x411281 LEA 0x30(%R13),%RDX |
(117) 0x411285 VMOVUPD %XMM13,-0x18(%RAX) |
(117) 0x41128a VMOVSD %XMM9,-0x8(%RAX) |
(117) 0x41128f VMOVSD (%RSI),%XMM0 |
(117) 0x411293 MOV -0x90(%RBP),%RCX |
(117) 0x41129a VMOVSD (%R9),%XMM10 |
(117) 0x41129f VMOVDDUP %XMM0,%XMM15 |
(117) 0x4112a3 VADDSD 0x10(%RDI),%XMM10,%XMM1 |
(117) 0x4112a8 MOV %R12,-0xa0(%RBP) |
(117) 0x4112af VFMADD231SD 0x10(%RDX),%XMM0,%XMM1 |
(117) 0x4112b5 ADD $0x18,%RAX |
(117) 0x4112b9 ADD $0x18,%RDX |
(117) 0x4112bd VMOVQ %XMM4,%R12 |
(117) 0x4112c2 VMOVSD (%RCX),%XMM5 |
(117) 0x4112c6 VMOVHPD (%R8),%XMM5,%XMM6 |
(117) 0x4112cb VADDPD (%RDI),%XMM6,%XMM7 |
(117) 0x4112cf VFMADD231PD -0x18(%RDX),%XMM15,%XMM7 |
(117) 0x4112d5 VMOVSD %XMM1,-0x8(%RAX) |
(117) 0x4112da VMOVUPD %XMM7,-0x18(%RAX) |
(117) 0x4112df VMOVSD (%RSI),%XMM0 |
(117) 0x4112e3 JMP 411393 |
(118) 0x4112e8 VMOVSD (%RCX),%XMM13 |
(118) 0x4112ec VMOVHPD (%R8),%XMM13,%XMM9 |
(118) 0x4112f1 VADDPD (%RDI),%XMM9,%XMM10 |
(118) 0x4112f5 VMOVSD (%R9),%XMM3 |
(118) 0x4112fa VMOVDDUP (%RSI),%XMM15 |
(118) 0x4112fe VADDSD 0x10(%RDI),%XMM3,%XMM5 |
(118) 0x411303 VFMADD231PD 0x18(%RDX),%XMM15,%XMM10 |
(118) 0x411309 VMOVSD 0x28(%RDX),%XMM6 |
(118) 0x41130e VFMADD231SD (%RSI),%XMM6,%XMM5 |
(118) 0x411313 VMOVUPD %XMM10,0x18(%RAX) |
(118) 0x411318 VMOVSD %XMM5,0x28(%RAX) |
(118) 0x41131d VMOVSD (%RCX),%XMM1 |
(118) 0x411321 VMOVHPD (%R8),%XMM1,%XMM0 |
(118) 0x411326 VADDPD (%RDI),%XMM0,%XMM4 |
(118) 0x41132a VMOVSD 0x40(%RDX),%XMM12 |
(118) 0x41132f VMOVSD (%R9),%XMM7 |
(118) 0x411334 VMOVDDUP (%RSI),%XMM8 |
(118) 0x411338 VADDSD 0x10(%RDI),%XMM7,%XMM14 |
(118) 0x41133d VFMADD231PD 0x30(%RDX),%XMM8,%XMM4 |
(118) 0x411343 VFMADD231SD (%RSI),%XMM12,%XMM14 |
(118) 0x411348 VMOVUPD %XMM4,0x30(%RAX) |
(118) 0x41134d VMOVSD %XMM14,0x40(%RAX) |
(118) 0x411352 VMOVSD (%RCX),%XMM2 |
(118) 0x411356 VMOVSD 0x58(%RDX),%XMM15 |
(118) 0x41135b VMOVHPD (%R8),%XMM2,%XMM3 |
(118) 0x411360 VADDPD (%RDI),%XMM3,%XMM13 |
(118) 0x411364 VMOVSD (%R9),%XMM11 |
(118) 0x411369 VMOVDDUP (%RSI),%XMM9 |
(118) 0x41136d VADDSD 0x10(%RDI),%XMM11,%XMM10 |
(118) 0x411372 VFMADD231PD 0x48(%RDX),%XMM9,%XMM13 |
(118) 0x411378 VFMADD231SD (%RSI),%XMM15,%XMM10 |
(118) 0x41137d ADD $0x60,%RAX |
(118) 0x411381 ADD $0x60,%RDX |
(118) 0x411385 VMOVUPD %XMM13,-0x18(%RAX) |
(118) 0x41138a VMOVSD %XMM10,-0x8(%RAX) |
(118) 0x41138f VMOVSD (%RSI),%XMM0 |
(118) 0x411393 VMOVSD (%RCX),%XMM14 |
(118) 0x411397 VMOVHPD (%R8),%XMM14,%XMM12 |
(118) 0x41139c VADDPD (%RDI),%XMM12,%XMM11 |
(118) 0x4113a0 VMOVSD (%R9),%XMM4 |
(118) 0x4113a5 VMOVDDUP %XMM0,%XMM8 |
(118) 0x4113a9 VADDSD 0x10(%RDI),%XMM4,%XMM2 |
(118) 0x4113ae VFMADD231PD (%RDX),%XMM8,%XMM11 |
(118) 0x4113b3 VFMADD132SD 0x10(%RDX),%XMM2,%XMM0 |
(118) 0x4113b9 VMOVUPD %XMM11,(%RAX) |
(118) 0x4113bd VMOVSD %XMM0,0x10(%RAX) |
(118) 0x4113c2 CMP %R12,%RDX |
(118) 0x4113c5 JNE 4112e8 |
(117) 0x4113cb MOV -0xa0(%RBP),%R12 |
(117) 0x4113d2 MOV 0x18(%R11),%R11 |
(117) 0x4113d6 MOV 0x2840(%RBX),%RAX |
(117) 0x4113dd MOV -0x108(%RBP),%EDX |
(117) 0x4113e3 LEA -0x50(%RBP),%RSI |
(117) 0x4113e7 LEA -0x70(%RBP),%RCX |
(117) 0x4113eb MOV %RSI,-0x90(%RBP) |
(117) 0x4113f2 MOV %R12D,%R9D |
(117) 0x4113f5 MOV $0x1,%R8D |
(117) 0x4113fb MOV %R14,%RSI |
(117) 0x4113fe MOV %RCX,-0xa8(%RBP) |
(117) 0x411405 MOVSXD (%R11,%R12,4),%R10 |
(117) 0x411409 MOV (%RAX,%R10,8),%RDI |
(117) 0x41140d MOV %RDI,-0xa0(%RBP) |
(117) 0x411414 CALL 4d20f0 <_ZN11qmcplusplus18VirtualParticleSet9makeMovesERKNS_11ParticleSetEiRKSt6vectorINS_10TinyVectorIdLj3EEESaIS6_EEbi> |
(117) 0x411419 MOV -0xa0(%RBP),%RSI |
(117) 0x411420 MOV -0x88(%RBP),%RDI |
(117) 0x411427 LEA -0x50(%RBP),%RDX |
(117) 0x41142b MOV %RDX,-0x90(%RBP) |
(117) 0x411432 CALL 418bf0 <_ZN11qmcplusplus12WaveFunction14evaluateRatiosERNS_18VirtualParticleSetERSt6vectorIdSaIdEE> |
(117) 0x411437 MOV 0x2858(%RBX),%R11 |
(117) 0x41143e INC %R12 |
(117) 0x411441 MOV 0x260(%R11),%RAX |
(117) 0x411448 CMP %RAX,%R12 |
(117) 0x41144b JB 41115f |
0x411451 INCQ -0xb0(%RBP) |
0x411458 MOV -0xb0(%RBP),%RDI |
0x41145f CMP 0x260(%R14),%RDI |
0x411466 JB 4110fd |
(117) 0x411534 MOV %R12,-0xa0(%RBP) |
(117) 0x41153b VMOVQ %XMM4,%R12 |
(117) 0x411540 JMP 411393 |
(117) 0x411545 MOV %R12,-0xa0(%RBP) |
(117) 0x41154c MOV -0x90(%RBP),%RCX |
(117) 0x411553 VMOVQ %XMM4,%R12 |
(117) 0x411558 JMP 411393 |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/VectorSoAContainer.h: 231 - 231 |
-------------------------------------------------------------------------------- |
231: inline const AoSElement_t operator[](size_t i) const { return AoSElement_t(myData + i, nGhosts); } |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVector.h: 145 - 145 |
-------------------------------------------------------------------------------- |
145: X[i] = base[i * offset]; |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Particle/ParticleSet.h: 277 - 277 |
-------------------------------------------------------------------------------- |
277: inline size_t getTotalNum() const { return TotalNum; } |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/OhmmsVector.h: 229 - 229 |
-------------------------------------------------------------------------------- |
229: return X[i]; |
/cluster/comp/gcc/13.2.0/include/c++/13.2.0/bits/stl_vector.h: 990 - 1145 |
-------------------------------------------------------------------------------- |
990: { return size_type(this->_M_impl._M_finish - this->_M_impl._M_start); } |
[...] |
1142: operator[](size_type __n) const _GLIBCXX_NOEXCEPT |
1143: { |
1144: __glibcxx_requires_subscript(__n); |
1145: return *(this->_M_impl._M_start + __n); |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Numerics/PETE/OperatorTags.h: 43 - 63 |
-------------------------------------------------------------------------------- |
43: return (a + b); |
[...] |
63: return (a * b); |
/cluster/comp/gcc/13.2.0/include/c++/13.2.0/bits/unique_ptr.h: 199 - 199 |
-------------------------------------------------------------------------------- |
199: pointer _M_ptr() const noexcept { return std::get<0>(_M_t); } |
/beegfs/hackathon/users/eoseret/qaas_runs/170-855-3059/intel/miniqmc/build/miniqmc/src/Drivers/NonLocalPP.hpp: 122 - 135 |
-------------------------------------------------------------------------------- |
122: for (int jel = 0; jel < els.getTotalNum(); ++jel) |
123: { |
124: const auto& dist = d_ie.getDistRow(jel); |
125: const auto& displ = d_ie.getDisplRow(jel); |
126: for (int iat = 0; iat < ions_ref.getTotalNum(); ++iat) |
127: { |
128: //due to < Rmax condition, the actually iteration iat is [0,2] in a real simulation |
129: if (dist[iat] < Rmax) |
130: { |
131: for (int k = 0; k < size(); k++) |
132: virtualPos[k] = dist[iat] * rOnSphere[k] + displ[iat] + els.R[jel]; |
133: auto& VP = *VPs[ions_ref.GroupID[iat]]; |
134: VP.makeMoves(els, jel, virtualPos, true, iat); |
135: wf.evaluateRatios(VP, ratios); |
/cluster/comp/gcc/13.2.0/include/c++/13.2.0/bits/new_allocator.h: 88 - 88 |
-------------------------------------------------------------------------------- |
88: __new_allocator() _GLIBCXX_USE_NOEXCEPT { } |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 5.33 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.05 |
Bottlenecks | P5, P6, P7, |
Function | main._omp_fn.1 |
Source | ParticleSet.h:277-277,stl_vector.h:1142-1142,stl_vector.h:1145-1145,NonLocalPP.hpp:122-122,NonLocalPP.hpp:126-126 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 3.67 |
CQA cycles if no scalar integer | 3.67 |
CQA cycles if FP arith vectorized | 3.67 |
CQA cycles if fully vectorized | 0.69 |
Front-end cycles | 3.50 |
DIV/SQRT cycles | 2.50 |
P0 cycles | 2.50 |
P1 cycles | 2.25 |
P2 cycles | 2.25 |
P3 cycles | 1.50 |
P4 cycles | 3.67 |
P5 cycles | 3.67 |
P6 cycles | 3.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 21.00 |
Nb uops | 21.00 |
Nb loads | 7.00 |
Nb stores | 5.00 |
Nb stack references | 6.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 25.09 |
Bytes prefetched | 0.00 |
Bytes loaded | 56.00 |
Bytes stored | 36.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.36 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 11.25 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 10.42 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 5.33 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.05 |
Bottlenecks | P5, P6, P7, |
Function | main._omp_fn.1 |
Source | ParticleSet.h:277-277,stl_vector.h:1142-1142,stl_vector.h:1145-1145,NonLocalPP.hpp:122-122,NonLocalPP.hpp:126-126 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 3.67 |
CQA cycles if no scalar integer | 3.67 |
CQA cycles if FP arith vectorized | 3.67 |
CQA cycles if fully vectorized | 0.69 |
Front-end cycles | 3.50 |
DIV/SQRT cycles | 2.50 |
P0 cycles | 2.50 |
P1 cycles | 2.25 |
P2 cycles | 2.25 |
P3 cycles | 1.50 |
P4 cycles | 3.67 |
P5 cycles | 3.67 |
P6 cycles | 3.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 21.00 |
Nb uops | 21.00 |
Nb loads | 7.00 |
Nb stores | 5.00 |
Nb stack references | 6.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 25.09 |
Bytes prefetched | 0.00 |
Bytes loaded | 56.00 |
Bytes stored | 36.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.36 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 11.25 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 10.42 |
Path / |
Function | main._omp_fn.1 |
Source file and lines | ParticleSet.h:277-277 |
Module | exec |
nb instructions | 21 |
nb uops | 21 |
loop length | 113 |
used x86 registers | 10 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 6 |
micro-operation queue | 3.50 cycles |
front end | 3.50 cycles |
 | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 2.25 | 2.25 | 1.50 | 3.67 | 3.67 | 3.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 2.50 | 2.50 | 2.25 | 2.25 | 1.50 | 3.67 | 3.67 | 3.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 3.50 |
Dispatch | 3.67 |
Overall L1 | 3.67 |
Vectorization ratios |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Vector efficiency ratios |
all | 11% |
load | 12% |
store | 11% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x128(%RBP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0xb0(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x48(%RSI),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RDI,%RDI,4),%R8 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDI,-0x108(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SAL $0x3,%R8 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD %R8,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD 0x60(%RSI),%R8 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV %RCX,-0xf0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %R8,-0x100(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
TEST %RAX,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JE 41146c <main._omp_fn.1+0x5b4c> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
LEA (%RDI,%RDI,2),%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
LEA (,%R10,8),%R9 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R9,-0x118(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JMP 41115f <main._omp_fn.1+0x583f> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
INCQ -0xb0(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV -0xb0(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
CMP 0x260(%R14),%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JB 4110fd <main._omp_fn.1+0x57dd> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
Function | main._omp_fn.1 |
Source file and lines | ParticleSet.h:277-277 |
Module | exec |
nb instructions | 21 |
nb uops | 21 |
loop length | 113 |
used x86 registers | 10 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 6 |
micro-operation queue | 3.50 cycles |
front end | 3.50 cycles |
 | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 2.25 | 2.25 | 1.50 | 3.67 | 3.67 | 3.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
cycles | 2.50 | 2.50 | 2.25 | 2.25 | 1.50 | 3.67 | 3.67 | 3.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 3.50 |
Dispatch | 3.67 |
Overall L1 | 3.67 |
Vectorization ratios |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Vector efficiency ratios |
all | 11% |
load | 12% |
store | 11% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x128(%RBP),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0xb0(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV 0x48(%RSI),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
LEA (%RDI,%RDI,4),%R8 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %EDI,-0x108(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
SAL $0x3,%R8 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD %R8,%RCX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD 0x60(%RSI),%R8 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
MOV %RCX,-0xf0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %R8,-0x100(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
TEST %RAX,%RAX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JE 41146c <main._omp_fn.1+0x5b4c> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
LEA (%RDI,%RDI,2),%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
LEA (,%R10,8),%R9 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOV %R9,-0x118(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JMP 41115f <main._omp_fn.1+0x583f> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
INCQ -0xb0(%RBP) | 2 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOV -0xb0(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
CMP 0x260(%R14),%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
JB 4110fd <main._omp_fn.1+0x57dd> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |