Loop Id: 82 | Module: exec | Source: miniqmc.cpp:476-488 [...] | Coverage: 0.01% |
---|
Loop Id: 82 | Module: exec | Source: miniqmc.cpp:476-488 [...] | Coverage: 0.01% |
---|
0x40e433 MOV -0xd8(%RBP),%RCX |
0x40e43a MOV -0xc8(%RBP),%RDX |
0x40e441 LEA (,%R14,8),%R9 |
0x40e449 MOV %R9,-0xe8(%RBP) |
0x40e450 SUB %RCX,%RDX |
0x40e453 AND $0x8,%EDX |
0x40e456 JE 40e6f8 |
0x40e45c MOV %R15,-0x98(%RBP) |
0x40e463 MOV %R14,%R15 |
0x40e466 MOV %R9,%R14 |
0x40e469 MOV %EBX,-0xa0(%RBP) |
0x40e46f MOV %RCX,%RBX |
0x40e472 JMP 40e53e |
(81) 0x40e480 MOVSXD 0x8(%R13),%RDX |
(81) 0x40e484 MOV 0x18(%R13),%RCX |
(81) 0x40e488 ADD $0x30,%RBX |
(81) 0x40e48c MOV -0xb0(%RBP),%RSI |
(81) 0x40e493 LEA (%RDX,%R15,1),%RDI |
(81) 0x40e497 VMOVSD (%RCX,%R14,1),%XMM2 |
(81) 0x40e49d VMOVSD (%RSI,%R15,8),%XMM13 |
(81) 0x40e4a3 ADD %RDI,%RDX |
(81) 0x40e4a6 MOV -0xa0(%RBP),%ESI |
(81) 0x40e4ac VMOVSD (%RCX,%RDX,8),%XMM14 |
(81) 0x40e4b1 VMOVHPD (%RCX,%RDI,8),%XMM2,%XMM4 |
(81) 0x40e4b6 MOV -0xb8(%RBP),%RDX |
(81) 0x40e4bd VMOVDDUP %XMM13,%XMM5 |
(81) 0x40e4c2 MOV -0x98(%RBP),%RDI |
(81) 0x40e4c9 VFMSUB132SD 0x10(%R8),%XMM14,%XMM13 |
(81) 0x40e4cf VFMSUB231PD -0x18(%RBX),%XMM5,%XMM4 |
(81) 0x40e4d5 VMOVSD %XMM13,-0x50(%RBP) |
(81) 0x40e4da VMOVAPD %XMM4,-0x60(%RBP) |
(81) 0x40e4df CALL 488a00 <_ZN11qmcplusplus11ParticleSet8makeMoveEiRKNS_10TinyVectorIdLj3EEE> |
(81) 0x40e4e4 MOV 0x10(%R12),%R10 |
(81) 0x40e4e9 MOV (%R10),%R11 |
(81) 0x40e4ec MOV 0x20(%R11),%RDI |
(81) 0x40e4f0 CALL 4a40e0 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> |
(81) 0x40e4f5 MOV -0xa0(%RBP),%EDX |
(81) 0x40e4fb MOV -0x98(%RBP),%RSI |
(81) 0x40e502 MOV -0xa8(%RBP),%RDI |
(81) 0x40e509 CALL 413ab0 <_ZN11qmcplusplus12WaveFunction5ratioERNS_11ParticleSetEi> |
(81) 0x40e50e MOV 0x10(%R12),%R8 |
(81) 0x40e513 MOV (%R8),%RAX |
(81) 0x40e516 MOV 0x20(%RAX),%RDI |
(81) 0x40e51a CALL 4a4310 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> |
(81) 0x40e51f MOV -0xa0(%RBP),%ESI |
(81) 0x40e525 MOV -0x98(%RBP),%RDI |
(81) 0x40e52c CALL 488ae0 <_ZN11qmcplusplus11ParticleSet10rejectMoveEi> |
(81) 0x40e531 MOV -0xb0(%RBP),%R9 |
(81) 0x40e538 VMOVSD (%R9,%R15,8),%XMM15 |
(81) 0x40e53e MOVSXD 0x8(%R13),%R10 |
(81) 0x40e542 MOV 0x18(%R13),%RDI |
(81) 0x40e546 VMOVDDUP %XMM15,%XMM3 |
(81) 0x40e54b MOV -0xb8(%RBP),%RDX |
(81) 0x40e552 LEA (%R10,%R15,1),%RSI |
(81) 0x40e556 VMOVSD (%RDI,%R14,1),%XMM0 |
(81) 0x40e55c ADD %RSI,%R10 |
(81) 0x40e55f VMOVSD (%RDI,%R10,8),%XMM12 |
(81) 0x40e565 VMOVHPD (%RDI,%RSI,8),%XMM0,%XMM6 |
(81) 0x40e56a MOV -0xa0(%RBP),%ESI |
(81) 0x40e570 VFMSUB132PD (%RBX),%XMM6,%XMM3 |
(81) 0x40e575 MOV -0x98(%RBP),%RDI |
(81) 0x40e57c VFMSUB132SD 0x10(%RBX),%XMM12,%XMM15 |
(81) 0x40e582 VMOVAPD %XMM3,-0x60(%RBP) |
(81) 0x40e587 VMOVSD %XMM15,-0x50(%RBP) |
(81) 0x40e58c CALL 488a00 <_ZN11qmcplusplus11ParticleSet8makeMoveEiRKNS_10TinyVectorIdLj3EEE> |
(81) 0x40e591 MOV 0x10(%R12),%R9 |
(81) 0x40e596 MOV (%R9),%RCX |
(81) 0x40e599 MOV 0x20(%RCX),%RDI |
(81) 0x40e59d CALL 4a40e0 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> |
(81) 0x40e5a2 MOV -0x98(%RBP),%RSI |
(81) 0x40e5a9 MOV -0xa0(%RBP),%EDX |
(81) 0x40e5af MOV -0xa8(%RBP),%RDI |
(81) 0x40e5b6 CALL 413ab0 <_ZN11qmcplusplus12WaveFunction5ratioERNS_11ParticleSetEi> |
(81) 0x40e5bb MOV 0x10(%R12),%R11 |
(81) 0x40e5c0 MOV (%R11),%RDX |
(81) 0x40e5c3 MOV 0x20(%RDX),%RDI |
(81) 0x40e5c7 CALL 4a4310 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> |
(81) 0x40e5cc MOV -0xa0(%RBP),%ESI |
(81) 0x40e5d2 MOV -0x98(%RBP),%RDI |
(81) 0x40e5d9 CALL 488ae0 <_ZN11qmcplusplus11ParticleSet10rejectMoveEi> |
(81) 0x40e5de LEA 0x18(%RBX),%R8 |
(81) 0x40e5e2 CMP %R8,-0xc8(%RBP) |
(81) 0x40e5e9 JNE 40e480 |
0x40e5ef MOV %R15,%R14 |
0x40e5f2 MOV -0xa0(%RBP),%EBX |
0x40e5f8 MOV -0x98(%RBP),%R15 |
0x40e5ff INC %R14 |
0x40e602 CMP %R14,-0xe0(%RBP) |
0x40e609 JE 40e7be |
0x40e60f MOV -0xb0(%RBP),%RAX |
0x40e616 VMOVSD -0x108(%RBP),%XMM11 |
0x40e61e VMOVSD (%RAX,%R14,8),%XMM15 |
0x40e624 VCOMISD %XMM15,%XMM11 |
0x40e629 JA 40e433 |
0x40e6f8 MOVSXD 0x8(%R13),%R8 |
0x40e6fc MOV 0x18(%R13),%RAX |
0x40e700 VMOVDDUP %XMM15,%XMM8 |
0x40e705 MOV -0xd8(%RBP),%RSI |
0x40e70c MOV -0xb8(%RBP),%RDX |
0x40e713 LEA (%R8,%R14,1),%RDI |
0x40e717 VMOVSD (%RAX,%R14,8),%XMM7 |
0x40e71d ADD %RDI,%R8 |
0x40e720 VMOVSD (%RAX,%R8,8),%XMM9 |
0x40e726 VMOVHPD (%RAX,%RDI,8),%XMM7,%XMM10 |
0x40e72b MOV %R15,%RDI |
0x40e72e VFMSUB132PD (%RSI),%XMM10,%XMM8 |
0x40e733 VFMSUB132SD 0x10(%RSI),%XMM9,%XMM15 |
0x40e739 MOV %EBX,%ESI |
0x40e73b VMOVAPD %XMM8,-0x60(%RBP) |
0x40e740 VMOVSD %XMM15,-0x50(%RBP) |
0x40e745 CALL 488a00 <_ZN11qmcplusplus11ParticleSet8makeMoveEiRKNS_10TinyVectorIdLj3EEE> |
0x40e74a MOV 0x10(%R12),%R10 |
0x40e74f MOV (%R10),%R9 |
0x40e752 MOV 0x20(%R9),%RDI |
0x40e756 CALL 4a40e0 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> |
0x40e75b MOV -0xa8(%RBP),%RDI |
0x40e762 MOV %EBX,%EDX |
0x40e764 MOV %R15,%RSI |
0x40e767 CALL 413ab0 <_ZN11qmcplusplus12WaveFunction5ratioERNS_11ParticleSetEi> |
0x40e76c MOV 0x10(%R12),%RCX |
0x40e771 MOV (%RCX),%R11 |
0x40e774 MOV 0x20(%R11),%RDI |
0x40e778 CALL 4a4310 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> |
0x40e77d MOV %EBX,%ESI |
0x40e77f MOV %R15,%RDI |
0x40e782 CALL 488ae0 <_ZN11qmcplusplus11ParticleSet10rejectMoveEi> |
0x40e787 MOV -0xd8(%RBP),%RDX |
0x40e78e MOV -0xb0(%RBP),%RAX |
0x40e795 MOV %R15,-0x98(%RBP) |
0x40e79c MOV %EBX,-0xa0(%RBP) |
0x40e7a2 MOV %R14,%R15 |
0x40e7a5 LEA 0x18(%RDX),%R8 |
0x40e7a9 VMOVSD (%RAX,%R14,8),%XMM15 |
0x40e7af MOV -0xe8(%RBP),%R14 |
0x40e7b6 MOV %R8,%RBX |
0x40e7b9 JMP 40e53e |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorOps.h: 85 - 85 |
-------------------------------------------------------------------------------- |
85: ret[d] = op(lhs[d], rhs[d]); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/PETE/OperatorTags.h: 53 - 63 |
-------------------------------------------------------------------------------- |
53: return (a - b); |
[...] |
63: return (a * b); |
/usr/include/c++/13.1.1/bits/stl_vector.h: 1126 - 1126 |
-------------------------------------------------------------------------------- |
1126: return *(this->_M_impl._M_start + __n); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Drivers/miniqmc.cpp: 476 - 488 |
-------------------------------------------------------------------------------- |
476: for (int iat = 0; iat < nions; ++iat) |
477: if (dist[iat] < Rmax) |
478: for (int k = 0; k < nknots; k++) |
479: { |
480: PosType deltar(dist[iat] * rOnSphere[k] - displ[iat]); |
481: |
482: els.makeMove(jel, deltar); |
483: |
484: Timers[Timer_Value].get().start(); |
485: wavefunction.ratio(els, jel); |
486: Timers[Timer_Value].get().stop(); |
487: |
488: els.rejectMove(jel); |
/usr/include/c++/13.1.1/bits/refwrap.h: 347 - 347 |
-------------------------------------------------------------------------------- |
347: { return *_M_data; } |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVector.h: 146 - 146 |
-------------------------------------------------------------------------------- |
146: X[i] = base[i * offset]; |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
○100.00 | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.32 |
CQA speedup if FP arith vectorized | 1.84 |
CQA speedup if fully vectorized | 12.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.23 |
Bottlenecks | |
Function | main._omp_fn.1 |
Source | TinyVectorOps.h:85-85,OperatorTags.h:53-63,stl_vector.h:1126-1126,miniqmc.cpp:476-488,refwrap.h:347-347,TinyVector.h:146-146 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 9.88 |
CQA cycles if no scalar integer | 4.25 |
CQA cycles if FP arith vectorized | 5.36 |
CQA cycles if fully vectorized | 0.82 |
Front-end cycles | 9.88 |
DIV/SQRT cycles | 3.25 |
P0 cycles | 3.19 |
P1 cycles | 7.67 |
P2 cycles | 7.67 |
P3 cycles | 6.50 |
P4 cycles | 3.13 |
P5 cycles | 3.94 |
P6 cycles | 6.17 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 9.43 |
Stall cycles (UFS) | 0.00 |
Nb insns | 36.50 |
Nb uops | 39.00 |
Nb loads | 15.00 |
Nb stores | 4.00 |
Nb stack references | 8.75 |
FLOP/cycle | 0.30 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 1.50 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 14.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 120.00 |
Bytes stored | 32.00 |
Stride 0 | 3.75 |
Stride 1 | 1.75 |
Stride n | 3.00 |
Stride unknown | 7.25 |
Stride indirect | 1.75 |
Vectorization ratio all | 6.07 |
Vectorization ratio load | 8.59 |
Vectorization ratio store | 10.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 50.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.34 |
Vector-efficiency ratio load | 13.04 |
Vector-efficiency ratio store | 12.08 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 18.75 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.11 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.40 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 11.64 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.50 |
Bottlenecks | micro-operation queue, |
Function | main._omp_fn.1 |
Source | TinyVectorOps.h:85-85,OperatorTags.h:53-63,stl_vector.h:1126-1126,miniqmc.cpp:476-488,refwrap.h:347-347,TinyVector.h:146-146 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 6.00 |
CQA cycles if no scalar integer | 2.50 |
CQA cycles if FP arith vectorized | 6.00 |
CQA cycles if fully vectorized | 0.52 |
Front-end cycles | 6.00 |
DIV/SQRT cycles | 2.50 |
P0 cycles | 2.50 |
P1 cycles | 4.00 |
P2 cycles | 4.00 |
P3 cycles | 3.00 |
P4 cycles | 2.50 |
P5 cycles | 2.50 |
P6 cycles | 3.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 6.16 |
Stall cycles (UFS) | 0.00 |
Nb insns | 24.00 |
Nb uops | 24.00 |
Nb loads | 8.00 |
Nb stores | 3.00 |
Nb stack references | 8.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 13.33 |
Bytes prefetched | 0.00 |
Bytes loaded | 60.00 |
Bytes stored | 20.00 |
Stride 0 | 4.00 |
Stride 1 | 2.00 |
Stride n | 2.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.16 |
Vector-efficiency ratio load | 11.46 |
Vector-efficiency ratio store | 10.42 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 10.94 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 10.40 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.08 |
Bottlenecks | micro-operation queue, |
Function | main._omp_fn.1 |
Source | TinyVectorOps.h:85-85,OperatorTags.h:53-63,stl_vector.h:1126-1126,miniqmc.cpp:476-488,refwrap.h:347-347,TinyVector.h:146-146 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 3.25 |
CQA cycles if no scalar integer | 3.25 |
CQA cycles if FP arith vectorized | 3.25 |
CQA cycles if fully vectorized | 0.31 |
Front-end cycles | 3.25 |
DIV/SQRT cycles | 1.25 |
P0 cycles | 1.25 |
P1 cycles | 1.67 |
P2 cycles | 1.67 |
P3 cycles | 3.00 |
P4 cycles | 1.25 |
P5 cycles | 1.25 |
P6 cycles | 1.67 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 3.35 |
Stall cycles (UFS) | 0.00 |
Nb insns | 13.00 |
Nb uops | 13.00 |
Nb loads | 2.00 |
Nb stores | 3.00 |
Nb stack references | 5.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 11.08 |
Bytes prefetched | 0.00 |
Bytes loaded | 16.00 |
Bytes stored | 20.00 |
Stride 0 | 5.00 |
Stride 1 | 3.00 |
Stride n | 2.00 |
Stride unknown | 8.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.25 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | 10.42 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.64 |
CQA speedup if FP arith vectorized | 2.51 |
CQA speedup if fully vectorized | 12.15 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.18 |
Bottlenecks | micro-operation queue, |
Function | main._omp_fn.1 |
Source | TinyVectorOps.h:85-85,OperatorTags.h:53-63,stl_vector.h:1126-1126,miniqmc.cpp:476-488,refwrap.h:347-347,TinyVector.h:146-146 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 16.50 |
CQA cycles if no scalar integer | 6.25 |
CQA cycles if FP arith vectorized | 6.56 |
CQA cycles if fully vectorized | 1.36 |
Front-end cycles | 16.50 |
DIV/SQRT cycles | 5.50 |
P0 cycles | 5.25 |
P1 cycles | 14.00 |
P2 cycles | 14.00 |
P3 cycles | 10.00 |
P4 cycles | 5.25 |
P5 cycles | 6.00 |
P6 cycles | 10.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 15.49 |
Stall cycles (UFS) | 0.00 |
Nb insns | 60.00 |
Nb uops | 65.00 |
Nb loads | 28.00 |
Nb stores | 5.00 |
Nb stack references | 12.00 |
FLOP/cycle | 0.36 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 3.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 224.00 |
Bytes stored | 44.00 |
Stride 0 | 3.00 |
Stride 1 | 0.00 |
Stride n | 4.00 |
Stride unknown | 7.00 |
Stride indirect | 2.00 |
Vectorization ratio all | 10.00 |
Vectorization ratio load | 9.09 |
Vectorization ratio store | 20.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | 50.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 13.13 |
Vector-efficiency ratio load | 13.07 |
Vector-efficiency ratio store | 13.75 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | 18.75 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.75 |
CQA speedup if FP arith vectorized | 2.44 |
CQA speedup if fully vectorized | 12.44 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.25 |
Bottlenecks | micro-operation queue, |
Function | main._omp_fn.1 |
Source | TinyVectorOps.h:85-85,OperatorTags.h:53-63,stl_vector.h:1126-1126,miniqmc.cpp:476-488,refwrap.h:347-347,TinyVector.h:146-146 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 13.75 |
CQA cycles if no scalar integer | 5.00 |
CQA cycles if FP arith vectorized | 5.63 |
CQA cycles if fully vectorized | 1.10 |
Front-end cycles | 13.75 |
DIV/SQRT cycles | 3.75 |
P0 cycles | 3.75 |
P1 cycles | 11.00 |
P2 cycles | 11.00 |
P3 cycles | 10.00 |
P4 cycles | 3.50 |
P5 cycles | 6.00 |
P6 cycles | 10.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 12.73 |
Stall cycles (UFS) | 0.00 |
Nb insns | 49.00 |
Nb uops | 54.00 |
Nb loads | 22.00 |
Nb stores | 5.00 |
Nb stack references | 10.00 |
FLOP/cycle | 0.44 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 3.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.29 |
Bytes prefetched | 0.00 |
Bytes loaded | 180.00 |
Bytes stored | 44.00 |
Stride 0 | 3.00 |
Stride 1 | 2.00 |
Stride n | 4.00 |
Stride unknown | 11.00 |
Stride indirect | 3.00 |
Vectorization ratio all | 14.29 |
Vectorization ratio load | 16.67 |
Vectorization ratio store | 20.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | 50.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 13.84 |
Vector-efficiency ratio load | 14.58 |
Vector-efficiency ratio store | 13.75 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | 18.75 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | main._omp_fn.1 |
Source file and lines | miniqmc.cpp:476-488 |
Module | exec |
nb instructions | 36.50 |
nb uops | 39 |
loop length | 185.50 |
used x86 registers | 11.25 |
used mmx registers | 0 |
used xmm registers | 3.25 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 8.75 |
micro-operation queue | 9.88 cycles |
front end | 9.88 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 3.25 | 3.19 | 7.67 | 7.67 | 6.50 | 3.13 | 3.94 | 6.17 |
cycles | 3.25 | 3.19 | 7.67 | 7.67 | 6.50 | 3.13 | 3.94 | 6.17 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 9.43 |
Stall cycles | 0.00 |
Front-end | 9.88 |
Dispatch | 8.00 |
Data deps. | 0.00 |
Overall L1 | 9.88 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 9% |
store | 50% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 6% |
load | 8% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 11% |
all | 14% |
load | 13% |
store | 18% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 12% |
load | 13% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Function | main._omp_fn.1 |
Source file and lines | miniqmc.cpp:476-488 |
Module | exec |
nb instructions | 24 |
nb uops | 24 |
loop length | 132 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 8 |
micro-operation queue | 6.00 cycles |
front end | 6.00 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.00 | 4.00 | 3.00 | 2.50 | 2.50 | 3.00 |
cycles | 2.50 | 2.50 | 4.00 | 4.00 | 3.00 | 2.50 | 2.50 | 3.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 6.16 |
Stall cycles | 0.00 |
Front-end | 6.00 |
Dispatch | 4.00 |
Data deps. | 0.00 |
Overall L1 | 6.00 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 10% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 11% |
load | 11% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0xd8(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xc8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (,%R14,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0xe8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
SUB %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
AND $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 40e6f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %R15,-0x98(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R14,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R9,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %EBX,-0xa0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 40e53e | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
MOV %R15,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0xa0(%RBP),%EBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0x98(%RBP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
INC %R14 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R14,-0xe0(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JE 40e7be | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0xb0(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD -0x108(%RBP),%XMM11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RAX,%R14,8),%XMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VCOMISD %XMM15,%XMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
JA 40e433 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | main._omp_fn.1 |
Source file and lines | miniqmc.cpp:476-488 |
Module | exec |
nb instructions | 13 |
nb uops | 13 |
loop length | 68 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 5 |
micro-operation queue | 3.25 cycles |
front end | 3.25 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 1.25 | 1.25 | 1.67 | 1.67 | 3.00 | 1.25 | 1.25 | 1.67 |
cycles | 1.25 | 1.25 | 1.67 | 1.67 | 3.00 | 1.25 | 1.25 | 1.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 3.35 |
Stall cycles | 0.00 |
Front-end | 3.25 |
Dispatch | 3.00 |
Data deps. | 0.00 |
Overall L1 | 3.25 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | NA (no load vectorizable/vectorized instructions) |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0xd8(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xc8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (,%R14,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0xe8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
SUB %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
AND $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 40e6f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %R15,-0x98(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R14,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R9,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %EBX,-0xa0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 40e53e | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
Function | main._omp_fn.1 |
Source file and lines | miniqmc.cpp:476-488 |
Module | exec |
nb instructions | 60 |
nb uops | 65 |
loop length | 303 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 12 |
micro-operation queue | 16.50 cycles |
front end | 16.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 5.50 | 5.25 | 14.00 | 14.00 | 10.00 | 5.25 | 6.00 | 10.00 |
cycles | 5.50 | 5.25 | 14.00 | 14.00 | 10.00 | 5.25 | 6.00 | 10.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 15.49 |
Stall cycles | 0.00 |
Front-end | 16.50 |
Dispatch | 14.00 |
Data deps. | 0.00 |
Overall L1 | 16.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 16% |
load | 12% |
store | 50% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 10% |
load | 9% |
store | 20% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 10% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 14% |
load | 14% |
store | 18% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 13% |
load | 13% |
store | 13% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0xd8(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xc8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (,%R14,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0xe8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
SUB %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
AND $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 40e6f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %R15,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0xa0(%RBP),%EBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0x98(%RBP),%R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
INC %R14 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R14,-0xe0(%RBP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JE 40e7be | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0xb0(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD -0x108(%RBP),%XMM11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%RAX,%R14,8),%XMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VCOMISD %XMM15,%XMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
JA 40e433 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOVSXD 0x8(%R13),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x18(%R13),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVDDUP %XMM15,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOV -0xd8(%RBP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xb8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R8,%R14,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD (%RAX,%R14,8),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %RDI,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVSD (%RAX,%R8,8),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVHPD (%RAX,%RDI,8),%XMM7,%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 4 | 1 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMSUB132PD (%RSI),%XMM10,%XMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMSUB132SD 0x10(%RSI),%XMM9,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %EBX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %XMM8,-0x60(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM15,-0x50(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
CALL 488a00 <_ZN11qmcplusplus11ParticleSet8makeMoveEiRKNS_10TinyVectorIdLj3EEE> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0x10(%R12),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%R10),%R9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x20(%R9),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CALL 4a40e0 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV -0xa8(%RBP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %EBX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R15,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 413ab0 <_ZN11qmcplusplus12WaveFunction5ratioERNS_11ParticleSetEi> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0x10(%R12),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%RCX),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x20(%R11),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CALL 4a4310 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV %EBX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 488ae0 <_ZN11qmcplusplus11ParticleSet10rejectMoveEi> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV -0xd8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xb0(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %R15,-0x98(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %EBX,-0xa0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R14,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
LEA 0x18(%RDX),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD (%RAX,%R14,8),%XMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xe8(%RBP),%R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %R8,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 40e53e | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
Function | main._omp_fn.1 |
Source file and lines | miniqmc.cpp:476-488 |
Module | exec |
nb instructions | 49 |
nb uops | 54 |
loop length | 239 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 5 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 13.75 cycles |
front end | 13.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 3.75 | 3.75 | 11.00 | 11.00 | 10.00 | 3.50 | 6.00 | 10.00 |
cycles | 3.75 | 3.75 | 11.00 | 11.00 | 10.00 | 3.50 | 6.00 | 10.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 12.73 |
Stall cycles | 0.00 |
Front-end | 13.75 |
Dispatch | 11.00 |
Data deps. | 0.00 |
Overall L1 | 13.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 22% |
load | 16% |
store | 50% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 14% |
load | 16% |
store | 20% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 50% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | NA (no load vectorizable/vectorized instructions) |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 15% |
load | 14% |
store | 18% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 13% |
load | 14% |
store | 13% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 18% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0xd8(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xc8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (,%R14,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0xe8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
SUB %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
AND $0x8,%EDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 40e6f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOVSXD 0x8(%R13),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x18(%R13),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVDDUP %XMM15,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOV -0xd8(%RBP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xb8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R8,%R14,1),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD (%RAX,%R14,8),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
ADD %RDI,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVSD (%RAX,%R8,8),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVHPD (%RAX,%RDI,8),%XMM7,%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 4 | 1 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VFMSUB132PD (%RSI),%XMM10,%XMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMSUB132SD 0x10(%RSI),%XMM9,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %EBX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %XMM8,-0x60(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM15,-0x50(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
CALL 488a00 <_ZN11qmcplusplus11ParticleSet8makeMoveEiRKNS_10TinyVectorIdLj3EEE> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0x10(%R12),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%R10),%R9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x20(%R9),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CALL 4a40e0 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE5startEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV -0xa8(%RBP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %EBX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R15,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 413ab0 <_ZN11qmcplusplus12WaveFunction5ratioERNS_11ParticleSetEi> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0x10(%R12),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%RCX),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x20(%R11),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CALL 4a4310 <_ZN11qmcplusplus9TimerTypeINS_8CPUClockEE4stopEv> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV %EBX,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R15,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 488ae0 <_ZN11qmcplusplus11ParticleSet10rejectMoveEi> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV -0xd8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xb0(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %R15,-0x98(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %EBX,-0xa0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R14,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
LEA 0x18(%RDX),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD (%RAX,%R14,8),%XMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0xe8(%RBP),%R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %R8,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 40e53e | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |