Loop Id: 942 | Module: exec | Source: inner_product.hpp:82-154 [...] | Coverage: 0.01% |
---|
Loop Id: 942 | Module: exec | Source: inner_product.hpp:82-154 [...] | Coverage: 0.01% |
---|
0x4489c0 VMOVUPD %XMM2,-0x40(%RBP) |
0x4489c5 VMOVSD %XMM3,-0x30(%RBP) |
0x4489ca LEA (%R14,%R13,1),%RAX |
0x4489ce LEA (%RAX,%RAX,2),%RCX |
0x4489d2 VMOVUPD -0x40(%RBP),%XMM2 |
0x4489d7 VADDPD (%R15,%RCX,8),%XMM2,%XMM3 |
0x4489dd VMOVUPD %XMM3,(%R15,%RCX,8) |
0x4489e3 VMOVSD 0x10(%R15,%RCX,8),%XMM3 |
0x4489ea VADDSD -0x30(%RBP),%XMM3,%XMM3 |
0x4489ef VMOVSD %XMM3,0x10(%R15,%RCX,8) |
0x4489f6 VMOVUPD -0x38(%RBP),%XMM3 |
0x4489fb VMULPD %XMM3,%XMM3,%XMM3 |
0x4489ff VPERMILPD $0x1,%XMM3,%XMM4 |
0x448a05 VADDSD %XMM4,%XMM3,%XMM3 |
0x448a09 VSUBSD %XMM3,%XMM1,%XMM1 |
0x448a0d VFMSUB213SD (%R12,%RAX,8),%XMM2,%XMM2 |
0x448a13 VSUBSD %XMM2,%XMM1,%XMM1 |
0x448a17 VMOVSD %XMM1,(%R12,%RAX,8) |
0x448a1d LEA 0x1(%R13),%RAX |
0x448a21 ADD -0x90(%RBP),%RDI |
0x448a28 ADD %RSI,%R10 |
0x448a2b ADD -0x88(%RBP),%R11 |
0x448a32 CMP %R8,%R13 |
0x448a35 MOV %RAX,%R13 |
0x448a38 JE 448b47 |
0x448a3e VMOVUPD %XMM0,-0x40(%RBP) |
0x448a43 MOVQ $0,-0x30(%RBP) |
0x448a4b TEST %EBX,%EBX |
0x448a4d JLE 448a70 |
0x448a4f VXORPD %XMM2,%XMM2,%XMM2 |
0x448a53 VXORPD %XMM1,%XMM1,%XMM1 |
0x448a57 CMP $0x1,%EBX |
0x448a5a JNE 448a80 |
0x448a5c VXORPD %XMM3,%XMM3,%XMM3 |
0x448a60 XOR %R9D,%R9D |
0x448a63 JMP 448ace |
0x448a70 VXORPD %XMM1,%XMM1,%XMM1 |
0x448a74 JMP 4489ca |
0x448a80 MOV %R10,%RCX |
0x448a83 VXORPD %XMM3,%XMM3,%XMM3 |
0x448a87 XOR %R9D,%R9D |
0x448a8a NOPW (%RAX,%RAX,1) |
(943) 0x448a90 VMOVDDUP -0x8(%RDI,%R9,8),%XMM4 |
(943) 0x448a97 VFMADD231PD -0x28(%RCX),%XMM4,%XMM2 |
(943) 0x448a9d VFMADD231SD -0x18(%RCX),%XMM4,%XMM3 |
(943) 0x448aa3 VFMADD231SD -0x8(%R11,%R9,8),%XMM4,%XMM1 |
(943) 0x448aaa VMOVDDUP (%RDI,%R9,8),%XMM4 |
(943) 0x448ab0 VFMADD231PD -0x10(%RCX),%XMM4,%XMM2 |
(943) 0x448ab6 VFMADD231SD (%RCX),%XMM4,%XMM3 |
(943) 0x448abb VFMADD231SD (%R11,%R9,8),%XMM4,%XMM1 |
(943) 0x448ac1 ADD $0x2,%R9 |
(943) 0x448ac5 ADD $0x30,%RCX |
(943) 0x448ac9 CMP %R9,%RDX |
(943) 0x448acc JNE 448a90 |
0x448ace TEST $0x1,%BL |
0x448ad1 JE 4489c0 |
0x448ad7 MOV -0x80(%RBP),%RCX |
0x448adb IMUL %R13,%RCX |
0x448adf ADD %R9,%RCX |
0x448ae2 MOV %R8,%RBX |
0x448ae5 MOV %RSI,%R8 |
0x448ae8 MOV %R12,%RSI |
0x448aeb MOV %R14,%R12 |
0x448aee MOV -0x68(%RBP),%R14 |
0x448af2 IMUL %R13,%R14 |
0x448af6 ADD %R9,%R14 |
0x448af9 LEA (%R14,%R14,2),%RAX |
0x448afd MOV %R12,%R14 |
0x448b00 MOV %RSI,%R12 |
0x448b03 MOV %RBX,%RSI |
0x448b06 MOV -0x78(%RBP),%RBX |
0x448b0a VMOVDDUP (%RBX,%RCX,8),%XMM4 |
0x448b0f MOV %RSI,%RCX |
0x448b12 MOV %R8,%RSI |
0x448b15 MOV %RCX,%R8 |
0x448b18 MOV -0x50(%RBP),%RBX |
0x448b1c MOV -0x70(%RBP),%RCX |
0x448b20 VFMADD231PD (%RCX,%RAX,8),%XMM4,%XMM2 |
0x448b26 VFMADD231SD 0x10(%RCX,%RAX,8),%XMM4,%XMM3 |
0x448b2d MOV -0x58(%RBP),%RAX |
0x448b31 IMUL %R13,%RAX |
0x448b35 ADD %R9,%RAX |
0x448b38 MOV -0x60(%RBP),%RCX |
0x448b3c VFMADD231SD (%RCX,%RAX,8),%XMM4,%XMM1 |
0x448b42 JMP 4489c0 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorOps.h: 131 - 131 |
-------------------------------------------------------------------------------- |
131: res += lhs[d] * rhs[d]; |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/PETE/OperatorTags.h: 63 - 183 |
-------------------------------------------------------------------------------- |
63: return (a * b); |
[...] |
94: (const_cast<T1&>(a) += b); |
[...] |
183: return (const_cast<T1&>(a) = b); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Platforms/CPU/SIMD/inner_product.hpp: 82 - 154 |
-------------------------------------------------------------------------------- |
82: res += a[i] * b[i]; |
[...] |
154: for (int i = 0; i < n; i++) |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/DiracDeterminantRef.cpp: 238 - 252 |
-------------------------------------------------------------------------------- |
238: if (NumPtcls == 1) |
[...] |
247: for (int i = 0, iat = FirstIndex; i < NumPtcls; i++, iat++) |
248: { |
249: mGradType rv = simd::dot(psiM[i], dpsiM[i], NumOrbitals); |
250: mValueType lap = simd::dot(psiM[i], d2psiM[i], NumOrbitals); |
251: G[iat] += rv; |
252: L[iat] += lap - dot(rv, rv); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVector.h: 176 - 177 |
-------------------------------------------------------------------------------- |
176: inline Type_t& operator[](unsigned int i) { return X[i]; } |
177: inline const Type_t& operator[](unsigned int i) const { return X[i]; } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | qmcplusplus::WaveFunction::eva[...] | WaveFunction.cpp:176 | exec |
○ | main.extracted.107 | miniqmc.cpp:375 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:374 | exec |
○ | __libc_init_first | libc.so.6 | |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.90 |
CQA speedup if FP arith vectorized | 2.63 |
CQA speedup if fully vectorized | 10.07 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 2.05 |
Bottlenecks | micro-operation queue |
Function | miniqmcreference::DiracDeterminantRef |
Source | TinyVectorOps.h:131-131,OperatorTags.h:63-183,inner_product.hpp:82-154,DiracDeterminantRef.cpp:238-252,TinyVector.h:176-177 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 19.50 |
CQA cycles if no scalar integer | 10.25 |
CQA cycles if FP arith vectorized | 7.41 |
CQA cycles if fully vectorized | 1.94 |
Front-end cycles | 19.50 |
P0 cycles | 8.75 |
P1 cycles | 8.75 |
P2 cycles | 9.50 |
P3 cycles | 9.50 |
P4 cycles | 7.00 |
P5 cycles | 8.75 |
P6 cycles | 8.75 |
P7 cycles | 7.00 |
DIV/SQRT cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 19.80 |
Stall cycles (UFS) | 0.00 |
Nb insns | 73.00 |
Nb uops | 73.00 |
Nb loads | 19.00 |
Nb stores | 7.00 |
Nb stack references | 12.00 |
FLOP/cycle | 0.92 |
Nb FLOP add-sub | 6.00 |
Nb FLOP mul | 2.00 |
Nb FLOP fma | 5.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 13.54 |
Bytes prefetched | 0.00 |
Bytes loaded | 184.00 |
Bytes stored | 80.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 36.84 |
Vectorization ratio load | 33.33 |
Vectorization ratio store | 42.86 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 12.50 |
Vectorization ratio fma | 25.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 42.86 |
Vector-efficiency ratio all | 16.61 |
Vector-efficiency ratio load | 16.67 |
Vector-efficiency ratio store | 16.96 |
Vector-efficiency ratio mul | 25.00 |
Vector-efficiency ratio add_sub | 14.06 |
Vector-efficiency ratio fma | 15.63 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 16.96 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.90 |
CQA speedup if FP arith vectorized | 2.63 |
CQA speedup if fully vectorized | 10.07 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 2.05 |
Bottlenecks | micro-operation queue |
Function | miniqmcreference::DiracDeterminantRef |
Source | TinyVectorOps.h:131-131,OperatorTags.h:63-183,inner_product.hpp:82-154,DiracDeterminantRef.cpp:238-252,TinyVector.h:176-177 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 19.50 |
CQA cycles if no scalar integer | 10.25 |
CQA cycles if FP arith vectorized | 7.41 |
CQA cycles if fully vectorized | 1.94 |
Front-end cycles | 19.50 |
P0 cycles | 8.75 |
P1 cycles | 8.75 |
P2 cycles | 9.50 |
P3 cycles | 9.50 |
P4 cycles | 7.00 |
P5 cycles | 8.75 |
P6 cycles | 8.75 |
P7 cycles | 7.00 |
DIV/SQRT cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 19.80 |
Stall cycles (UFS) | 0.00 |
Nb insns | 73.00 |
Nb uops | 73.00 |
Nb loads | 19.00 |
Nb stores | 7.00 |
Nb stack references | 12.00 |
FLOP/cycle | 0.92 |
Nb FLOP add-sub | 6.00 |
Nb FLOP mul | 2.00 |
Nb FLOP fma | 5.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 13.54 |
Bytes prefetched | 0.00 |
Bytes loaded | 184.00 |
Bytes stored | 80.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 36.84 |
Vectorization ratio load | 33.33 |
Vectorization ratio store | 42.86 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 12.50 |
Vectorization ratio fma | 25.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 42.86 |
Vector-efficiency ratio all | 16.61 |
Vector-efficiency ratio load | 16.67 |
Vector-efficiency ratio store | 16.96 |
Vector-efficiency ratio mul | 25.00 |
Vector-efficiency ratio add_sub | 14.06 |
Vector-efficiency ratio fma | 15.63 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 16.96 |
Path / |
Function | miniqmcreference::DiracDeterminantRef |
Source file and lines | inner_product.hpp:82-154 |
Module | exec |
nb instructions | 73 |
nb uops | 73 |
loop length | 311 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 5 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 12 |
ADD-SUB / MUL ratio | 5.00 |
micro-operation queue | 19.50 cycles |
front end | 19.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 8.75 | 8.75 | 9.50 | 9.50 | 7.00 | 8.75 | 8.75 | 7.00 |
cycles | 8.75 | 8.75 | 9.50 | 9.50 | 7.00 | 8.75 | 8.75 | 7.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 19.80 |
Stall cycles | 0.00 |
Front-end | 19.50 |
Dispatch | 9.50 |
Overall L1 | 19.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 53% |
load | 40% |
store | 50% |
mul | 100% |
add-sub | 20% |
fma | 25% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 36% |
load | 33% |
store | 42% |
mul | 100% |
add-sub | 12% |
fma | 25% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 42% |
all | 10% |
load | 12% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 19% |
load | 17% |
store | 18% |
mul | 25% |
add-sub | 15% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 16% |
load | 16% |
store | 16% |
mul | 25% |
add-sub | 14% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 16% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD %XMM2,-0x40(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM3,-0x30(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R14,%R13,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%RAX,%RAX,2),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD -0x40(%RBP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VADDPD (%R15,%RCX,8),%XMM2,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM3,(%R15,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD 0x10(%R15,%RCX,8),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VADDSD -0x30(%RBP),%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM3,0x10(%R15,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD -0x38(%RBP),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULPD %XMM3,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM3,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSD %XMM4,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM3,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMSUB213SD (%R12,%RAX,8),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM1,(%R12,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA 0x1(%R13),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD -0x90(%RBP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
ADD %RSI,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD -0x88(%RBP),%R11 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
CMP %R8,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %RAX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JE 448b47 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVUPD %XMM0,-0x40(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOVQ $0,-0x30(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 2 | 1 |
TEST %EBX,%EBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JLE 448a70 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CMP $0x1,%EBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 448a80 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 448ace | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 4489ca | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
MOV %R10,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
TEST $0x1,%BL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 4489c0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x80(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL %R13,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R9,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R8,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSI,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R12,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R14,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0x68(%RBP),%R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL %R13,%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R9,%R14 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%R14,%R14,2),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R12,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RBX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0x78(%RBP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVDDUP (%RBX,%RCX,8),%XMM4 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R8,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RCX,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0x50(%RBP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0x70(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD231PD (%RCX,%RAX,8),%XMM4,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RCX,%RAX,8),%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL %R13,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R9,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV -0x60(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD231SD (%RCX,%RAX,8),%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JMP 4489c0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
Function | miniqmcreference::DiracDeterminantRef |
Source file and lines | inner_product.hpp:82-154 |
Module | exec |
nb instructions | 73 |
nb uops | 73 |
loop length | 311 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 5 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 12 |
ADD-SUB / MUL ratio | 5.00 |
micro-operation queue | 19.50 cycles |
front end | 19.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 8.75 | 8.75 | 9.50 | 9.50 | 7.00 | 8.75 | 8.75 | 7.00 |
cycles | 8.75 | 8.75 | 9.50 | 9.50 | 7.00 | 8.75 | 8.75 | 7.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 19.80 |
Stall cycles | 0.00 |
Front-end | 19.50 |
Dispatch | 9.50 |
Overall L1 | 19.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 53% |
load | 40% |
store | 50% |
mul | 100% |
add-sub | 20% |
fma | 25% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 36% |
load | 33% |
store | 42% |
mul | 100% |
add-sub | 12% |
fma | 25% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 42% |
all | 10% |
load | 12% |
store | 6% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 19% |
load | 17% |
store | 18% |
mul | 25% |
add-sub | 15% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 16% |
load | 16% |
store | 16% |
mul | 25% |
add-sub | 14% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 16% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD %XMM2,-0x40(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM3,-0x30(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R14,%R13,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%RAX,%RAX,2),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVUPD -0x40(%RBP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VADDPD (%R15,%RCX,8),%XMM2,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM3,(%R15,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD 0x10(%R15,%RCX,8),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VADDSD -0x30(%RBP),%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM3,0x10(%R15,%RCX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD -0x38(%RBP),%XMM3 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULPD %XMM3,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM3,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSD %XMM4,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM3,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMSUB213SD (%R12,%RAX,8),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM1,(%R12,%RAX,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA 0x1(%R13),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD -0x90(%RBP),%RDI | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
ADD %RSI,%R10 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
ADD -0x88(%RBP),%R11 | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
CMP %R8,%R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %RAX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JE 448b47 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VMOVUPD %XMM0,-0x40(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOVQ $0,-0x30(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 2 | 1 |
TEST %EBX,%EBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JLE 448a70 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CMP $0x1,%EBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 448a80 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 448ace | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
VXORPD %XMM1,%XMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 4489ca | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
MOV %R10,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
TEST $0x1,%BL | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 4489c0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x80(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL %R13,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R9,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R8,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSI,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R12,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R14,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0x68(%RBP),%R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL %R13,%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R9,%R14 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%R14,%R14,2),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R12,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RSI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RBX,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0x78(%RBP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVDDUP (%RBX,%RCX,8),%XMM4 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %R8,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RCX,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV -0x50(%RBP),%RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0x70(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD231PD (%RCX,%RAX,8),%XMM4,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0x10(%RCX,%RAX,8),%XMM4,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
IMUL %R13,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R9,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV -0x60(%RBP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD231SD (%RCX,%RAX,8),%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
JMP 4489c0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |