Loop Id: 827 | Module: exec | Source: MultiBsplineRef.hpp:43-74 [...] | Coverage: 0.02% |
---|
Loop Id: 827 | Module: exec | Source: MultiBsplineRef.hpp:43-74 [...] | Coverage: 0.02% |
---|
0x43bbd0 MOV 0xb8(%RSP),%RDX |
0x43bbd8 INC %RDX |
0x43bbdb CMP 0xa0(%RSP),%RDX |
0x43bbe3 JGE 43c1b4 |
0x43bbe9 MOV 0xb0(%RSP),%RAX |
0x43bbf1 MOV (%RAX,%RDX,8),%R14 |
0x43bbf5 VMOVUPD 0x160(%RSP),%XMM0 |
0x43bbfe VSUBSD 0x78(%R14),%XMM0,%XMM0 |
0x43bc04 VMULSD 0x98(%R14),%XMM0,%XMM0 |
0x43bc0d MOV 0x88(%R14),%R15D |
0x43bc14 DEC %R15D |
0x43bc17 VROUNDSD $0x9,%XMM0,%XMM0,%XMM1 |
0x43bc1d VCVTTSD2SI %XMM1,%EAX |
0x43bc21 MOV %EAX,%ECX |
0x43bc23 SAR $0x1f,%ECX |
0x43bc26 ANDN %EAX,%ECX,%EAX |
0x43bc2b CMP %EAX,%R15D |
0x43bc2e CMOVGE %EAX,%R15D |
0x43bc32 MOV %RDX,0xb8(%RSP) |
0x43bc3a LEA (%RDX,%RDX,2),%RAX |
0x43bc3e MOV 0xa8(%RSP),%RCX |
0x43bc46 MOV (%RCX,%RAX,8),%R12 |
0x43bc4a VSUBSD %XMM1,%XMM0,%XMM13 |
0x43bc4e VMULSD %XMM12,%XMM13,%XMM11 |
0x43bc53 VMOVSD 0x5a3b5(%RIP),%XMM8 |
0x43bc5b VSUBSD %XMM11,%XMM8,%XMM16 |
0x43bc61 VFMADD213SD 0x5a40d(%RIP),%XMM13,%XMM16 |
0x43bc6b VMULSD %XMM8,%XMM13,%XMM0 |
0x43bc70 VMULSD %XMM13,%XMM13,%XMM15 |
0x43bc75 VSUBSD %XMM0,%XMM8,%XMM1 |
0x43bc79 VMULSD %XMM1,%XMM13,%XMM1 |
0x43bc7d VUNPCKLPD %XMM1,%XMM0,%XMM0 |
0x43bc81 VADDPD 0x5d3b7(%RIP),%XMM0,%XMM0 |
0x43bc89 VUNPCKLPD %XMM13,%XMM15,%XMM14 |
0x43bc8e VFMADD213PD 0x5d3b9(%RIP),%XMM0,%XMM14 |
0x43bc97 VMOVSD 0x28(%R14),%XMM0 |
0x43bc9d VMOVSD 0x48(%R14),%XMM1 |
0x43bca3 VMOVHPD 0x50(%R14),%XMM0,%XMM0 |
0x43bca9 VMOVHPD 0x70(%R14),%XMM1,%XMM1 |
0x43bcaf VMOVUPD 0x170(%RSP),%XMM2 |
0x43bcb8 VSUBPD %XMM0,%XMM2,%XMM0 |
0x43bcbc VMULPD %XMM0,%XMM1,%XMM0 |
0x43bcc0 VRNDSCALEPD $0x9,%XMM0,%XMM17 |
0x43bcc7 VMOVD 0x38(%R14),%XMM1 |
0x43bccd VSUBPD %XMM17,%XMM0,%XMM0 |
0x43bcd3 VPINSRD $0x1,0x60(%R14),%XMM1,%XMM18 |
0x43bcdb VPERMILPD $0x1,%XMM0,%XMM1 |
0x43bce1 VMULSD %XMM0,%XMM12,%XMM2 |
0x43bce5 VSUBSD %XMM2,%XMM8,%XMM3 |
0x43bce9 VMULPD %XMM0,%XMM0,%XMM4 |
0x43bced VMOVDDUP %XMM0,%XMM5 |
0x43bcf1 VMOVUPD 0x150(%RSP),%XMM10 |
0x43bcfa VBLENDPD $0x1,%XMM3,%XMM10,%XMM3 |
0x43bd00 VMULPD %XMM3,%XMM5,%XMM3 |
0x43bd04 VMOVUPD 0x5d354(%RIP),%XMM9 |
0x43bd0c VADDPD %XMM3,%XMM9,%XMM5 |
0x43bd10 VUNPCKLPD %XMM4,%XMM0,%XMM6 |
0x43bd14 VMOVUPD 0x5d354(%RIP),%XMM7 |
0x43bd1c VFMADD213PD %XMM7,%XMM5,%XMM6 |
0x43bd21 VMOVUPD %XMM6,0x1a0(%RSP) |
0x43bd2a VPERMILPD $0x1,%XMM3,%XMM3 |
0x43bd30 VSUBSD %XMM3,%XMM8,%XMM3 |
0x43bd34 VFMADD213SD %XMM8,%XMM0,%XMM3 |
0x43bd39 VFMADD213SD %XMM12,%XMM0,%XMM3 |
0x43bd3e VMOVSD %XMM3,0x1b0(%RSP) |
0x43bd47 VMULSD %XMM4,%XMM2,%XMM2 |
0x43bd4b VMOVSD %XMM2,0x1b8(%RSP) |
0x43bd54 VMULSD %XMM1,%XMM12,%XMM2 |
0x43bd58 VSUBSD %XMM2,%XMM8,%XMM3 |
0x43bd5c VMULSD %XMM1,%XMM1,%XMM4 |
0x43bd60 VPERMILPD $0x3,%XMM0,%XMM5 |
0x43bd66 VBLENDPD $0x1,%XMM3,%XMM10,%XMM3 |
0x43bd6c VMULPD %XMM3,%XMM5,%XMM3 |
0x43bd70 VADDPD %XMM3,%XMM9,%XMM5 |
0x43bd74 VSHUFPD $0x1,%XMM4,%XMM0,%XMM0 |
0x43bd79 VFMADD213PD %XMM7,%XMM5,%XMM0 |
0x43bd7e VMOVUPD %XMM0,0x180(%RSP) |
0x43bd87 VPERMILPD $0x1,%XMM3,%XMM0 |
0x43bd8d VSUBSD %XMM0,%XMM8,%XMM0 |
0x43bd91 VFMADD213SD %XMM8,%XMM1,%XMM0 |
0x43bd96 VFMADD213SD %XMM12,%XMM1,%XMM0 |
0x43bd9b VMOVSD %XMM0,0x190(%RSP) |
0x43bda4 VMULSD %XMM4,%XMM2,%XMM0 |
0x43bda8 VMOVSD %XMM0,0x198(%RSP) |
0x43bdb1 VMOVDQU 0x10(%R14),%XMM2 |
0x43bdb7 MOV 0x20(%R14),%R13 |
0x43bdbb CMPL $0,0x20(%RSP) |
0x43bdc0 JE 43be55 |
0x43bdc6 MOV %R12,%RDI |
0x43bdc9 XOR %ESI,%ESI |
0x43bdcb MOV 0x98(%RSP),%RDX |
0x43bdd3 VMOVUPD %XMM13,0x70(%RSP) |
0x43bdd9 VMOVUPD %YMM14,0x120(%RSP) |
0x43bde2 VMOVSD %XMM11,0x8(%RSP) |
0x43bde8 VMOVUPD %XMM15,0x30(%RSP) |
0x43bdee VMOVSD %XMM16,0x10(%RSP) |
0x43bdf6 VMOVUPD %XMM17,0x60(%RSP) |
0x43bdfe VMOVDQU64 %XMM18,0x50(%RSP) |
0x43be06 VMOVDQU %XMM2,0x40(%RSP) |
0x43be0c VZEROUPPER |
0x43be0f CALL 4879b0 <_intel_fast_memset> |
0x43be14 VMOVDQU 0x40(%RSP),%XMM2 |
0x43be1a VMOVDQU64 0x50(%RSP),%XMM18 |
0x43be22 VMOVUPD 0x60(%RSP),%XMM17 |
0x43be2a VMOVSD 0x10(%RSP),%XMM16 |
0x43be32 VMOVUPD 0x30(%RSP),%XMM15 |
0x43be38 VMOVSD 0x8(%RSP),%XMM11 |
0x43be3e VMOVUPD 0x120(%RSP),%YMM14 |
0x43be47 VMOVUPD 0x70(%RSP),%XMM13 |
0x43be4d VMOVSD 0x5d02b(%RIP),%XMM12 |
0x43be55 CMPL $0,0x20(%RSP) |
0x43be5a JE 43bbd0 |
0x43be60 VFMADD213SD %XMM12,%XMM16,%XMM13 |
0x43be66 VPCMPEQD %XMM0,%XMM0,%XMM0 |
0x43be6a VPADDD %XMM0,%XMM18,%XMM0 |
0x43be70 VCVTTPD2DQ %XMM17,%XMM1 |
0x43be76 VPMAXSD 0x5a981(%RIP),%XMM1,%XMM1 |
0x43be7f VPMINSD %XMM1,%XMM0,%XMM0 |
0x43be84 MOVSXD %R15D,%RDX |
0x43be87 MOV %R13,%R11 |
0x43be8a IMUL %RDX,%R11 |
0x43be8e VPMOVSXDQ %XMM0,%XMM0 |
0x43be93 VPMULLQ %XMM0,%XMM2,%XMM1 |
0x43be99 MOV 0x8(%R14),%R8 |
0x43be9d VMULSD %XMM15,%XMM11,%XMM11 |
0x43bea2 VPEXTRQ $0x1,%XMM1,%RCX |
0x43bea8 VMOVQ %XMM1,%RSI |
0x43bead ADD %RCX,%RSI |
0x43beb0 VMOVQ %XMM2,%RCX |
0x43beb5 VPEXTRQ $0x1,%XMM2,%RDI |
0x43bebb VBROADCASTSD %XMM13,%YMM1 |
0x43bec0 VBROADCASTSD %XMM14,%YMM2 |
0x43bec5 VPERMPD $0x55,%YMM14,%YMM3 |
0x43becb VBROADCASTSD %XMM11,%YMM4 |
0x43bed0 ADD %RSI,%R11 |
0x43bed3 LEA (%R8,%R11,8),%R14 |
0x43bed7 MOV %RCX,0xc8(%RSP) |
0x43bedf LEA (,%RCX,8),%RCX |
0x43bee7 MOV %RCX,0xc0(%RSP) |
0x43beef MOV %RDI,0x50(%RSP) |
0x43bef4 LEA (,%RDI,8),%RCX |
0x43befc MOV %RCX,0x40(%RSP) |
0x43bf01 LEA 0x3(%RDX),%RDI |
0x43bf05 IMUL %R13,%RDI |
0x43bf09 ADD %RSI,%RDI |
0x43bf0c LEA (%R8,%RDI,8),%RCX |
0x43bf10 LEA 0x2(%RDX),%RAX |
0x43bf14 IMUL %R13,%RAX |
0x43bf18 ADD %RSI,%RAX |
0x43bf1b INC %RDX |
0x43bf1e IMUL %R13,%RDX |
0x43bf22 LEA (%R8,%RAX,8),%R9 |
0x43bf26 ADD %RSI,%RDX |
0x43bf29 MOV %R8,0x118(%RSP) |
0x43bf31 LEA (%R8,%RDX,8),%R10 |
0x43bf35 XOR %R8D,%R8D |
0x43bf38 JMP 43bfb9 |
(829) 0x43bf40 LEA 0x1(%RSI),%R8 |
(829) 0x43bf44 MOV 0xf8(%RSP),%R14 |
(829) 0x43bf4c MOV 0xc0(%RSP),%RCX |
(829) 0x43bf54 ADD %RCX,%R14 |
(829) 0x43bf57 MOV 0xe8(%RSP),%R15 |
(829) 0x43bf5f ADD %RCX,%R15 |
(829) 0x43bf62 MOV 0xd8(%RSP),%R9 |
(829) 0x43bf6a ADD %RCX,%R9 |
(829) 0x43bf6d MOV 0xd0(%RSP),%R10 |
(829) 0x43bf75 ADD %RCX,%R10 |
(829) 0x43bf78 MOV 0x100(%RSP),%R11 |
(829) 0x43bf80 MOV 0xc8(%RSP),%RCX |
(829) 0x43bf88 ADD %RCX,%R11 |
(829) 0x43bf8b MOV 0xf0(%RSP),%RDI |
(829) 0x43bf93 ADD %RCX,%RDI |
(829) 0x43bf96 MOV 0xe0(%RSP),%RAX |
(829) 0x43bf9e ADD %RCX,%RAX |
(829) 0x43bfa1 MOV 0x108(%RSP),%RDX |
(829) 0x43bfa9 ADD %RCX,%RDX |
(829) 0x43bfac MOV %R15,%RCX |
(829) 0x43bfaf CMP $0x3,%RSI |
(829) 0x43bfb3 JE 43bbd0 |
(829) 0x43bfb9 MOV %R8,0x60(%RSP) |
(829) 0x43bfbe VMOVSD 0x1a0(%RSP,%R8,8),%XMM10 |
(829) 0x43bfc8 MOV %RDX,0x108(%RSP) |
(829) 0x43bfd0 MOV %RDX,0x120(%RSP) |
(829) 0x43bfd8 MOV %RAX,0xe0(%RSP) |
(829) 0x43bfe0 MOV %RAX,0x8(%RSP) |
(829) 0x43bfe5 MOV %RDI,0xf0(%RSP) |
(829) 0x43bfed MOV %RDI,0x30(%RSP) |
(829) 0x43bff2 MOV %R11,0x100(%RSP) |
(829) 0x43bffa MOV %R10,0xd0(%RSP) |
(829) 0x43c002 MOV %R9,0xd8(%RSP) |
(829) 0x43c00a MOV %RCX,0xe8(%RSP) |
(829) 0x43c012 MOV %RCX,%RAX |
(829) 0x43c015 MOV %R14,0xf8(%RSP) |
(829) 0x43c01d XOR %ECX,%ECX |
(829) 0x43c01f JMP 43c073 |
(830) 0x43c030 MOV 0x70(%RSP),%RSI |
(830) 0x43c035 LEA 0x1(%RSI),%RCX |
(830) 0x43c039 MOV 0x40(%RSP),%RDX |
(830) 0x43c03e ADD %RDX,%R14 |
(830) 0x43c041 ADD %RDX,%RAX |
(830) 0x43c044 ADD %RDX,%R9 |
(830) 0x43c047 ADD %RDX,%R10 |
(830) 0x43c04a MOV 0x50(%RSP),%RDX |
(830) 0x43c04f ADD %RDX,%R11 |
(830) 0x43c052 ADD %RDX,0x30(%RSP) |
(830) 0x43c057 ADD %RDX,0x8(%RSP) |
(830) 0x43c05c ADD %RDX,0x120(%RSP) |
(830) 0x43c064 CMP $0x3,%RSI |
(830) 0x43c068 MOV 0x60(%RSP),%RSI |
(830) 0x43c06d JE 43bf40 |
(830) 0x43c073 VMULSD 0x180(%RSP,%RCX,8),%XMM10,%XMM6 |
(830) 0x43c07c TEST %RBX,%RBX |
(830) 0x43c07f MOV %RCX,0x70(%RSP) |
(830) 0x43c084 JE 43c110 |
(830) 0x43c08a VBROADCASTSD %XMM6,%YMM7 |
(830) 0x43c08f XOR %ECX,%ECX |
(830) 0x43c091 MOV 0x110(%RSP),%RDX |
(830) 0x43c099 NOPL (%RAX) |
(831) 0x43c0a0 VMULPD 0x20(%R14,%RCX,8),%YMM1,%YMM8 |
(831) 0x43c0a7 VMULPD (%R14,%RCX,8),%YMM1,%YMM9 |
(831) 0x43c0ad VFMADD231PD (%R10,%RCX,8),%YMM2,%YMM9 |
(831) 0x43c0b3 VFMADD231PD 0x20(%R10,%RCX,8),%YMM2,%YMM8 |
(831) 0x43c0ba VFMADD231PD 0x20(%R9,%RCX,8),%YMM3,%YMM8 |
(831) 0x43c0c1 VFMADD231PD (%R9,%RCX,8),%YMM3,%YMM9 |
(831) 0x43c0c7 VFMADD231PD (%RAX,%RCX,8),%YMM4,%YMM9 |
(831) 0x43c0cd VFMADD231PD 0x20(%RAX,%RCX,8),%YMM4,%YMM8 |
(831) 0x43c0d4 VFMADD213PD 0x20(%R12,%RCX,8),%YMM7,%YMM8 |
(831) 0x43c0db VFMADD213PD (%R12,%RCX,8),%YMM7,%YMM9 |
(831) 0x43c0e1 VMOVUPD %YMM9,(%R12,%RCX,8) |
(831) 0x43c0e7 VMOVUPD %YMM8,0x20(%R12,%RCX,8) |
(831) 0x43c0ee ADD $0x8,%RCX |
(831) 0x43c0f2 CMP %RDX,%RCX |
(831) 0x43c0f5 JBE 43c0a0 |
(830) 0x43c0f7 MOV %RBX,%R15 |
(830) 0x43c0fa CMP %RBX,0x28(%RSP) |
(830) 0x43c0ff JE 43c030 |
(830) 0x43c105 JMP 43c113 |
(830) 0x43c110 XOR %R15D,%R15D |
(830) 0x43c113 LEA (%R15,%R11,1),%RCX |
(830) 0x43c117 MOV %R11,0x10(%RSP) |
(830) 0x43c11c MOV 0x118(%RSP),%R11 |
(830) 0x43c124 LEA (%R11,%RCX,8),%RDX |
(830) 0x43c128 MOV 0x28(%RSP),%RDI |
(830) 0x43c12d SUB %R15,%RDI |
(830) 0x43c130 LEA (%R12,%R15,8),%RCX |
(830) 0x43c134 MOV 0x30(%RSP),%RSI |
(830) 0x43c139 ADD %R15,%RSI |
(830) 0x43c13c LEA (%R11,%RSI,8),%RSI |
(830) 0x43c140 MOV %RBX,%R13 |
(830) 0x43c143 MOV 0x8(%RSP),%RBX |
(830) 0x43c148 ADD %R15,%RBX |
(830) 0x43c14b LEA (%R11,%RBX,8),%R8 |
(830) 0x43c14f MOV %R13,%RBX |
(830) 0x43c152 ADD 0x120(%RSP),%R15 |
(830) 0x43c15a LEA (%R11,%R15,8),%R15 |
(830) 0x43c15e MOV 0x10(%RSP),%R11 |
(830) 0x43c163 XOR %R13D,%R13D |
(830) 0x43c166 NOPW %CS:(%RAX,%RAX,1) |
(828) 0x43c170 VMOVSD (%R15,%R13,8),%XMM7 |
(828) 0x43c176 VMOVHPD (%R8,%R13,8),%XMM7,%XMM7 |
(828) 0x43c17c VMULPD %XMM7,%XMM14,%XMM7 |
(828) 0x43c180 VPERMILPD $0x1,%XMM7,%XMM5 |
(828) 0x43c186 VMOVSD (%RDX,%R13,8),%XMM0 |
(828) 0x43c18c VFMADD132SD %XMM13,%XMM7,%XMM0 |
(828) 0x43c191 VFMADD231SD (%RSI,%R13,8),%XMM11,%XMM5 |
(828) 0x43c197 VADDSD %XMM0,%XMM5,%XMM0 |
(828) 0x43c19b VFMADD213SD (%RCX,%R13,8),%XMM6,%XMM0 |
(828) 0x43c1a1 VMOVSD %XMM0,(%RCX,%R13,8) |
(828) 0x43c1a7 INC %R13 |
(828) 0x43c1aa CMP %R13,%RDI |
(828) 0x43c1ad JNE 43c170 |
(830) 0x43c1af JMP 43c030 |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorOps.h: 59 - 59 |
-------------------------------------------------------------------------------- |
59: for (unsigned d = 0; d < D; ++d) |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineEvalHelper.hpp: 47 - 49 |
-------------------------------------------------------------------------------- |
47: T sf = std::floor(x); |
48: T dx2 = x - sf; |
49: int ind2 = std::min(std::max(0, static_cast<int>(sf)), nmax); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 43 - 74 |
-------------------------------------------------------------------------------- |
43: y -= spline_m->y_grid.start; |
44: z -= spline_m->z_grid.start; |
45: T tx, ty, tz; |
46: int ix, iy, iz; |
47: spline2::getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1); |
48: spline2::getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1); |
49: spline2::getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1); |
[...] |
56: const intptr_t xs = spline_m->x_stride; |
57: const intptr_t ys = spline_m->y_stride; |
58: const intptr_t zs = spline_m->z_stride; |
[...] |
65: for (size_t i = 0; i < 4; i++) |
66: for (size_t j = 0; j < 4; j++) |
67: { |
68: const T pre00 = a[i] * b[j]; |
69: const T* restrict coefs = spline_m->coefs + (ix + i) * xs + (iy + j) * ys + iz * zs; |
70: #pragma omp simd aligned(coefs: QMC_SIMD_ALIGNMENT) simdlen(simdlen_) |
71: for (size_t n = 0; n < num_splines; n++) |
72: vals[n] += pre00 * |
73: (c[0] * coefs[n] + c[1] * coefs[n + zs] + c[2] * coefs[n + 2 * zs] + |
74: c[3] * coefs[n + 3 * zs]); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/einspline_spo_ref.hpp: 175 - 176 |
-------------------------------------------------------------------------------- |
175: for (int i = 0; i < nBlocks; ++i) |
176: MultiBsplineEvalRef::evaluate_v(einsplines[i], u[0], u[1], u[2], psi[i].data(), nSplinesPerBlock); |
/home/kcamus/qaas_runs/169-451-1869/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineData.hpp: 54 - 57 |
-------------------------------------------------------------------------------- |
54: a[0] = ((A00 * tx + A01) * tx + A02) * tx + A03; |
55: a[1] = ((A10 * tx + A11) * tx + A12) * tx + A13; |
56: a[2] = ((A20 * tx + A21) * tx + A22) * tx + A23; |
57: a[3] = ((A30 * tx + A31) * tx + A32) * tx + A33; |
/usr/lib64/gcc/x86_64-pc-linux-gnu/13.1.1/../../../../include/c++/13.1.1/bits/stl_algobase.h: 238 - 931 |
-------------------------------------------------------------------------------- |
238: if (__b < __a) |
[...] |
930: for (; __first != __last; ++__first) |
931: *__first = __tmp; |
/usr/lib64/gcc/x86_64-pc-linux-gnu/13.1.1/../../../../include/c++/13.1.1/bits/stl_vector.h: 1258 - 1258 |
-------------------------------------------------------------------------------- |
1258: { return _M_data_ptr(this->_M_impl._M_start); } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | miniqmcreference::DiracDetermi[...] | DiracDeterminantRef.cpp:194 | exec |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:214 | exec |
○ | main.extracted.104 | refwrap.h:347 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | miniqmc.cpp:404 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.35 |
CQA speedup if FP arith vectorized | 1.75 |
CQA speedup if fully vectorized | 6.77 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.40 |
Bottlenecks | micro-operation queue, |
Function | miniqmcreference::einspline_spo_ref |
Source | TinyVectorOps.h:59-59,MultiBsplineEvalHelper.hpp:47-49,MultiBsplineRef.hpp:43-74,einspline_spo_ref.hpp:175-176,MultiBsplineData.hpp:54-57,stl_algobase.h:238-931,stl_vector.h:1258-1258 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 42.50 |
CQA cycles if no scalar integer | 31.50 |
CQA cycles if FP arith vectorized | 24.28 |
CQA cycles if fully vectorized | 6.28 |
Front-end cycles | 42.50 |
DIV/SQRT cycles | 30.33 |
P0 cycles | 30.33 |
P1 cycles | 20.33 |
P2 cycles | 20.33 |
P3 cycles | 21.00 |
P4 cycles | 30.33 |
P5 cycles | 20.00 |
P6 cycles | 20.33 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 50.00 |
Stall cycles (UFS) | 7.97 |
Nb insns | 157.00 |
Nb uops | 170.00 |
Nb loads | 40.00 |
Nb stores | 20.00 |
Nb stack references | 26.00 |
FLOP/cycle | 1.44 |
Nb FLOP add-sub | 18.00 |
Nb FLOP mul | 19.00 |
Nb FLOP fma | 12.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 15.91 |
Bytes prefetched | 0.00 |
Bytes loaded | 436.00 |
Bytes stored | 240.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 41.53 |
Vectorization ratio load | 44.12 |
Vectorization ratio store | 40.00 |
Vectorization ratio mul | 31.25 |
Vectorization ratio add_sub | 40.00 |
Vectorization ratio fma | 33.33 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 44.12 |
Vector-efficiency ratio all | 17.80 |
Vector-efficiency ratio load | 17.83 |
Vector-efficiency ratio store | 18.75 |
Vector-efficiency ratio mul | 16.41 |
Vector-efficiency ratio add_sub | 17.08 |
Vector-efficiency ratio fma | 16.67 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.46 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.35 |
CQA speedup if FP arith vectorized | 1.75 |
CQA speedup if fully vectorized | 6.77 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.40 |
Bottlenecks | micro-operation queue, |
Function | miniqmcreference::einspline_spo_ref |
Source | TinyVectorOps.h:59-59,MultiBsplineEvalHelper.hpp:47-49,MultiBsplineRef.hpp:43-74,einspline_spo_ref.hpp:175-176,MultiBsplineData.hpp:54-57,stl_algobase.h:238-931,stl_vector.h:1258-1258 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 42.50 |
CQA cycles if no scalar integer | 31.50 |
CQA cycles if FP arith vectorized | 24.28 |
CQA cycles if fully vectorized | 6.28 |
Front-end cycles | 42.50 |
DIV/SQRT cycles | 30.33 |
P0 cycles | 30.33 |
P1 cycles | 20.33 |
P2 cycles | 20.33 |
P3 cycles | 21.00 |
P4 cycles | 30.33 |
P5 cycles | 20.00 |
P6 cycles | 20.33 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 50.00 |
Stall cycles (UFS) | 7.97 |
Nb insns | 157.00 |
Nb uops | 170.00 |
Nb loads | 40.00 |
Nb stores | 20.00 |
Nb stack references | 26.00 |
FLOP/cycle | 1.44 |
Nb FLOP add-sub | 18.00 |
Nb FLOP mul | 19.00 |
Nb FLOP fma | 12.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 15.91 |
Bytes prefetched | 0.00 |
Bytes loaded | 436.00 |
Bytes stored | 240.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 41.53 |
Vectorization ratio load | 44.12 |
Vectorization ratio store | 40.00 |
Vectorization ratio mul | 31.25 |
Vectorization ratio add_sub | 40.00 |
Vectorization ratio fma | 33.33 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 44.12 |
Vector-efficiency ratio all | 17.80 |
Vector-efficiency ratio load | 17.83 |
Vector-efficiency ratio store | 18.75 |
Vector-efficiency ratio mul | 16.41 |
Vector-efficiency ratio add_sub | 17.08 |
Vector-efficiency ratio fma | 16.67 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.46 |
Path / |
Function | miniqmcreference::einspline_spo_ref |
Source file and lines | MultiBsplineRef.hpp:43-74 |
Module | exec |
nb instructions | 157 |
nb uops | 170 |
loop length | 874 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 19 |
used ymm registers | 5 |
used zmm registers | 0 |
nb stack references | 26 |
ADD-SUB / MUL ratio | 0.87 |
micro-operation queue | 42.50 cycles |
front end | 42.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 30.33 | 30.33 | 20.33 | 20.33 | 21.00 | 30.33 | 20.00 | 20.33 |
cycles | 30.33 | 30.33 | 20.33 | 20.33 | 21.00 | 30.33 | 20.00 | 20.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 50.00 |
Stall cycles | 7.97 |
ROB full (events) | 5.56 |
RS full (events) | 7.78 |
Front-end | 42.50 |
Dispatch | 30.33 |
Overall L1 | 42.50 |
all | 36% |
load | 33% |
store | 25% |
mul | 100% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 33% |
all | 43% |
load | 50% |
store | 50% |
mul | 26% |
add-sub | 38% |
fma | 33% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 52% |
all | 41% |
load | 44% |
store | 40% |
mul | 31% |
add-sub | 40% |
fma | 33% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 44% |
all | 15% |
load | 14% |
store | 15% |
mul | 25% |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 13% |
all | 18% |
load | 19% |
store | 20% |
mul | 15% |
add-sub | 17% |
fma | 16% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
all | 17% |
load | 17% |
store | 18% |
mul | 16% |
add-sub | 17% |
fma | 16% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 17% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0xb8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
INC %RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP 0xa0(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JGE 43c1b4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV 0xb0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%RAX,%RDX,8),%R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x160(%RSP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD 0x78(%R14),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x98(%R14),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x88(%R14),%R15D | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
DEC %R15D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VROUNDSD $0x9,%XMM0,%XMM0,%XMM1 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VCVTTSD2SI %XMM1,%EAX | 2 | 1.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
SAR $0x1f,%ECX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
ANDN %EAX,%ECX,%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %EAX,%R15D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMOVGE %EAX,%R15D | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
MOV %RDX,0xb8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%RDX,%RDX,2),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0xa8(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%RCX,%RAX,8),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM1,%XMM0,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x5a3b5(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM11,%XMM8,%XMM16 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD 0x5a40d(%RIP),%XMM13,%XMM16 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM8,%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM13,%XMM13,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM0,%XMM8,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKLPD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPD 0x5d3b7(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKLPD %XMM13,%XMM15,%XMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD213PD 0x5d3b9(%RIP),%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x28(%R14),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x48(%R14),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVHPD 0x50(%R14),%XMM0,%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 4 | 1 |
VMOVHPD 0x70(%R14),%XMM1,%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 4 | 1 |
VMOVUPD 0x170(%RSP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBPD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %XMM0,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%XMM0,%XMM17 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMOVD 0x38(%R14),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBPD %XMM17,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPINSRD $0x1,0x60(%R14),%XMM1,%XMM18 | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
VPERMILPD $0x1,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULSD %XMM0,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM2,%XMM8,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %XMM0,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM0,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVUPD 0x150(%RSP),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VBLENDPD $0x1,%XMM3,%XMM10,%XMM3 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULPD %XMM3,%XMM5,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x5d354(%RIP),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VADDPD %XMM3,%XMM9,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKLPD %XMM4,%XMM0,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVUPD 0x5d354(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD213PD %XMM7,%XMM5,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM6,0x1a0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VPERMILPD $0x1,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBSD %XMM3,%XMM8,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM8,%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM12,%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM3,0x1b0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM4,%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,0x1b8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM1,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM2,%XMM8,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x3,%XMM0,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VBLENDPD $0x1,%XMM3,%XMM10,%XMM3 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULPD %XMM3,%XMM5,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD %XMM3,%XMM9,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSHUFPD $0x1,%XMM4,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD213PD %XMM7,%XMM5,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM0,0x180(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VPERMILPD $0x1,%XMM3,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBSD %XMM0,%XMM8,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM8,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM12,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,0x190(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM4,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,0x198(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVDQU 0x10(%R14),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x20(%R14),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMPL $0,0x20(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JE 43be55 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %R12,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x98(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD %XMM13,0x70(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD %YMM14,0x120(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM11,0x8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD %XMM15,0x30(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM16,0x10(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD %XMM17,0x60(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVDQU64 %XMM18,0x50(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 |
VMOVDQU %XMM2,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 |
VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CALL 4879b0 <_intel_fast_memset> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
VMOVDQU 0x40(%RSP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDQU64 0x50(%RSP),%XMM18 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x60(%RSP),%XMM17 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%RSP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x30(%RSP),%XMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x8(%RSP),%XMM11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x120(%RSP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x70(%RSP),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x5d02b(%RIP),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMPL $0,0x20(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JE 43bbd0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VFMADD213SD %XMM12,%XMM16,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPCMPEQD %XMM0,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDD %XMM0,%XMM18,%XMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VCVTTPD2DQ %XMM17,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 |
VPMAXSD 0x5a981(%RIP),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPMINSD %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOVSXD %R15D,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R13,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
IMUL %RDX,%R11 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPMULLQ %XMM0,%XMM2,%XMM1 | 3 | 1.50 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
MOV 0x8(%R14),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD %XMM15,%XMM11,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPEXTRQ $0x1,%XMM1,%RCX | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVQ %XMM1,%RSI | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVQ %XMM2,%RCX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPEXTRQ $0x1,%XMM2,%RDI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM13,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM14,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VPERMPD $0x55,%YMM14,%YMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM11,%YMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
ADD %RSI,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%R8,%R11,8),%R14 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,0xc8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (,%RCX,8),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,0xc0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RDI,0x50(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (,%RDI,8),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA 0x3(%RDX),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %R13,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%R8,%RDI,8),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA 0x2(%RDX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %R13,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %RSI,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
INC %RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
IMUL %R13,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%R8,%RAX,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R8,0x118(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R8,%RDX,8),%R10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 43bfb9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
Function | miniqmcreference::einspline_spo_ref |
Source file and lines | MultiBsplineRef.hpp:43-74 |
Module | exec |
nb instructions | 157 |
nb uops | 170 |
loop length | 874 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 19 |
used ymm registers | 5 |
used zmm registers | 0 |
nb stack references | 26 |
ADD-SUB / MUL ratio | 0.87 |
micro-operation queue | 42.50 cycles |
front end | 42.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 30.33 | 30.33 | 20.33 | 20.33 | 21.00 | 30.33 | 20.00 | 20.33 |
cycles | 30.33 | 30.33 | 20.33 | 20.33 | 21.00 | 30.33 | 20.00 | 20.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 50.00 |
Stall cycles | 7.97 |
ROB full (events) | 5.56 |
RS full (events) | 7.78 |
Front-end | 42.50 |
Dispatch | 30.33 |
Overall L1 | 42.50 |
all | 36% |
load | 33% |
store | 25% |
mul | 100% |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 33% |
all | 43% |
load | 50% |
store | 50% |
mul | 26% |
add-sub | 38% |
fma | 33% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 52% |
all | 41% |
load | 44% |
store | 40% |
mul | 31% |
add-sub | 40% |
fma | 33% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 44% |
all | 15% |
load | 14% |
store | 15% |
mul | 25% |
add-sub | 15% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 13% |
all | 18% |
load | 19% |
store | 20% |
mul | 15% |
add-sub | 17% |
fma | 16% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 20% |
all | 17% |
load | 17% |
store | 18% |
mul | 16% |
add-sub | 17% |
fma | 16% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 17% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0xb8(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
INC %RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP 0xa0(%RSP),%RDX | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JGE 43c1b4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV 0xb0(%RSP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%RAX,%RDX,8),%R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x160(%RSP),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD 0x78(%R14),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x98(%R14),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x88(%R14),%R15D | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
DEC %R15D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VROUNDSD $0x9,%XMM0,%XMM0,%XMM1 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VCVTTSD2SI %XMM1,%EAX | 2 | 1.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
SAR $0x1f,%ECX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
ANDN %EAX,%ECX,%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %EAX,%R15D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMOVGE %EAX,%R15D | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
MOV %RDX,0xb8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%RDX,%RDX,2),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0xa8(%RSP),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV (%RCX,%RAX,8),%R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM1,%XMM0,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM12,%XMM13,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x5a3b5(%RIP),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBSD %XMM11,%XMM8,%XMM16 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD 0x5a40d(%RIP),%XMM13,%XMM16 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM8,%XMM13,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM13,%XMM13,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM0,%XMM8,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM13,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKLPD %XMM1,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPD 0x5d3b7(%RIP),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKLPD %XMM13,%XMM15,%XMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD213PD 0x5d3b9(%RIP),%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x28(%R14),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x48(%R14),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVHPD 0x50(%R14),%XMM0,%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 4 | 1 |
VMOVHPD 0x70(%R14),%XMM1,%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 4 | 1 |
VMOVUPD 0x170(%RSP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBPD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %XMM0,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALEPD $0x9,%XMM0,%XMM17 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMOVD 0x38(%R14),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VSUBPD %XMM17,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPINSRD $0x1,0x60(%R14),%XMM1,%XMM18 | 2 | 0 | 0 | 0.50 | 0.50 | 0 | 1 | 0 | 0 | 3 | 1 |
VPERMILPD $0x1,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULSD %XMM0,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM2,%XMM8,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %XMM0,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM0,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVUPD 0x150(%RSP),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VBLENDPD $0x1,%XMM3,%XMM10,%XMM3 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULPD %XMM3,%XMM5,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x5d354(%RIP),%XMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VADDPD %XMM3,%XMM9,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKLPD %XMM4,%XMM0,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVUPD 0x5d354(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD213PD %XMM7,%XMM5,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM6,0x1a0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VPERMILPD $0x1,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBSD %XMM3,%XMM8,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM8,%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM12,%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM3,0x1b0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM4,%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM2,0x1b8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM1,%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM2,%XMM8,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x3,%XMM0,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VBLENDPD $0x1,%XMM3,%XMM10,%XMM3 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VMULPD %XMM3,%XMM5,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD %XMM3,%XMM9,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSHUFPD $0x1,%XMM4,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD213PD %XMM7,%XMM5,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %XMM0,0x180(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VPERMILPD $0x1,%XMM3,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBSD %XMM0,%XMM8,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM8,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213SD %XMM12,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,0x190(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD %XMM4,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM0,0x198(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVDQU 0x10(%R14),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV 0x20(%R14),%R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMPL $0,0x20(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JE 43be55 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %R12,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x98(%RSP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD %XMM13,0x70(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD %YMM14,0x120(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM11,0x8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD %XMM15,0x30(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM16,0x10(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVUPD %XMM17,0x60(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVDQU64 %XMM18,0x50(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 |
VMOVDQU %XMM2,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 4 | 1 |
VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
CALL 4879b0 <_intel_fast_memset> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
VMOVDQU 0x40(%RSP),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDQU64 0x50(%RSP),%XMM18 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x60(%RSP),%XMM17 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%RSP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x30(%RSP),%XMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x8(%RSP),%XMM11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD 0x120(%RSP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VMOVUPD 0x70(%RSP),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x5d02b(%RIP),%XMM12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMPL $0,0x20(%RSP) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JE 43bbd0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VFMADD213SD %XMM12,%XMM16,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPCMPEQD %XMM0,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDD %XMM0,%XMM18,%XMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 |
VCVTTPD2DQ %XMM17,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 1 |
VPMAXSD 0x5a981(%RIP),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPMINSD %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MOVSXD %R15D,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R13,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
IMUL %RDX,%R11 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPMULLQ %XMM0,%XMM2,%XMM1 | 3 | 1.50 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
MOV 0x8(%R14),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMULSD %XMM15,%XMM11,%XMM11 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPEXTRQ $0x1,%XMM1,%RCX | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVQ %XMM1,%RSI | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD %RCX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVQ %XMM2,%RCX | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
VPEXTRQ $0x1,%XMM2,%RDI | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM13,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM14,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VPERMPD $0x55,%YMM14,%YMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSD %XMM11,%YMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
ADD %RSI,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%R8,%R11,8),%R14 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,0xc8(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (,%RCX,8),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,0xc0(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RDI,0x50(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (,%RDI,8),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,0x40(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA 0x3(%RDX),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %R13,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %RSI,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%R8,%RDI,8),%RCX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA 0x2(%RDX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
IMUL %R13,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %RSI,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
INC %RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
IMUL %R13,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%R8,%RAX,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
ADD %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R8,0x118(%RSP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
LEA (%R8,%RDX,8),%R10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 43bfb9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |