| | | | | | | requested parallelism | walltime sum (s) | nb instances | any sync average per thread time (s) | any wait average per thread time (s) | parallelism overhead (%) | local speedup if perfectly balanced | global speedup if perfectly balanced |
start addr | function name | source location | level | ancestor thread num | invoker | parallel or teams | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 | 1x6 | 1x72 | 1x96 | 1x120 | 1x128 | 1x144 | 1x168 | 1x192 |
libggml-cpu.so:0x30d9c | ggml_graph_compute | ggml-cpu.c:3148 | 0 | 0 | runtime | parallel | 6 | 72 | 96 | 120 | 128 | 144 | 168 | 192 | 23.825 | 7.419 | 7.094 | 6.674 | 6.620 | 6.815 | 6.086 | 5.056 | 132 | 132 | 132 | 132 | 132 | 132 | 132 | 132 | 3.243 | 2.940 | 3.062 | 2.818 | 2.979 | 3.159 | 2.801 | 2.424 | 3.240 | 2.939 | 3.059 | 2.815 | 2.977 | 3.156 | 2.799 | 2.422 | 13.6 | 39.6 | 43.2 | 42.2 | 45.0 | 46.4 | 46.0 | 47.9 | 1.158 | 1.657 | 1.760 | 1.731 | 1.818 | 1.864 | 1.853 | 1.921 | 1.141 | 1.424 | 1.462 | 1.440 | 1.466 | 1.487 | 1.463 | 1.520 |
(null):(nil) | | :0 | 0 | 0 | runtime | parallel | 6 | 72 | 96 | 120 | 128 | 144 | 168 | 192 | 1.328 | 0.406 | 0.420 | 0.364 | 0.393 | 0.353 | 0.374 | 0.166 | 4.10 E3 | 4.10 E3 | 4.10 E3 | 4.10 E3 | 4.10 E3 | 4.10 E3 | 4.10 E3 | 4.10 E3 | 0.367 | 0.188 | 0.129 | 0.112 | 0.109 | 98.9 E-3 | 0.132 | 48.9 E-3 | 0.366 | 0.188 | 0.128 | 0.111 | 0.108 | 97.8 E-3 | 0.131 | 48.3 E-3 | 27.6 | 46.4 | 30.6 | 30.8 | 27.7 | 28.1 | 35.4 | 29.5 | 1.381 | 1.864 | 1.442 | 1.446 | 1.383 | 1.390 | 1.548 | 1.418 | 1.014 | 1.019 | 1.013 | 1.012 | 1.012 | 1.010 | 1.015 | 1.007 |
libggml-cpu.so:0x64e9f | ggml_backend_amx_convert_weight(ggml_tensor*, void const*, u... | mmq.cpp:2337 | 0 | 0 | runtime | parallel | 6 | 72 | 96 | 120 | 128 | 144 | 168 | 192 | 0.421 | 0.140 | 0.148 | 0.150 | 0.156 | 0.161 | 0.165 | 0.176 | 225 | 225 | 225 | 225 | 225 | 225 | 225 | 225 | 62.1 E-3 | 46.9 E-3 | 54.9 E-3 | 61.5 E-3 | 53.0 E-3 | 65.7 E-3 | 73.1 E-3 | 77.5 E-3 | 62.1 E-3 | 46.9 E-3 | 54.9 E-3 | 61.5 E-3 | 53.0 E-3 | 65.7 E-3 | 73.1 E-3 | 77.5 E-3 | 14.7 | 33.4 | 37.2 | 41.0 | 33.9 | 40.8 | 44.3 | 44.2 | 1.173 | 1.503 | 1.592 | 1.694 | 1.513 | 1.690 | 1.796 | 1.791 | 1.002 | 1.005 | 1.006 | 1.007 | 1.006 | 1.007 | 1.008 | 1.011 |