1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Run the 'perf' tool and associated reports on OSH.
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/perf.sh <function name>
|
7 | #
|
8 | # Deps:
|
9 | #
|
10 | # Clone https://github.com/brendangregg/FlameGraph
|
11 | # Put it in ~/git/other/FlameGraph, or edit the paths below
|
12 | #
|
13 | # Examples:
|
14 | #
|
15 | # $0 install # install perf, including matching kernel symbols
|
16 | #
|
17 | # $0 profile-osh-parse # make flame graph
|
18 | #
|
19 | # Then look at _tmp/perf/osh-parse.svg in the browser
|
20 |
|
21 | # $0 profile-osh-parse flat # make flat text report
|
22 | #
|
23 | # perf report -i _tmp/perf/osh-parse.perf # interactive
|
24 | #
|
25 | # Likewise for
|
26 | #
|
27 | # $0 profile-example escape
|
28 | # => _tmp/perf/example-escape.svg
|
29 | # $0 profile-example escape flat
|
30 | # => _tmp/perf/example-escape.report.txt
|
31 |
|
32 | set -o nounset
|
33 | set -o pipefail
|
34 | set -o errexit
|
35 |
|
36 | REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
|
37 |
|
38 | source $REPO_ROOT/test/common.sh # $OSH
|
39 |
|
40 | # $REPO_ROOT needed since CPython configure changes dirs
|
41 | readonly BASE_DIR=$REPO_ROOT/_tmp/perf
|
42 |
|
43 | # TODO:
|
44 | # - kernel symbols. Is that why there are a lot of [unknown] in opt mode?
|
45 | # - grep for call_function in collapsed. I don't see it?
|
46 | # - it's inlined I guess?
|
47 |
|
48 | # Question: PyEval_EvalFrameEx doesn't appear recursive in opt mode? At least
|
49 | # according to 'perf'. Or maybe you don't have enough samples to catch it?
|
50 |
|
51 | # NOTES:
|
52 | # - dbg vs. opt matters a lot
|
53 | # - function-level performance categorization is bad for bytecode interpreters,
|
54 | # which have a single function and a big switch statement.
|
55 | # - a longer file like configure-coreutils hit garbage collection! collect()
|
56 | # - reference counting functions: visit_decref, visit_reachable
|
57 |
|
58 | install-ubuntu-packages() {
|
59 | # linux-tools-generic is the kernel module
|
60 | # Apparently you need a package specific to the kernel, not sure why.
|
61 | sudo apt-get install \
|
62 | linux-tools-common linux-tools-$(uname -r) linux-tools-generic
|
63 | }
|
64 |
|
65 | install-debian-packages() {
|
66 | sudo apt-get install linux-perf
|
67 | }
|
68 |
|
69 | soil-install() {
|
70 | sudo apt-get update # seem to need this
|
71 |
|
72 | install-ubuntu-packages
|
73 | }
|
74 |
|
75 | debug-symbols() {
|
76 | #dpkg --listfiles linux-tools-4.13.0-36-generic
|
77 | #sudo apt install python-dbg
|
78 |
|
79 | # I don't see symbols files here? Just the interpreter? They're built into the ELF file?
|
80 | #dpkg --listfiles python-dbg
|
81 |
|
82 | # has files in /usr/lib/debug
|
83 | # file /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug
|
84 | # /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, BuildID[sha1]=8d9bd4ce26e45ef16075c67d5f5eeafd8b562832, not stripped
|
85 | #
|
86 | # https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html
|
87 |
|
88 | # Does perf also support separate debug files?
|
89 | # How do I set the debug link in oil.ovm? Or should I set build ID?
|
90 |
|
91 | # The GNU binary utilities (Binutils) package includes the ‘objcopy’ utility
|
92 | # that can produce the separated executable / debugging information file
|
93 | # pairs using the following commands:
|
94 | # objcopy --only-keep-debug foo foo.debug
|
95 | # strip -g foo
|
96 |
|
97 | sudo apt install zlib1g-dbg
|
98 | dpkg --listfiles zlib1g-dbg
|
99 | #libpython2.7-dbg
|
100 | }
|
101 |
|
102 | # TODO: Link these two tools in ../oil_DEPS/bin or something
|
103 | # Make them work on CI
|
104 |
|
105 | # NOTE: I used this before with python-flamegraph too.
|
106 | flamegraph() {
|
107 | ~/git/other/FlameGraph/flamegraph.pl "$@"
|
108 | }
|
109 |
|
110 | stackcollapse-perf() {
|
111 | ~/git/other/FlameGraph/stackcollapse-perf.pl "$@"
|
112 | }
|
113 |
|
114 | # http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html
|
115 | make-graph() {
|
116 | local name=${1:-osh-parse}
|
117 |
|
118 | local dir=$BASE_DIR
|
119 | perf script -i $dir/$name.perf | stackcollapse-perf > $dir/$name.perf-folded
|
120 |
|
121 | flamegraph $dir/$name.perf-folded > $dir/$name.svg
|
122 |
|
123 | echo "Wrote $dir/$name.svg"
|
124 | }
|
125 |
|
126 | _make-readable() {
|
127 | local perf_raw=$1
|
128 |
|
129 | chmod 644 $perf_raw
|
130 |
|
131 | # original user
|
132 | chown $USER $perf_raw
|
133 | }
|
134 |
|
135 | make-readable() {
|
136 | # This gets run as root
|
137 | local name=$1
|
138 |
|
139 | local perf_raw=$BASE_DIR/$name.perf
|
140 |
|
141 | sudo $0 _make-readable $perf_raw
|
142 |
|
143 | file $perf_raw
|
144 | ls -l $perf_raw
|
145 | }
|
146 |
|
147 | _record-cpp() {
|
148 | local name=$1 # e.g. oils-for-unix, escape
|
149 | local mode=$2
|
150 | shift 2
|
151 |
|
152 | # Can repeat 13 times without blowing heap
|
153 | #export REPEAT=13
|
154 |
|
155 | local freq=10000
|
156 |
|
157 | local extra_flags=''
|
158 | case $mode in
|
159 | (graph) extra_flags='-g' ;; # needed to make flame graph
|
160 | (flat) extra_flags='' ;;
|
161 | (*) die "Mode should be graph or flat, got $mode" ;;
|
162 | esac
|
163 |
|
164 | time perf record $extra_flags -F $freq -o $BASE_DIR/$name.perf -- "$@"
|
165 |
|
166 | make-readable $name
|
167 | }
|
168 |
|
169 | profile-cpp() {
|
170 | local name=$1
|
171 | local mode=$2
|
172 | shift
|
173 |
|
174 | mkdir -p $BASE_DIR
|
175 |
|
176 | # -E preserve environment like BENCHMARK=1
|
177 | sudo -E $REPO_ROOT/$0 _record-cpp $name "$@";
|
178 |
|
179 | case $mode in
|
180 | (graph)
|
181 | make-graph $name
|
182 | ;;
|
183 | (flat)
|
184 | local out=$BASE_DIR/$name.report.txt
|
185 | text-report $name | tee $out
|
186 | echo "Wrote $out"
|
187 | ;;
|
188 | (*)
|
189 | die "Mode should be graph or flat, got $mode"
|
190 | ;;
|
191 | esac
|
192 | }
|
193 |
|
194 | profile-osh-parse() {
|
195 | # Profile parsing a big file. More than half the time is in malloc
|
196 | # (_int_malloc in GCC), which is not surprising!
|
197 |
|
198 | local osh=${1:-_bin/cxx-opt/osh}
|
199 | local mode=${2:-graph}
|
200 |
|
201 | #local file=benchmarks/testdata/configure
|
202 | local file=benchmarks/testdata/configure-coreutils
|
203 |
|
204 | local -a cmd=( $osh --ast-format none -n $file )
|
205 | profile-cpp 'osh-parse' $mode "${cmd[@]}"
|
206 |
|
207 | # 'perf list' shows the events
|
208 | #OILS_GC_STATS=1 sudo perf stat -e cache-misses -e cache-references "${cmd[@]}"
|
209 | OILS_GC_STATS=1 sudo perf stat "${cmd[@]}"
|
210 |
|
211 | # Run again with GC stats
|
212 | time OILS_GC_STATS=1 "${cmd[@]}"
|
213 | }
|
214 |
|
215 | profile-fib() {
|
216 | local osh=${1:-_bin/cxx-opt/osh}
|
217 | local mode=${2:-graph}
|
218 |
|
219 | # Same iterations as benchmarks/gc
|
220 | local -a cmd=( $osh benchmarks/compute/fib.sh 500 44 )
|
221 |
|
222 | profile-cpp 'fib' $mode "${cmd[@]}"
|
223 | }
|
224 |
|
225 | # Hm this is dominated by GC, not regex?
|
226 | profile-parse-help() {
|
227 | local osh=${1:-_bin/cxx-opt/osh}
|
228 | local mode=${2:-flat}
|
229 |
|
230 | local -a cmd=( $osh benchmarks/parse-help/pure-excerpt.sh parse_help_file benchmarks/parse-help/clang.txt )
|
231 |
|
232 | profile-cpp 'parse-help' $mode "${cmd[@]}"
|
233 | }
|
234 |
|
235 | profile-example() {
|
236 | local example=${1:-escape}
|
237 | local mode=${2:-graph}
|
238 |
|
239 | local bin="_bin/cxx-opt/mycpp/examples/$example.mycpp"
|
240 |
|
241 | ninja $bin
|
242 | echo
|
243 |
|
244 | BENCHMARK=1 profile-cpp "example-$example" $mode $bin
|
245 | }
|
246 |
|
247 | profile-hash-table() {
|
248 | local mode=${1:-graph}
|
249 |
|
250 | local bin='_bin/cxx-opt/mycpp/hash_table'
|
251 | ninja $bin
|
252 | profile-cpp 'hash_table' $mode $bin -t hash_speed_test
|
253 | }
|
254 |
|
255 | # Perf note: Without -o, for some reason osh output is shown on the console.
|
256 | # It doesn't go to wc?
|
257 | #perf record -o perf.data -- _bin/osh -n benchmarks/testdata/abuild | wc -l
|
258 |
|
259 | text-report() {
|
260 | ### Show a batch report; 'perf report' is interactive
|
261 |
|
262 | local name=$1
|
263 | shift
|
264 |
|
265 | local perf_raw=$BASE_DIR/$name.perf
|
266 |
|
267 | # Flat report
|
268 | perf report -i $perf_raw -n --stdio "$@"
|
269 | }
|
270 |
|
271 | # Shows instruction counts, branch misses, and so forth
|
272 | #
|
273 | # Wow 11 billion instructions! 9 billion cycles. 2.3 billion branches. Crazy.
|
274 | # Only 21M branch misses, or 0.9%. Interesting.
|
275 | _stat() {
|
276 | perf stat -- "$@" | wc -l
|
277 | # -e cache-misses only shows that stat
|
278 | }
|
279 | stat() { sudo $0 _stat "$@"; }
|
280 |
|
281 | stat-osh-parse() {
|
282 | stat _bin/cxx-opt/oils-for-unix --ast-format none -n benchmarks/testdata/configure
|
283 | }
|
284 |
|
285 |
|
286 | #
|
287 | # OLD OVM stuff
|
288 | #
|
289 |
|
290 | # Parsing abuild in Debug mode:
|
291 | # 14% PyEval_EvalFrameEx -- hm. Interpreter dispatch is a lot? More than I
|
292 | # thought. Maybe need my own loop.
|
293 | # 8% type_call -- hm introspection?
|
294 | # 7% PyObject_GetAttr. My intitution. Should be done at compile time!
|
295 | # 6% do_richcmp -- hm interesting
|
296 | # 5% PyObject_Malloc.
|
297 |
|
298 | # More benchmarks:
|
299 | # OPy running under OVM, compiling itself, compiling Oil, compiling OPy ports,
|
300 | # etc.
|
301 |
|
302 | # Parsing abuild, the optimized version.
|
303 | #
|
304 | # 80% PyEval_EvalFramEx. Woah everything is inlined?
|
305 | # 12.5% PyObject_GenericGetAtr. PyObject_GetAttr is much lower down.
|
306 | # Some kernel.
|
307 | # 0.76% lookdict_string is not a bottleneck. Hm.
|
308 | #
|
309 | # Wow.
|
310 | # Maybe I need counters in optimized mode?
|
311 | # Yeah what I really want is per opcode total!
|
312 |
|
313 | _record() {
|
314 |
|
315 | # TODO: The optimized build should have symbols! Don't build with -s. And
|
316 | # then put symbols next to the binary somehow? How do the symbols packages
|
317 | # work?
|
318 | #perf record -o perf.data -- _bin/oil.ovm-dbg osh -n benchmarks/testdata/abuild | wc -l
|
319 |
|
320 | # call graph recording. This helps it be less "flat" in opt mode. Otherwise
|
321 | # everything is PyEval_EvalFrameEx.
|
322 | local flag='-g'
|
323 | local bin=_bin/oil.ovm-opt
|
324 | #local bin=_bin/oil.ovm-dbg # This shows more details
|
325 |
|
326 | local freq=1000 # 1000 Hz
|
327 |
|
328 | #local file=benchmarks/testdata/abuild # small file
|
329 |
|
330 | local file=benchmarks/testdata/configure-coreutils # big file
|
331 |
|
332 | time perf record $flag -F $freq -o perf.data -- $bin osh --ast-format none -n $file
|
333 | #perf record -o perf.data -- _bin/osh --ast-format none -n benchmarks/testdata/abuild
|
334 | }
|
335 | record() { sudo $0 _record; }
|
336 |
|
337 | #
|
338 | # Soil CI
|
339 | #
|
340 |
|
341 | build-stress-test() {
|
342 |
|
343 | # Special _OIL_DEV for -D GC_TIMING
|
344 | _OIL_DEV=1 ./configure --without-readline
|
345 |
|
346 | mkdir -p _tmp
|
347 | c++ -D MARK_SWEEP -I . \
|
348 | -O2 -g \
|
349 | -o _tmp/gc_stress_test \
|
350 | mycpp/gc_stress_test.cc \
|
351 | mycpp/mark_sweep_heap.cc \
|
352 | mycpp/gc_builtins.cc \
|
353 | mycpp/gc_mylib.cc \
|
354 | mycpp/gc_str.cc \
|
355 | mycpp/hash.cc \
|
356 | -lstdc++
|
357 | }
|
358 |
|
359 | profile-stress-test() {
|
360 | profile-cpp 'gc_stress_test' flat \
|
361 | _tmp/gc_stress_test
|
362 | }
|
363 |
|
364 | print-index() {
|
365 | echo '<body style="margin: 0 auto; width: 40em; font-size: large">'
|
366 | echo '<h1>Perf Profiles</h1>'
|
367 |
|
368 | for path in $BASE_DIR/*.txt; do
|
369 | local filename=$(basename $path)
|
370 | echo "<a href="$filename">$filename</a> <br/>"
|
371 | done
|
372 |
|
373 | echo '</body>'
|
374 | }
|
375 |
|
376 | # TODO: fetch the tarball from the cpp-small CI task
|
377 |
|
378 | build-tar() {
|
379 | local tar=${1:-_release/oils-for-unix.tar}
|
380 |
|
381 | tar=$PWD/$tar
|
382 |
|
383 | local tmp=$BASE_DIR/tar
|
384 | mkdir -p $tmp
|
385 |
|
386 | pushd $tmp
|
387 |
|
388 | tar --extract < $tar
|
389 | cd oils-for-unix-* # glob of 1
|
390 |
|
391 | ./configure
|
392 |
|
393 | # TODO: add bumproot
|
394 | for variant in opt+bumpleak opt; do
|
395 | echo
|
396 |
|
397 | time _build/oils.sh '' $variant
|
398 | echo
|
399 |
|
400 | _bin/cxx-$variant-sh/osh -c 'echo "hi from $0"'
|
401 | done
|
402 |
|
403 | # TODO:
|
404 | # - profile each executable
|
405 | # - add OILS_GC_THRESHOLD=$big to avoid GC
|
406 |
|
407 | popd
|
408 | }
|
409 |
|
410 | profile-cpython-configure() {
|
411 | ### borrowed from benchmarks/osh-runtime.sh
|
412 |
|
413 | local osh=${1:-$REPO_ROOT/_bin/cxx-opt/osh}
|
414 | local mode=${2:-flat}
|
415 |
|
416 | local dir=$BASE_DIR/cpython-configure
|
417 |
|
418 | # Fails because perf has to run as root
|
419 | rm -r -f -v $dir || true
|
420 |
|
421 | mkdir -p $dir
|
422 |
|
423 | local -a cmd=( $osh $REPO_ROOT/Python-2.7.13/configure )
|
424 |
|
425 | pushd $dir
|
426 | profile-cpp 'cpython-configure' $mode "${cmd[@]}"
|
427 | popd
|
428 | }
|
429 |
|
430 | cpython-report() {
|
431 | #perf report -i $BASE_DIR/cpython-configure.perf
|
432 |
|
433 | # oils-for-unix is only 4.89% of time? That's #5
|
434 | # 48% in kernel
|
435 | # 23% in cc1
|
436 | #
|
437 | # That means we're still a bit slow
|
438 |
|
439 | # TODO: I want to change OVERALL percentages
|
440 | #
|
441 | # GC is 1.6% and let's say rooting is 3%. That's 300 ms out of 10s
|
442 | # GC can account for the whole thing
|
443 | # I wonder if we can do GC while waiting for processes? They might be tiny
|
444 | # processes though
|
445 |
|
446 | perf report -i $BASE_DIR/cpython-configure.perf \
|
447 | -n --dso=oils-for-unix --percentage=relative
|
448 |
|
449 | #perf report -i $BASE_DIR/cpython-configure.perf --sort=dso
|
450 | }
|
451 |
|
452 | local-test() {
|
453 | local osh=_bin/cxx-opt/osh
|
454 | ninja $osh
|
455 | profile-fib $REPO_ROOT/$osh flat
|
456 | }
|
457 |
|
458 | soil-run() {
|
459 | echo 'TODO run benchmarks/gc tasks'
|
460 | # But we don't have Ninja
|
461 | # Fetch the tarball?
|
462 |
|
463 | # Can you WAIT for the tarball?
|
464 | # You can wait for the cpp-small task that builds it? Ah hacky hacky
|
465 |
|
466 | build-stress-test
|
467 |
|
468 | profile-stress-test
|
469 |
|
470 | export-osh-cpp _tmp/native-tar-test opt
|
471 | #export-osh-cpp '' opt
|
472 |
|
473 | profile-fib $OSH flat
|
474 | profile-osh-parse $OSH flat
|
475 | profile-parse-help $OSH flat
|
476 |
|
477 | print-index > $BASE_DIR/index.html
|
478 |
|
479 | echo "Wrote $BASE_DIR/index.html"
|
480 | }
|
481 |
|
482 | "$@"
|