#!/usr/bin/env bash
#
# Run the 'perf' tool and associated reports on OSH.
#
# Usage:
#   benchmarks/perf.sh <function name>
#
# Deps:
#
#   Clone https://github.com/brendangregg/FlameGraph
#   Put it in ~/git/other/FlameGraph, or edit the paths below.
#
# Examples:
#
#   $0 install-ubuntu-packages    # install perf, including matching kernel symbols
#
#   $0 profile-osh-parse          # make a flame graph
#
#     Then look at _tmp/perf/osh-parse.svg in the browser.
#
#   $0 profile-osh-parse flat     # make a flat text report
#
#   perf report -i _tmp/perf/osh-parse.perf   # interactive report
#
# Likewise:
#
#   $0 profile-example escape
#     => _tmp/perf/example-escape.svg
#   $0 profile-example escape flat
#     => _tmp/perf/example-escape.report.txt
set -o nounset
set -o pipefail
set -o errexit

REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)

source $REPO_ROOT/test/common.sh  # $OSH

# $REPO_ROOT is needed since CPython's configure changes directories
readonly BASE_DIR=$REPO_ROOT/_tmp/perf
# TODO:
# - kernel symbols.  Is that why there are a lot of [unknown] entries in opt mode?
# - grep for call_function in the collapsed output.  I don't see it?
#   - it's inlined, I guess?

# Question: PyEval_EvalFrameEx doesn't appear recursive in opt mode?  At least
# according to 'perf'.  Or maybe there aren't enough samples to catch it?

# NOTES:
# - dbg vs. opt matters a lot
# - function-level performance categorization is bad for bytecode interpreters,
#   which have a single function and a big switch statement
# - a longer file like configure-coreutils hits garbage collection!  collect()
# - reference counting functions: visit_decref, visit_reachable
install-ubuntu-packages() {
  # linux-tools-generic is the kernel-side package.
  # Apparently you need a package matching the running kernel; not sure why.
  sudo apt-get install \
    linux-tools-common linux-tools-$(uname -r) linux-tools-generic
}

install-debian-packages() {
  sudo apt-get install linux-perf
}

soil-install() {
  sudo apt-get update  # seems to be needed

  install-ubuntu-packages
}
debug-symbols() {
  #dpkg --listfiles linux-tools-4.13.0-36-generic
  #sudo apt install python-dbg

  # I don't see separate symbol files here, just the interpreter -- are the
  # symbols built into the ELF file?
  #dpkg --listfiles python-dbg

  # zlib1g-dbg (installed below) has files in /usr/lib/debug:
  #
  #   file /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug
  #   /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug:
  #     ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked,
  #     BuildID[sha1]=8d9bd4ce26e45ef16075c67d5f5eeafd8b562832, not stripped
  #
  # https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html

  # Does perf also support separate debug files?
  # How do I set the debug link in oil.ovm?  Or should I set a build ID?
  # (See the hedged split-debug-example sketch below.)

  # The GNU binary utilities (Binutils) package includes the 'objcopy' utility,
  # which can produce the separated executable / debug info file pairs with:
  #
  #   objcopy --only-keep-debug foo foo.debug
  #   strip -g foo

  sudo apt install zlib1g-dbg
  dpkg --listfiles zlib1g-dbg
  #libpython2.7-dbg
}
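
# A hedged sketch of the objcopy workflow described in debug-symbols() above:
# split the debug info out of a binary and attach a .gnu_debuglink so tools
# like gdb (and perf) can find it.  The binary path is a placeholder, not a
# real build target in this repo.
split-debug-example() {
  local bin=${1:-_bin/some-binary}  # hypothetical path

  # Copy only the debug info into a separate file
  objcopy --only-keep-debug $bin $bin.debug

  # Strip debug info from the original binary
  strip -g $bin

  # Add a .gnu_debuglink section pointing at the .debug file
  objcopy --add-gnu-debuglink=$bin.debug $bin

  ls -l $bin $bin.debug
}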

# TODO: Link these two tools in ../oil_DEPS/bin or something, and make them
# work in CI.  (A hedged clone-flamegraph sketch follows below.)

# NOTE: I used this before with python-flamegraph too.
flamegraph() {
  ~/git/other/FlameGraph/flamegraph.pl "$@"
}

stackcollapse-perf() {
  ~/git/other/FlameGraph/stackcollapse-perf.pl "$@"
}
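
# Convenience sketch for the FlameGraph dependency mentioned in the header.
# The ~/git/other location is simply the path the wrappers above assume.
clone-flamegraph() {
  local dest=~/git/other/FlameGraph

  if test -d "$dest"; then
    echo "Already cloned: $dest"
    return
  fi

  mkdir -p ~/git/other
  git clone https://github.com/brendangregg/FlameGraph "$dest"
}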

# http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html
make-graph() {
  local name=${1:-osh-parse}

  local dir=$BASE_DIR
  perf script -i $dir/$name.perf | stackcollapse-perf > $dir/$name.perf-folded

  flamegraph $dir/$name.perf-folded > $dir/$name.svg

  echo "Wrote $dir/$name.svg"
}
_make-readable() {
  local perf_raw=$1

  chmod 644 $perf_raw

  # chown back to the original user; under sudo, $USER is root, so prefer
  # $SUDO_USER when it's set
  chown ${SUDO_USER:-$USER} $perf_raw
}

make-readable() {
  # This gets run as root
  local name=$1

  local perf_raw=$BASE_DIR/$name.perf

  sudo $0 _make-readable $perf_raw

  file $perf_raw
  ls -l $perf_raw
}

_record-cpp() {
  local name=$1  # e.g. oils-for-unix, escape
  local mode=$2
  shift 2

  # Can repeat 13 times without blowing the heap
  #export REPEAT=13

  local freq=10000

  local extra_flags=''
  case $mode in
    (graph) extra_flags='-g' ;;  # call graphs, needed to make a flame graph
    (flat)  extra_flags='' ;;
    (*)     die "Mode should be graph or flat, got $mode" ;;
  esac

  time perf record $extra_flags -F $freq -o $BASE_DIR/$name.perf -- "$@"

  make-readable $name
}
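
# Note: 'perf record -g' unwinds with frame pointers by default.  If a binary
# is built with -fomit-frame-pointer, DWARF-based unwinding is an alternative;
# this is a hedged sketch, not something this script currently uses:
#
#   perf record --call-graph dwarf -F 10000 -o $BASE_DIR/foo.perf -- "$@"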

profile-cpp() {
  local name=$1
  local mode=$2
  shift  # keep $mode in "$@" because _record-cpp needs it too

  mkdir -p $BASE_DIR

  # -E preserves environment variables like BENCHMARK=1
  sudo -E $REPO_ROOT/$0 _record-cpp $name "$@"

  case $mode in
    (graph)
      make-graph $name
      ;;
    (flat)
      local out=$BASE_DIR/$name.report.txt
      text-report $name | tee $out
      echo "Wrote $out"
      ;;
    (*)
      die "Mode should be graph or flat, got $mode"
      ;;
  esac
}

profile-osh-parse() {
  # Profile parsing a big file.  More than half the time is in malloc
  # (_int_malloc in glibc), which is not surprising!

  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  #local file=benchmarks/testdata/configure
  local file=benchmarks/testdata/configure-coreutils

  local -a cmd=( $osh --ast-format none -n $file )
  profile-cpp 'osh-parse' $mode "${cmd[@]}"

  # 'perf list' shows the available events
  #OILS_GC_STATS=1 sudo perf stat -e cache-misses -e cache-references "${cmd[@]}"
  OILS_GC_STATS=1 sudo perf stat "${cmd[@]}"

  # Run again with GC stats
  time OILS_GC_STATS=1 "${cmd[@]}"
}

profile-fib() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  # Same iterations as benchmarks/gc
  local -a cmd=( $osh benchmarks/compute/fib.sh 500 44 )

  profile-cpp 'fib' $mode "${cmd[@]}"
}

# Hm, this is dominated by GC, not regex?
profile-parse-help() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local -a cmd=( $osh benchmarks/parse-help/pure-excerpt.sh parse_help_file benchmarks/parse-help/clang.txt )

  profile-cpp 'parse-help' $mode "${cmd[@]}"
}

profile-example() {
  local example=${1:-escape}
  local mode=${2:-graph}

  local bin="_bin/cxx-opt/mycpp/examples/$example.mycpp"

  ninja $bin
  echo

  BENCHMARK=1 profile-cpp "example-$example" $mode $bin
}

profile-hash-table() {
  local mode=${1:-graph}

  local bin='_bin/cxx-opt/mycpp/hash_table'
  ninja $bin
  profile-cpp 'hash_table' $mode $bin -t hash_speed_test
}

# Perf note: without -o, osh output is shown on the console for some reason;
# it doesn't go to wc?
#perf record -o perf.data -- _bin/osh -n benchmarks/testdata/abuild | wc -l

text-report() {
  ### Show a batch report; 'perf report' (without --stdio) is interactive

  local name=$1
  shift

  local perf_raw=$BASE_DIR/$name.perf

  # Flat report
  perf report -i $perf_raw -n --stdio "$@"
}
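
# A hedged companion to text-report: per-instruction annotation for the
# hottest symbols.  'perf annotate' needs the original binary and its debug
# info to be available; this is a sketch, not part of the CI flow.
annotate-report() {
  local name=$1
  shift

  local perf_raw=$BASE_DIR/$name.perf

  perf annotate -i $perf_raw --stdio "$@"
}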

# Shows instruction counts, branch misses, and so forth.
#
# Wow, 11 billion instructions!  9 billion cycles.  2.3 billion branches.
# Crazy.  Only 21M branch misses, or 0.9%.  Interesting.
_stat() {
  # wc -l consumes the program's stdout; perf stat itself writes to stderr
  perf stat -- "$@" | wc -l
  # -e cache-misses shows only that counter
}
stat() { sudo $0 _stat "$@"; }
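
# Hedged example of selecting specific counters, per the '-e' note above.
# These are standard perf event names; 'perf list' shows what your kernel and
# CPU actually support.
# Example: stat-events _bin/cxx-opt/osh --ast-format none -n benchmarks/testdata/configure
stat-events() {
  sudo perf stat \
    -e instructions,cycles,branches,branch-misses \
    -e cache-references,cache-misses \
    -- "$@" | wc -l
}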

stat-osh-parse() {
  stat _bin/cxx-opt/oils-for-unix --ast-format none -n benchmarks/testdata/configure
}

#
# OLD OVM stuff
#

# Parsing abuild in Debug mode:
#
#   14%   PyEval_EvalFrameEx -- hm, interpreter dispatch is a lot?  More than I
#         thought.  Maybe I need my own loop.
#    8%   type_call -- hm, introspection?
#    7%   PyObject_GetAttr -- matches my intuition; should be done at compile time!
#    6%   do_richcmp -- hm, interesting
#    5%   PyObject_Malloc

# More benchmarks: OPy running under OVM, compiling itself, compiling Oil,
# compiling OPy ports, etc.

# Parsing abuild, the optimized version:
#
#   80%   PyEval_EvalFrameEx -- whoa, everything is inlined?
#   12.5% PyObject_GenericGetAttr -- PyObject_GetAttr is much lower down
#         (plus some kernel time)
#   0.76% lookdict_string -- not a bottleneck.  Hm.
#
# Wow.  Maybe I need counters in optimized mode?
# What I really want is a per-opcode total!
_record() {
  # TODO: The optimized build should have symbols!  Don't build with -s.  And
  # then put the symbols next to the binary somehow?  How do the symbol
  # packages work?
  #perf record -o perf.data -- _bin/oil.ovm-dbg osh -n benchmarks/testdata/abuild | wc -l

  # Call graph recording.  This makes the profile less "flat" in opt mode;
  # otherwise everything is attributed to PyEval_EvalFrameEx.
  local flag='-g'
  local bin=_bin/oil.ovm-opt
  #local bin=_bin/oil.ovm-dbg  # This shows more details

  local freq=1000  # Hz

  #local file=benchmarks/testdata/abuild  # small file
  local file=benchmarks/testdata/configure-coreutils  # big file

  time perf record $flag -F $freq -o perf.data -- $bin osh --ast-format none -n $file
  #perf record -o perf.data -- _bin/osh --ast-format none -n benchmarks/testdata/abuild
}
record() { sudo $0 _record; }

#
# Soil CI
#

build-stress-test() {
  # Special _OIL_DEV flag for -D GC_TIMING
  _OIL_DEV=1 ./configure --without-readline

  mkdir -p _tmp
  c++ -D MARK_SWEEP -I . \
    -O2 -g \
    -o _tmp/gc_stress_test \
    mycpp/gc_stress_test.cc \
    mycpp/mark_sweep_heap.cc \
    mycpp/gc_builtins.cc \
    mycpp/gc_mylib.cc \
    mycpp/gc_str.cc \
    mycpp/hash.cc \
    -lstdc++
}

profile-stress-test() {
  profile-cpp 'gc_stress_test' flat \
    _tmp/gc_stress_test
}

print-index() {
  echo '<body style="margin: 0 auto; width: 40em; font-size: large">'
  echo '<h1>Perf Profiles</h1>'

  for path in $BASE_DIR/*.txt; do
    local filename=$(basename $path)
    # Note: escape the inner quotes, or they end the outer string
    echo "<a href=\"$filename\">$filename</a> <br/>"
  done

  echo '</body>'
}

# TODO: fetch the tarball from the cpp-small CI task

build-tar() {
  local tar=${1:-_release/oils-for-unix.tar}

  tar=$PWD/$tar

  local tmp=$BASE_DIR/tar
  mkdir -p $tmp

  pushd $tmp

  tar --extract < $tar
  cd oils-for-unix-*  # glob of 1

  ./configure

  # TODO: add bumproot
  for variant in opt+bumpleak opt; do
    echo
    time _build/oils.sh '' $variant
    echo

    _bin/cxx-$variant-sh/osh -c 'echo "hi from $0"'
  done

  # TODO (a hedged sketch follows below):
  # - profile each executable
  # - add OILS_GC_THRESHOLD=$big to avoid GC
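  #
  # Something like this might work (untested; the threshold is arbitrary):
  #
  #   for variant in opt+bumpleak opt; do
  #     OILS_GC_THRESHOLD=$((500 * 1000 * 1000)) \
  #       profile-cpp "tar-$variant" flat \
  #       _bin/cxx-$variant-sh/osh --ast-format none -n $REPO_ROOT/benchmarks/testdata/configure
  #   done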

  popd
}

profile-cpython-configure() {
  ### Borrowed from benchmarks/osh-runtime.sh

  local osh=${1:-$REPO_ROOT/_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local dir=$BASE_DIR/cpython-configure

  # May fail because perf ran as root and left root-owned files
  rm -r -f -v $dir || true

  mkdir -p $dir

  local -a cmd=( $osh $REPO_ROOT/Python-2.7.13/configure )

  pushd $dir
  profile-cpp 'cpython-configure' $mode "${cmd[@]}"
  popd
}

cpython-report() {
  #perf report -i $BASE_DIR/cpython-configure.perf

  # oils-for-unix is only 4.89% of the time?  That's #5:
  #   48% in the kernel
  #   23% in cc1
  #
  # That means we're still a bit slow.

  # TODO: I want to see OVERALL percentages, not per-DSO ones
  # (see the hedged cpython-report-overall sketch below)
  #
  # GC is 1.6% and let's say rooting is 3%.  That's ~300 ms out of 10s.
  # GC can account for the whole thing.
  # I wonder if we can do GC while waiting for processes?  They might be tiny
  # processes though.

  perf report -i $BASE_DIR/cpython-configure.perf \
    -n --dso=oils-for-unix --percentage=relative

  #perf report -i $BASE_DIR/cpython-configure.perf --sort=dso
}
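
# Hedged variant of the above: with --percentage=absolute, the numbers for the
# filtered DSO are fractions of ALL samples, which is closer to the "overall
# percentages" the TODO asks for.
cpython-report-overall() {
  perf report -i $BASE_DIR/cpython-configure.perf \
    -n --dso=oils-for-unix --percentage=absolute
}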

local-test() {
  local osh=_bin/cxx-opt/osh
  ninja $osh
  profile-fib $REPO_ROOT/$osh flat
}

soil-run() {
  echo 'TODO: run benchmarks/gc tasks'
  # But we don't have Ninja
  # Fetch the tarball?

  # Can you WAIT for the tarball?
  # You can wait for the cpp-small task that builds it?  Ah, hacky hacky.

  build-stress-test

  profile-stress-test

  export-osh-cpp _tmp/native-tar-test opt
  #export-osh-cpp '' opt

  profile-fib $OSH flat
  profile-osh-parse $OSH flat
  profile-parse-help $OSH flat

  print-index > $BASE_DIR/index.html

  echo "Wrote $BASE_DIR/index.html"
}

"$@"
|