1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Why is CPython configure slower under OSH?
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/autoconf.sh <function name>
|
7 | #
|
8 | # Examples:
|
9 | # $0 patch-pyconf # times builtin
|
10 | # $0 measure-times # time-tsv with gc stats
|
11 | # $0 report-times # times builtin
|
12 | #
|
13 | # $0 measure-syscalls
|
14 | # $0 report-syscalls
|
15 | # $0 report-processes
|
16 | # $0 report-external
|
17 | #
|
18 | # Simpler:
|
19 | # $0 measure-rusage # time-tsv
|
20 |
|
21 | set -o nounset
|
22 | set -o pipefail
|
23 | set -o errexit
|
24 |
|
25 | REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
|
26 |
|
27 | source benchmarks/cachegrind.sh # with-cachegrind
|
28 | source benchmarks/callgrind.sh # with-cachegrind
|
29 | source test/tsv-lib.sh # $TAB
|
30 |
|
31 | readonly BASE_DIR_RELATIVE=_tmp/autoconf
|
32 | readonly BASE_DIR=$REPO_ROOT/$BASE_DIR_RELATIVE
|
33 | readonly PY_CONF=$REPO_ROOT/Python-2.7.13/configure
|
34 |
|
35 | #
|
36 | # Trying to measure allocation/GC overhead
|
37 | #
|
38 | # This DOES NOT HELP because bumpleak/bumproot are **slower** on bigger heaps.
|
39 | # There's less cache locality!
|
40 | #
|
41 |
|
42 | cpython-configure-tasks() {
|
43 | local -a variants=( opt+bumpleak opt+bumproot opt )
|
44 | for v in ${variants[@]}; do
|
45 | echo "${v}${TAB}_bin/cxx-$v/osh"
|
46 | done
|
47 | }
|
48 |
|
49 | cpython-setup() {
|
50 | cpython-configure-tasks | while read -r _ osh; do
|
51 | ninja $osh
|
52 | done
|
53 | }
|
54 |
|
55 | measure-alloc-overhead() {
|
56 | local base_dir=$BASE_DIR/alloc-overhead
|
57 | rm -r -f -v $base_dir
|
58 |
|
59 | cpython-configure-tasks | while read -r variant osh; do
|
60 | osh=$REPO_ROOT/$osh
|
61 |
|
62 | local task_dir=$base_dir/$variant
|
63 |
|
64 | mkdir -p $task_dir
|
65 | pushd $task_dir > /dev/null
|
66 |
|
67 | local -a flags=(
|
68 | --output "$base_dir/$variant.tsv"
|
69 | --rusage
|
70 | )
|
71 |
|
72 | local -a time_argv
|
73 |
|
74 | time_argv=(
|
75 | time-tsv --print-header
|
76 | "${flags[@]}"
|
77 | --field variant
|
78 | )
|
79 | "${time_argv[@]}"
|
80 |
|
81 | time_argv=(
|
82 | time-tsv --append
|
83 | "${flags[@]}"
|
84 | --field "$variant"
|
85 | -- $osh $PY_CONF
|
86 | )
|
87 |
|
88 | #echo "${time_argv[@]}"
|
89 | "${time_argv[@]}"
|
90 |
|
91 | popd > /dev/null
|
92 |
|
93 | done
|
94 | }
|
95 |
|
96 | #
|
97 | # Compare bash/dash/osh locally
|
98 | #
|
99 |
|
100 | measure-rusage() {
|
101 | local base_dir=$BASE_DIR/rusage
|
102 | rm -r -f -v $base_dir
|
103 |
|
104 | shell-tasks | while read -r sh_label sh_path; do
|
105 |
|
106 | local task_dir=$base_dir/$sh_label
|
107 |
|
108 | mkdir -p $task_dir
|
109 | pushd $task_dir > /dev/null
|
110 |
|
111 | local -a flags=(
|
112 | --output "$base_dir/$sh_label.tsv"
|
113 | --rusage
|
114 | )
|
115 |
|
116 | local -a time_argv
|
117 |
|
118 | time_argv=(
|
119 | time-tsv --print-header
|
120 | "${flags[@]}"
|
121 | --field sh_label
|
122 | )
|
123 | "${time_argv[@]}"
|
124 |
|
125 | time_argv=(
|
126 | time-tsv --append
|
127 | "${flags[@]}"
|
128 | --field "$sh_label"
|
129 | -- $sh_path $PY_CONF
|
130 | )
|
131 |
|
132 | #echo "${time_argv[@]}"
|
133 | "${time_argv[@]}"
|
134 |
|
135 | popd > /dev/null
|
136 |
|
137 | done
|
138 | }
|
139 |
|
140 | #
|
141 | # Now try strace
|
142 | #
|
143 |
|
144 | shell-tasks() {
|
145 | echo "bash${TAB}bash"
|
146 | echo "dash${TAB}dash"
|
147 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
|
148 | }
|
149 |
|
150 | measure-syscalls() {
|
151 | local osh=_bin/cxx-opt/osh
|
152 | #local osh=_bin/cxx-dbg/osh
|
153 |
|
154 | ninja $osh
|
155 |
|
156 | local base_dir=$BASE_DIR/syscalls
|
157 |
|
158 | rm -r -f -v $base_dir
|
159 |
|
160 | shell-tasks | while read -r sh_label sh_path; do
|
161 | local dir=$base_dir/$sh_label
|
162 | mkdir -p $dir
|
163 |
|
164 | local counts_dir=$base_dir/$sh_label
|
165 | mkdir -p $counts_dir
|
166 |
|
167 | pushd $dir
|
168 | #strace -o $counts -c $sh_path $PY_CONF
|
169 | # See how many external processes are started?
|
170 | #strace -o $counts -ff -e execve $sh_path $PY_CONF
|
171 | strace -o $counts_dir/syscalls -ff $sh_path $PY_CONF
|
172 | popd
|
173 | done
|
174 | }
|
175 |
|
176 | # --- _tmp/autoconf/bash
|
177 | # 6047
|
178 | # 4621
|
179 | # --- _tmp/autoconf/dash
|
180 | # 6088
|
181 | # 4627
|
182 | # --- _tmp/autoconf/osh
|
183 | # 5691
|
184 | # 4631
|
185 | #
|
186 | # Woah we start fewer processes! But are not faster?
|
187 |
|
188 | grep-exec() {
|
189 | # --max-count 1 - only report one execve per process
|
190 | egrep --no-filename --max-count 1 -o 'execve\("[^"]+' "$@"
|
191 | }
|
192 |
|
193 | # andy@hoover:~/git/oilshell/oil$ benchmarks/autoconf.sh report-syscalls
|
194 | # --- _tmp/autoconf/syscalls/bash
|
195 | # 2592 _tmp/autoconf/syscalls/bash/syscalls.903220
|
196 | # 2608 _tmp/autoconf/syscalls/bash/syscalls.898727
|
197 | # 2632 _tmp/autoconf/syscalls/bash/syscalls.898387
|
198 | # 2679 _tmp/autoconf/syscalls/bash/syscalls.898292
|
199 | # 2853 _tmp/autoconf/syscalls/bash/syscalls.898927
|
200 | # 2873 _tmp/autoconf/syscalls/bash/syscalls.898334
|
201 | # 2920 _tmp/autoconf/syscalls/bash/syscalls.898895
|
202 | # 3204 _tmp/autoconf/syscalls/bash/syscalls.898664
|
203 | # 112549 _tmp/autoconf/syscalls/bash/syscalls.897471
|
204 | # 1360223 total
|
205 | #
|
206 | # --- _tmp/autoconf/syscalls/dash
|
207 | # 2592 _tmp/autoconf/syscalls/dash/syscalls.909344
|
208 | # 2607 _tmp/autoconf/syscalls/dash/syscalls.904921
|
209 | # 2630 _tmp/autoconf/syscalls/dash/syscalls.904581
|
210 | # 2683 _tmp/autoconf/syscalls/dash/syscalls.904486
|
211 | # 2851 _tmp/autoconf/syscalls/dash/syscalls.905109
|
212 | # 2873 _tmp/autoconf/syscalls/dash/syscalls.904528
|
213 | # 2920 _tmp/autoconf/syscalls/dash/syscalls.905088
|
214 | # 3204 _tmp/autoconf/syscalls/dash/syscalls.904858
|
215 | # 112922 _tmp/autoconf/syscalls/dash/syscalls.903626
|
216 | # 1372118 total
|
217 | #
|
218 | # --- _tmp/autoconf/syscalls/osh
|
219 | # 2592 _tmp/autoconf/syscalls/osh/syscalls.915226
|
220 | # 2607 _tmp/autoconf/syscalls/osh/syscalls.910993
|
221 | # 2630 _tmp/autoconf/syscalls/osh/syscalls.910647
|
222 | # 2679 _tmp/autoconf/syscalls/osh/syscalls.910561
|
223 | # 2851 _tmp/autoconf/syscalls/osh/syscalls.911162
|
224 | # 2873 _tmp/autoconf/syscalls/osh/syscalls.910599
|
225 | # 2920 _tmp/autoconf/syscalls/osh/syscalls.911143
|
226 | # 3204 _tmp/autoconf/syscalls/osh/syscalls.910936
|
227 | # 72921 _tmp/autoconf/syscalls/osh/syscalls.909769
|
228 | # 1211074 total
|
229 |
|
230 | report-processes() {
|
231 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
232 | echo "--- $sh_dir"
|
233 | ls $sh_dir/* | wc -l
|
234 | grep-exec $sh_dir/syscalls.* | wc -l
|
235 | echo
|
236 |
|
237 | done
|
238 | }
|
239 |
|
240 | report-external() {
|
241 | local n=${1:-5}
|
242 |
|
243 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
244 | echo "--- $sh_dir"
|
245 |
|
246 | grep-exec $sh_dir/syscalls.* | sort | uniq -c | sort -n | tail -n $n
|
247 | echo
|
248 | done
|
249 | }
|
250 |
|
251 | report-syscalls() {
|
252 | # Hm this is instructive, the shell itself makes the most syscalls
|
253 | # And fewer than other shells?
|
254 |
|
255 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
256 | echo "--- $sh_dir"
|
257 | wc -l $sh_dir/syscalls.* | sort -n | tail
|
258 | echo
|
259 | done
|
260 | }
|
261 |
|
262 | #
|
263 | # Cachegrind
|
264 | #
|
265 |
|
266 | measure-valgrind() {
|
267 | local tool=$1
|
268 |
|
269 | # opt seems to give OK results, but I thought dbg was more accurate
|
270 | #local osh=_bin/cxx-opt/osh
|
271 | local osh=_bin/cxx-dbg/osh
|
272 |
|
273 | ninja $osh
|
274 |
|
275 | local osh=$REPO_ROOT/$osh
|
276 |
|
277 | local base_dir=$REPO_ROOT/_tmp/$tool
|
278 |
|
279 | local dir=$base_dir/cpython-configure
|
280 | rm -r -f -v $dir
|
281 |
|
282 | local out_file=$base_dir/cpython-configure.txt
|
283 |
|
284 | mkdir -v -p $dir
|
285 |
|
286 | pushd $dir
|
287 | $tool $out_file $osh $PY_CONF
|
288 | popd
|
289 | }
|
290 |
|
291 | measure-cachegrind() {
|
292 | measure-valgrind with-cachegrind
|
293 | }
|
294 |
|
295 | measure-callgrind() {
|
296 | # This takes ~5 minutes with opt binary, ~6:43 with dbg
|
297 | # vs ~15 seconds uninstrumented
|
298 | time measure-valgrind with-callgrind
|
299 | }
|
300 |
|
301 | # Note:
|
302 | # benchmarks/osh-runtime.sh compares our release, which does not have #ifdef
|
303 | # GC_TIMING, so we don't know total GC time.
|
304 |
|
305 | # TODO:
|
306 | #
|
307 | # - Run locally, reproduce GC_TIMING - this is not in the release build
|
308 | # - it seems to say only 143 ms total GC time, but we're seeing 1.5+ seconds
|
309 | # slowdown on Cpython configure vs. bash
|
310 | # - I want a local run that automates it, and returns PERCENTAGES for elapsed
|
311 | # time, sys time, user time
|
312 | # - We also might not want to amortize free() inside Allocate()
|
313 | # - #ifdef LAZY_FREE I think! That might show a big slowdown with free
|
314 |
|
315 | patch-pyconf() {
|
316 | #sed -i $'s/ac_compile=\'$CC/ac_compile=\'times; $CC/g' $PY_CONF
|
317 |
|
318 | # temporary
|
319 | echo 'times > $SH_BENCHMARK_TIMES' >> $PY_CONF
|
320 | }
|
321 |
|
322 | measure-times() {
|
323 | local osh=_bin/cxx-opt/osh
|
324 | ninja $osh
|
325 |
|
326 | local base_dir=$BASE_DIR/times
|
327 | rm -r -f -v $base_dir
|
328 |
|
329 | local trace_dir=$base_dir/oils-trace
|
330 | mkdir -p $trace_dir
|
331 |
|
332 | shell-tasks | while read -r sh_label sh_path; do
|
333 | #case $sh_label in bash|dash) continue ;; esac
|
334 |
|
335 | local dir=$base_dir/$sh_label
|
336 | mkdir -p $dir
|
337 |
|
338 | pushd $dir
|
339 |
|
340 | local -a flags=(
|
341 | --output "$base_dir/$sh_label.tsv"
|
342 | --rusage
|
343 | )
|
344 |
|
345 | local -a time_argv
|
346 |
|
347 | time_argv=(
|
348 | time-tsv --print-header
|
349 | "${flags[@]}"
|
350 | --field sh_label
|
351 | )
|
352 | "${time_argv[@]}"
|
353 |
|
354 | time_argv=(
|
355 | time-tsv --append
|
356 | "${flags[@]}"
|
357 | --field "$sh_label"
|
358 | -- $sh_path $PY_CONF
|
359 | )
|
360 |
|
361 | #echo "${time_argv[@]}"
|
362 |
|
363 | # 1269 argv0.json files created
|
364 | # we can miss some via NOLASTFORK optimization
|
365 | #OILS_TRACE_DIR=$trace_dir \
|
366 |
|
367 | _OILS_GC_VERBOSE=1 OILS_GC_STATS_FD=99 \
|
368 | SH_BENCHMARK_TIMES=$base_dir/$sh_label.times.txt \
|
369 | "${time_argv[@]}" \
|
370 | 99>$base_dir/$sh_label.gc-stats.txt
|
371 |
|
372 | popd
|
373 | done
|
374 | }
|
375 |
|
376 | inner-long-tsv() {
|
377 | python2 -c '
|
378 | import os, re, sys
|
379 |
|
380 | def PrintRow(row):
|
381 | print("\t".join(row))
|
382 |
|
383 | PrintRow(["shell", "who", "what", "seconds"])
|
384 |
|
385 | for path in sys.argv[1:]:
|
386 | filename = os.path.basename(path)
|
387 | shell = filename.split(".")[0]
|
388 |
|
389 | f = open(path)
|
390 | s = f.read()
|
391 |
|
392 | secs = re.findall("0m([0-9.]+)s", s)
|
393 | assert len(secs) == 4, secs
|
394 |
|
395 | PrintRow([shell, "self", "user", secs[0]])
|
396 | PrintRow([shell, "self", "sys", secs[1]])
|
397 | PrintRow([shell, "child", "user", secs[2]])
|
398 | PrintRow([shell, "child", "sys", secs[3]])
|
399 |
|
400 | # Non-normalized, but OK
|
401 | total_secs = sum(float(s) for s in secs)
|
402 | PrintRow([shell, "both", "both", str(total_secs)])
|
403 |
|
404 | ' $BASE_DIR/times/*.times.txt
|
405 | }
|
406 |
|
407 | compare-dim() {
|
408 | # 8% more child system time
|
409 | local who=${1:-child}
|
410 | local what=${2:-user}
|
411 |
|
412 | echo "=== $who $what ==="
|
413 |
|
414 | # Annoying
|
415 | # https://www.math.utah.edu/docs/info/gawk_8.html
|
416 | # "If, for some reason, you need to force a number to be converted to a
|
417 | # string, concatenate the empty string, "", with that number. To force a
|
418 | # string to be converted to a number, add zero to that string."
|
419 |
|
420 | cat $BASE_DIR/times-long.tsv | awk -v "who=$who" -v "what=$what" '
|
421 | BEGIN {
|
422 | TAB = "\t"
|
423 |
|
424 | i = 0
|
425 |
|
426 | printf "%s\t%s\t%s\t%s\n", "shell", "secs", "ratio", "diff secs"
|
427 | }
|
428 | $2 == who && $3 == what {
|
429 | if (i == 0) {
|
430 | first_secs = $4 + 0
|
431 | }
|
432 | i++
|
433 |
|
434 | secs = $4 + 0
|
435 | ratio = secs / first_secs
|
436 | diff = secs - first_secs
|
437 |
|
438 | # Need commas for OFMT to work correctly?
|
439 | printf "%s\t%5.3f\t%5.3f\t%5.3f\n", $1, secs, ratio, diff
|
440 | }
|
441 | '
|
442 |
|
443 | echo
|
444 | }
|
445 |
|
446 | compare-times() {
|
447 | log "INNER"
|
448 | log ''
|
449 |
|
450 | compare-dim self user
|
451 |
|
452 | compare-dim self sys
|
453 |
|
454 | compare-dim child user
|
455 |
|
456 | compare-dim child sys
|
457 |
|
458 | compare-dim both both
|
459 |
|
460 | # outer
|
461 | log "OUTER"
|
462 | log ''
|
463 |
|
464 | compare-dim both elapsed
|
465 |
|
466 | # These kinda match
|
467 | return
|
468 | compare-dim both user
|
469 | compare-dim both sys
|
470 | }
|
471 |
|
472 | outer-long-tsv() {
|
473 | log "=== outer times ==="
|
474 | awk '
|
475 | BEGIN {
|
476 | i = 0
|
477 |
|
478 | printf "%s\t%s\t%s\t%s\n", "shell", "who", "what", "seconds"
|
479 | }
|
480 | i == 0 {
|
481 | #print "Skipping header"
|
482 | i++
|
483 | next
|
484 | }
|
485 | i >= 1 {
|
486 | elapsed = $2 + 0
|
487 | user = $3 + 0
|
488 | sys = $4 + 0
|
489 | sh_label = $6
|
490 |
|
491 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "elapsed", elapsed
|
492 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "user", user
|
493 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "sys", sys
|
494 |
|
495 | i++
|
496 | }
|
497 | ' $BASE_DIR/outer-wide.tsv
|
498 | }
|
499 |
|
500 | report-times() {
|
501 | head $BASE_DIR/times/*.tsv
|
502 | echo
|
503 | head $BASE_DIR/times/*.times.txt
|
504 | echo
|
505 |
|
506 | inner-long-tsv | tee $BASE_DIR/inner-long.tsv
|
507 | echo
|
508 |
|
509 | tsv-concat $BASE_DIR/times/*.tsv | tee $BASE_DIR/outer-wide.tsv
|
510 | outer-long-tsv | tee $BASE_DIR/outer-long.tsv
|
511 | echo
|
512 |
|
513 | tsv-concat $BASE_DIR/{inner,outer}-long.tsv | tee $BASE_DIR/times-long.tsv
|
514 |
|
515 | compare-times
|
516 | }
|
517 |
|
518 | ### Why is clone() taking longer according to strace?
|
519 |
|
520 | fork-tasks() {
|
521 | echo "bash${TAB}bash"
|
522 | echo "dash${TAB}dash"
|
523 |
|
524 | # Hm this is noisy, but cxx-opt-sh does seem slower
|
525 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
|
526 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt-sh/osh"
|
527 | }
|
528 |
|
529 | measure-fork() {
|
530 | fork-tasks | while read -r sh_label sh_path; do
|
531 | #case $sh_label in bash|dash) continue ;; esac
|
532 |
|
533 | echo "=== $sh_path ==="
|
534 |
|
535 | # Builtin is very fast
|
536 | #time $sh_path -c 'for i in $(seq 100); do true; done'
|
537 |
|
538 | # Hm this is very noisy
|
539 | # TODO use hyperfine?
|
540 | time $sh_path -c 'for i in $(seq 100); do /bin/true; done'
|
541 |
|
542 | case $sh_label in
|
543 | osh)
|
544 | # Oops, we are not symlinking to the .stripped binary!
|
545 | # This is explicitly done for symbols and benchmarking.
|
546 | # Hm does that make it slower then?
|
547 |
|
548 | ls -l -L $sh_path
|
549 | ldd $sh_path
|
550 | ;;
|
551 | esac
|
552 | done
|
553 | }
|
554 |
|
555 | # $ head _tmp/elapsed/*.times.txt
|
556 | # ==> _tmp/elapsed/bash.times.txt <==
|
557 | # 0m0.213s 0m0.477s
|
558 | # 0m8.233s 0m2.931s
|
559 | #
|
560 | # ==> _tmp/elapsed/dash.times.txt <==
|
561 | # 0m0.217s 0m0.463s
|
562 | # 0m8.281s 0m2.922s
|
563 | #
|
564 | # ==> _tmp/elapsed/osh.times.txt <==
|
565 | # 0m0.360s 0m0.720s
|
566 | # 0m8.790s 0m2.960s
|
567 |
|
568 | # shell user time - GC and allocs
|
569 | # shell system time - ???
|
570 | # child user time - ???
|
571 | # TODO: count how many processes this is.
|
572 | # It's more than 500 ms
|
573 | # Is that 500 processes, and 1 ms per process?
|
574 |
|
575 | fork-time() {
|
576 | local osh=_bin/cxx-opt/osh
|
577 |
|
578 | $osh -c 'time for i in {1..1000}; do true; done'
|
579 | echo
|
580 |
|
581 | $osh -c 'time for i in {1..1000}; do ( true ); done'
|
582 | echo
|
583 |
|
584 | # Does this increase fork time or no?
|
585 | # Hm I can only get the forks up to 306ms for 1000, or 300 us
|
586 | # But the HereDocWriter does dup() and so forth
|
587 | $osh -c '
|
588 | echo ysh-parse
|
589 | time for i in {1..40}; do
|
590 | . test/ysh-parse-errors.sh
|
591 | done
|
592 | times
|
593 | time for i in {1..1000}; do
|
594 | ( true )
|
595 | done'
|
596 | echo
|
597 | }
|
598 |
|
599 | "$@"
|