| 1 | #!/usr/bin/env bash
 | 
| 2 | #
 | 
| 3 | # Keep track of benchmark data provenance.
 | 
| 4 | #
 | 
| 5 | # Usage:
 | 
| 6 | #   benchmarks/id.sh <function name>
 | 
| 7 | 
 | 
# Strict mode: abort on unset variables, on a failure in any pipeline stage,
# and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the directory one level above this script.  "$0" is quoted
# so a checkout path containing whitespace still resolves, and '&&' keeps a
# failed 'cd' from silently reporting the current directory instead.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# These paths are relative to the current working directory.
source build/common.sh  # for $CLANG
source benchmarks/common.sh
source test/tsv-lib.sh  # tsv-row
 | 
| 17 | 
 | 
# Print a timestamp-based job ID on stdout, e.g. 2024-01-31__23-59-59.
print-job-id() {
  local -r stamp_format='%Y-%m-%d__%H-%M-%S'
  date "+${stamp_format}"
}
 | 
| 21 | 
 | 
| 22 | # TODO: add benchmark labels/hashes for osh and all other shells
 | 
| 23 | #
 | 
| 24 | # Need to archive labels too.
 | 
| 25 | #
 | 
| 26 | # TODO: How do I make sure the zsh label is current?  Across different
 | 
| 27 | # machines?
 | 
| 28 | #
 | 
| 29 | # What happens when zsh is silently upgraded?
 | 
| 30 | # I guess before every benchmark, you have to run the ID collection.  Man
 | 
| 31 | # that is a lot of code.
 | 
| 32 | #
 | 
| 33 | # Should I make symlinks to the published location?
 | 
| 34 | #
 | 
| 35 | # Maybe bash/dash/mksh/zsh should be invoked through a symlink?
 | 
| 36 | # Every symlink is a shell runtime version, and it has an associated
 | 
| 37 | # toolchain?
 | 
| 38 | 
 | 
| 39 | # Platform is ambient?
 | 
| 40 | # _tmp/
 | 
| 41 | #   shell-id/
 | 
| 42 | #     bash/
 | 
| 43 | #       HASH.txt
 | 
| 44 | #       version.txt
 | 
| 45 | #     dash/
 | 
| 46 | #       HASH.txt
 | 
| 47 | #       version.txt
 | 
| 48 | #   host-id/
 | 
| 49 | #     lisa/
 | 
| 50 | #       HASH.txt
 | 
| 51 | #       cpuinfo.txt
 | 
| 52 | 
 | 
| 53 | # ../benchmark-data/
 | 
| 54 | #   shell-id/
 | 
| 55 | #     bash-$HASH/
 | 
| 56 | #     osh-$HASH/   # osh-cpython, osh-ovm?   osh-opy-ovm?  Too many dimensions.
 | 
| 57 | #                # the other shells don't have this?
 | 
| 58 | #     zsh-$HASH/
 | 
| 59 | #   host-id/
 | 
| 60 | #     lisa-$HASH/
 | 
| 61 | 
 | 
# Copy the contents of a file to another path, but only when the source is a
# regular file.
#
# Args:
#   $1 - path of the file to copy
#   $2 - destination file (overwritten)
#
# Returns 0 without touching $2 when $1 does not exist.
_dump-if-exists() {
  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return 0
  fi

  # The redirect target is quoted: an unquoted name containing whitespace is
  # an "ambiguous redirect" error in bash.
  cat "$path" > "$out"
}
 | 
| 70 | 
 | 
| 71 | #
 | 
| 72 | # Shell ID
 | 
| 73 | #
 | 
| 74 | 
 | 
# Record identifying information about a shell (or python2) in a directory.
#
# Args:
#   $1 - shell to describe; looked up with 'command -v'
#   $2 - output directory (created if missing)
#
# Files written under $2:
#   sh-path.txt          - $1 as given
#   git-commit-hash.txt  - only when $1 matches */osh*; taken from
#                          $XSHAR_GIT_COMMIT if set, else from the git checkout
#   git-branch.txt       - only in the git checkout case above
#   version.txt          - bash, zsh, yash (--version) and python2 (-V)
#   osh-version.txt      - osh, except for paths matching *_bin/*/osh
#   dpkg-version.txt     - dash and mksh (output of 'dpkg -s')
#
# Calls 'die' when the shell can't be found or its name isn't recognized.
dump-shell-id() {
  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  printf '%s\n' "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh*)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        printf '%s\n' "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        printf '%s\n' "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #  ;;
    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # The version is captured from stderr.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
 | 
| 136 | 
 | 
# Print the text that identifies a shell, taken from a directory written by
# dump-shell-id.  The caller pipes this into a checksum.
#
# Args:
#   $1 - directory containing the dumped files
#
# Each input file is optional; missing ones contribute nothing.  Always
# returns 0.
_shell-id-hash() {
  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  test -f "$file" && cat "$file"

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  test -f "$file" && grep -E '^Version' "$file"

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  test -f "$file" && grep -E '^Oil version|^Interpreter:' "$file"

  # For OSH
  file=$src/git-commit-hash.txt
  test -f "$file" && cat "$file"

  # The last '&&' list is non-zero whenever its file is absent; don't let that
  # become this function's status.
  return 0
}
 | 
| 163 | 
 | 
# Copy a dumped shell-id directory to $dest_base/<shell name>-<short id> and
# print the short ID (first 8 characters of the md5 checksum) on stdout.
#
# Args:
#   $1 - source directory; must contain sh-path.txt
#   $2 - destination base directory (default ../benchmark-data/shell-id)
#
# Also writes the full md5sum output to HASH.txt in the destination.
publish-shell-id() {
  ### Copy temp directory to hashed location

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # md5sum prints "<digest>  -".  Word splitting is intentional here so that
  # HASH.txt keeps its existing single-space form.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published shell ID to $dest"

  echo "$id"
}
 | 
| 189 | 
 | 
| 190 | #
 | 
| 191 | # Platform ID
 | 
| 192 | #
 | 
| 193 | 
 | 
| 194 | # Events that will change the env for a given machine:
 | 
| 195 | # - kernel upgrade
 | 
| 196 | # - distro upgrade
 | 
| 197 | 
 | 
| 198 | # How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
 | 
| 199 | # How to calculate the hash though?
 | 
| 200 | 
 | 
# Record identifying information about the current machine in a directory.
#
# Args:
#   $1 - output directory (default _tmp/host-id/<hostname>); created if missing
#
# Files written: hostname.txt, machine.txt, kernel.txt, plus lsb-release.txt,
# cpuinfo.txt and meminfo.txt when the corresponding system files exist.
dump-host-id() {
  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"

  {
    # Short flags work on OS X too
    uname -s  # --kernel-name
    uname -r  # --kernel-release
    uname -v  # --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  if test -e /proc/cpuinfo; then
    grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  fi

  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  if test -e /proc/meminfo; then
    grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"
  fi

  #head "$out_dir"/* 1>&2  # don't write to stdout
}
 | 
| 237 | 
 | 
# There is already the concept of a target triple:
 | 
| 239 | # http://wiki.osdev.org/Target_Triplet
 | 
| 240 | # It's not exactly the same as what we need here, but close.
 | 
| 241 | 
 | 
# Print the text that identifies a host, taken from a directory written by
# dump-host-id.  The caller pipes this into a checksum.
#
# Args:
#   $1 - directory containing machine.txt and kernel.txt, and optionally
#        lsb-release.txt
#
# Always returns 0.
_host-id-hash() {
  local src=$1

  # Don't hash CPU or memory
  #cat "$src/cpuinfo.txt"
  #cat "$src/hostname.txt"  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
 | 
| 260 | 
 | 
# Copy a dumped host-id directory to $dest_base/<name>-<short id> and write
# the short ID (first 8 characters of the md5 checksum) to stdout.
#
# Args:
#   $1 - source directory; its basename becomes <name>
#   $2 - destination base directory (default ../benchmark-data/host-id)
#
# Also writes the full md5sum output to HASH.txt in the destination.
publish-host-id() {
  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # md5sum prints "<digest>  -".  Word splitting is intentional here so that
  # HASH.txt keeps its existing single-space form.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published host ID to $dest"

  echo "$id"
}
 | 
| 284 | 
 | 
| 285 | #
 | 
| 286 | # Compilers
 | 
| 287 | # 
 | 
| 288 | 
 | 
# Record a compiler's '--version' output in $out_dir/version.txt.
#
# Args:
#   $1 - path to the compiler; only paths ending in /gcc or /clang are queried
#   $2 - output directory (default _tmp/compiler-id/<basename of $1>)
#
# For any other path, version.txt is still created but left empty.
dump-compiler-id() {
  local cc=$1  # path to the compiler
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc)
      "$cc" --version
      # -v has more details, but they might be overkill.
      ;;
    */clang)
      "$cc" --version
      # -v has stuff we don't want
      ;;
  esac > "$out_dir/version.txt"
}
 | 
| 306 | 
 | 
# Print the text that identifies a compiler: the contents of
# $1/version.txt minus any line containing 'InstalledDir'.
#
# Args:
#   $1 - directory containing version.txt
_compiler-id-hash() {
  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
 | 
| 313 | 
 | 
# Copy a dumped compiler-id directory to $dest_base/<name>-<short id> and write
# the short ID (first 8 characters of the md5 checksum) to stdout.
#
# Args:
#   $1 - source directory; its basename becomes <name>
#   $2 - destination base directory (default ../benchmark-data/compiler-id)
#
# Also writes the full md5sum output to HASH.txt in the destination.
publish-compiler-id() {
  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately from the assignment so a failing basename isn't
  # masked by the exit status of 'local'.
  local name
  name=$(basename "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # md5sum prints "<digest>  -".  Word splitting is intentional here so that
  # HASH.txt keeps its existing single-space form.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
 | 
| 335 | 
 | 
| 336 | #
 | 
| 337 | # Table Output
 | 
| 338 | #
 | 
| 339 | 
 | 
| 340 | # Writes a table of host and shells to stdout.  Writes text files and
 | 
| 341 | # calculates IDs for them as a side effect.
 | 
| 342 | #
 | 
| 343 | # The table can be passed to other benchmarks to ensure that their provenance
 | 
| 344 | # is recorded.
 | 
| 345 | 
 | 
# Args:
#   $1   - host label; when empty, the output of 'hostname' is used
#   $2   - job ID, copied into every row
#   $3   - base directory for the published host-id/ and shell-id/ directories
#   $4.. - shells to record, one row each
#
# Output rows are: job_id host_name host_hash sh_path shell_hash.  The .txt
# file is space-separated with no header; the .tsv file starts with a header
# row.
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell,host-id}

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  mkdir -p _tmp/provenance

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")
  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncate, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Keep the loop variable local so it doesn't leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so temp dirs are numbered rather than
    # named after the shell.

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
 | 
| 402 | 
 | 
# Record host and compiler IDs for gcc (found on $PATH) and $CLANG.
#
# Writes one line per compiler to
#   _tmp/provenance/<host>.<job id>.compiler-provenance.txt
# in the form "job_id host host_hash compiler_path compiler_hash", and prints
# that file's path on stdout so callers can capture it.
#
# Globals:
#   CLANG (read) - path to clang
compiler-provenance() {
  local job_id
  job_id=$(print-job-id)

  local host
  host=$(hostname)

  # Filename
  local out=_tmp/provenance/${host}.${job_id}.compiler-provenance.txt

  # The redirect into $out below fails if its directory doesn't exist yet.
  mkdir -p _tmp/provenance

  local tmp_dir=_tmp/host-id/$host
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir")

  local compiler_hash
  local compiler_path
  local name

  # gcc is assumed to be in the $PATH.
  # The word list is deliberately unquoted: a missing gcc or an empty $CLANG
  # simply drops out of the list.
  # shellcheck disable=SC2046,SC2086
  for compiler_path in $(command -v gcc) $CLANG; do
    name=$(basename "$compiler_path")

    tmp_dir=_tmp/compiler-id/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir")

    echo "$job_id $host $host_hash $compiler_path $compiler_hash"
  done > "$out"

  log "Wrote $out"

  # Return value used in command sub
  echo "$out"
}
 | 
| 438 | 
 | 
# Assign the string 'returned' to the caller's variable named by $1, using a
# bash nameref.
#
# Args:
#   $1 - name of the variable to assign
out-param() {
  # The nameref gets an unusual name on purpose: if it shared its name with
  # the target (e.g. a caller passing 'out'), bash would treat it as a
  # circular name reference.
  local -n __out_param_ref=$1

  __out_param_ref=returned
}
 | 
| 444 | 
 | 
# Dispatch: when $0 is named id.sh, run the command-line arguments as a
# function name plus its arguments.  Under any other script name (e.g. when
# sourced by another script) this does nothing.  ${0##*/} strips the directory
# part without word-splitting a path that contains whitespace.
if [[ "${0##*/}" == 'id.sh' ]]; then
  "$@"
fi
 | 
| 448 | 
 |