#!/usr/bin/env bash
#
# Word-frequency benchmark: counts whitespace-separated words from stdin.
# Usage: word_freq.sh [iters] < input.txt

set -o noglob  # for unquoted $text splitting — we WANT IFS word splitting
               # below, but must not let words like '*' glob-expand
 | 
| 4 | 
 | 
#######################################
# Split stdin into whitespace-separated words, one per line.
# Inputs:  all of stdin (read to EOF in one shot)
# Outputs: one word per line on stdout
#######################################
tokenize() {
  local text

  # Read everything at once; -d '' reads until NUL/EOF, -r keeps backslashes.
  read -r -d '' text

  # Deliberately unquoted: relies on IFS word splitting (script sets noglob
  # at the top so glob characters in the input don't expand).
  local word
  for word in $text; do
    # printf, not echo: echo would swallow words like "-n" or "-e" as flags.
    printf '%s\n' "$word"
  done
}
 | 
| 13 | 
 | 
#######################################
# Count word frequencies in stdin, repeating the count `iters` times
# (a benchmark knob), then print "count word" per line.
# Arguments: $1 - iteration count (default 100)
# Inputs:    all of stdin
# Outputs:   one "count word" line per distinct word (unspecified order)
#######################################
main() {
  local iters=${1:-100}

  # Read everything at once; -d '' reads until NUL/EOF.
  local text
  read -r -d '' text

  # Word -> count map.  declare inside a function scopes it locally.
  declare -A words

  # do it a bunch of times
  local i word old
  for (( i = 0; i < iters; ++i )); do

    # Relies on unquoted IFS splitting.  Difference with Python: Python will
    # give you \, but IFS splitting won't.
    for word in $text; do

      # Read-modify-write through a temp var: an unset entry arithmetic-
      # evaluates to 0, so the first occurrence correctly yields 1.
      old=${words["$word"]}
      words["$word"]=$((old + 1))

      # BUG in bash, see spec/assoc case #37
      #(( words["$word"] += 1 ))
      #(( words[\$word] += 1 ))
    done
  done

  # note: we can sort the output in the benchmark and assert that it's the same?
  # printf, not echo, so a word like "-n" isn't eaten as an echo flag.
  for word in "${!words[@]}"; do
    printf '%s %s\n' "${words["$word"]}" "$word"
  done
}
 | 
| 45 | 
 | 
# Entry point: pass script args through; input text arrives on stdin.
main "$@"
#tokenize "$@"
 |