#!/usr/bin/env bash
#
# What do Unix tools do with "bad" filenames?
#
# - Those with invalid unicode
# - Those with terminal escape sequences
#
# Usage:
#   data_lang/quoting-survey.sh <function name>

# Strict mode: unset variables are errors, a pipeline fails if any stage
# fails, and any unguarded failing command aborts the script.
set -o nounset
set -o pipefail
set -o errexit

# We already know:
# - bash ${#len} operator is very broken

# in bash it could be %Q.  Or maybe it's %Q everywhere?
# in mycpp, we can translate %r calling repr() to qsn::encode()


# Summary:
#
# These tools do UTF-8 error decoding, but they use a funny shell-like format:
#
# - GNU coreutils: ls, cp, stat
# - GNU tar
# - zsh in error messages, and in printf %q
# - bash and mksh in printf %q only, but not in error messages
# - GNU findutils: find, but NOT xargs
#
# Surprise: not grep
#
# TODO: automate this a bit?
# - You can validate their stdout and stderr?
#   - Look for the literal escape sequence.


# TODO: What about the one that changes the title?

# Terminal escape sequences (ESC [ ... m) used both as printed text and as
# hostile file names / arguments.
readonly BOLD=$'\x1b[1m'
readonly RESET=$'\x1b[0;0m'

# A mix of valid and invalid utf-8.  \xce\xbc is the UTF-8 encoding of mu; a
# lone \xce is a lead byte with no continuation byte, i.e. invalid.
# The values match the names: "char" is the valid mu, "byte" is the stray \xce.
readonly char_then_byte=$'\xce\xbc\xce'
readonly byte_then_char=$'\xce\xce\xbc'

# Scratch directory (relative to the current directory) for the bad files.
readonly DIR=_tmp/qsn-demo

# Print a section header to stdout: a blank line, a '=====' rule, the
# arguments joined by single spaces, then another blank line.
#
# Arguments: $@ - words of the title
banner() {
  # printf rather than echo, so a title such as '-n' or '-e' is printed
  # literally instead of being taken as an echo option.
  local IFS=' '
  printf '\n=====\n%s\n\n' "$*"
}

# Create $DIR, enter it, and create three empty files with hostile names:
#
# - one whose name is a terminal escape sequence ($BOLD)
# - two whose names mix a valid utf-8 char with an invalid byte
#
# Globals: DIR, BOLD, char_then_byte, byte_then_char (read)
# Outputs: pushd prints the directory stack to stdout
# Note:    there is no popd, so the caller is left inside $DIR; the test-*
#          functions run their tools there.
setup-bad-files() {
  # Quote every expansion so the names reach the tools byte-for-byte, with no
  # word splitting or globbing.
  mkdir -p "$DIR"
  pushd "$DIR"
  touch -- "$BOLD" "$char_then_byte" "$byte_then_char"
}

# Show how several tools display the bad file names and arguments:
# printf %q, ls, cp, stat, find, strace and ps.
#
# Calls setup-bad-files, so it leaves the current directory at $DIR.
# The script runs under errexit, so any command not followed by '|| true'
# aborts the run if it fails.
test-programs() {
  echo "$BOLD Hello $RESET World"

  # does approximate decoding
  printf '%q\n' "$char_then_byte"
  printf '%q\n' "$byte_then_char"

  setup-bad-files
  # ls doesn't print these by default, that's good

  # Hm this also does approximate decoding
  banner 'ls'
  ls
  echo
  ls --escape
  echo
  # Test out error message
  # It's basically correct, but ugly.  There are too many segments, and
  # there's an unnecessary leading ''.
  # J8 is shorter and more consistent.

  ls -- "$RESET" || true

  # same
  banner 'cp'
  cp -- "$RESET" /tmp || true

  # weird output but it ultimately understands it
  banner 'stat'
  # '--' so that no file name can be parsed as an option
  stat -- *

  # Hm also understands utf-8
  banner 'find'
  find
  # This prints it raw
  #find -print0

  # xargs --verbose messes up!  Makes it bold.  It also understands less
  # unicode.
  if false; then
    banner 'xargs'
    echo * | xargs --verbose -n 1 -- true
  fi

  # prints bytes, no unicode
  banner 'strace'
  strace -- true "$BOLD" "$char_then_byte" "$byte_then_char"

  # it does understand mu
  banner 'ps'
  # Background process whose command line contains the bad bytes; it sleeps
  # so that it is still alive when ps runs.  'zzmagic' is the marker to grep.
  bash -c "true zzmagic $BOLD $char_then_byte $byte_then_char; sleep 2" &
  ps aux | grep zzmagic
}

# Show how tar, rm, grep, several language interpreters and several shells
# print the bad names, mostly in their error messages.
#
# Calls setup-bad-files, so it leaves the current directory at $DIR.
# Invocations expected to fail are followed by '|| true' because the script
# runs under errexit.  Lines marked BUG print the terminal escape sequence
# raw; those invocations are left commented out.
test-errors() {
  # also prints it
  setup-bad-files

  # GOOD
  banner 'tar'
  tar -f "$BOLD" || true
  tar --create "$BOLD" "$byte_then_char" "$char_then_byte" > out.tar
  tar --list < out.tar

  banner 'rm'
  # works
  # The three files are removed here, so the tools below are handed names
  # that no longer exist.
  rm -f -v -- "$BOLD" "$byte_then_char" "$char_then_byte"

  banner 'grep'
  # BUG
  #grep z "$BOLD"
  grep z "$byte_then_char" || true
  grep z "$char_then_byte" || true

  # python doesn't print it somehow?
  banner 'python'
  # BUG: Python prints terminal sequences
  #python "$BOLD" || true
  python "$byte_then_char" || true
  python "$char_then_byte" || true

  # BUG: Lua prints terminal sequences
  # So coreutils does it right!
  banner 'lua'
  #lua "$BOLD" || true
  lua "$byte_then_char" || true
  lua "$char_then_byte" || true

  # BUG: prints it
  banner 'awk'
  #awk -F "$BOLD" || true
  awk -F "$byte_then_char" || true
  awk -F "$char_then_byte" || true

  # BUG
  banner 'ruby'
  #ruby "$BOLD" || true
  ruby "$byte_then_char" || true
  ruby "$char_then_byte" || true

  # BUG
  banner 'perl'
  #perl "$BOLD" || true
  perl "$byte_then_char" || true
  perl "$char_then_byte" || true

  # BUG
  # But it's a little smarter about mu cases
  banner 'nodejs'
  #nodejs "$BOLD" || true
  nodejs "$byte_then_char" || true
  nodejs "$char_then_byte" || true

  # shells:

  # BUG
  banner 'bash'
  #bash "$BOLD" || true
  bash "$byte_then_char" || true
  bash "$char_then_byte" || true

  banner 'dash'
  #dash "$BOLD" || true

  # zsh actually escapes it!
  banner 'zsh'
  zsh "$BOLD" || true
  zsh "$byte_then_char" || true
  zsh "$char_then_byte" || true

  # BUG
  banner 'mksh'
  #mksh "$BOLD" || true
}

# Show how the busybox applets 'ls' and 'find' display the bad file names.
#
# Calls setup-bad-files, so it leaves the current directory at $DIR.
test-busybox() {
  setup-bad-files

  # displays ?? -- doesn't understand unicode
  banner 'busybox ls'
  busybox ls

  # BUG: prints it literally
  banner 'busybox find'
  busybox find

  #reset
}

# Run the function named by the first argument, passing along the rest.
"$@"