| 1 | #!/usr/bin/env bash
 | 
| 2 | #
 | 
| 3 | # Usage:
 | 
| 4 | #   demo/04-unicode.sh <function name>
 | 
| 5 | # 
 | 
| 6 | # TODO: Test what happens if you read binary data into a $(command sub)
 | 
| 7 | # - internal NUL
 | 
| 8 | # - invalid utf-8 sequence
 | 
| 9 | #
 | 
| 10 | # It would be nice to move some of this into test/gold?  It depends on the
 | 
| 11 | # locale.
 | 
| 12 | 
 | 
# Strict mode: abort on unset variables, on a failing pipeline stage, and on
# any unhandled non-zero exit status.
set -o nounset -o pipefail -o errexit
 | 
| 16 | 
 | 
| 17 | # https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
 | 
| 18 | #
 | 
| 19 | # See doc/unicode.txt.
 | 
| 20 | 
 | 
# Print "[μ]" (U+03BC GREEK SMALL LETTER MU in brackets) encoded as UTF-8,
# followed by a newline.
#
# The two bytes of the character (0xCE 0xBC) are spelled out and emitted by
# the printf builtin, so the output is the same in every locale and no
# Python 2 interpreter is needed.
unicode-char() {
  printf '[\xce\xbc]\n'
}
 | 
| 24 | 
 | 
# Ways of printing U+2620 with echo.  See
# http://stackoverflow.com/questions/602912/how-do-you-echo-a-4-digit-unicode-character-in-bash
echo-char() {
  local code_point_escape='\u2620'

  # The UTF-8 bytes, written out as hex escapes.
  echo -e "\xE2\x98\xA0"

  #echo -e "\x03\xbc"

  # Woah bash has this!  Interesting.  Not documented in "help echo" though.
  echo -e "$code_point_escape"

  # GNU echo does not have it.
  /bin/echo -e "$code_point_escape"
}
 | 
| 38 | 
 | 
# Print "[μ]" from a literal, unquoted UTF-8 character in this source file.
#
# Use vim to put utf-8 in this source file:
#   1. i to enter Insert mode
#   2. Ctrl-V
#   3. u
#   4. 03bc  -- 4 digits of hex
raw-char() {
  # The brackets are escaped because an unquoted [μ] is a glob pattern: if a
  # file named μ exists in the current directory, the shell replaces the word
  # with that file name (and nullglob/failglob change the no-match case too).
  # The μ itself stays raw and unquoted.
  echo \[μ\]
}
 | 
| 47 | 
 | 
# Print the same kind of text through each of bash's quoting forms, one form
# per output line.
quoted-chars() {
  local -a samples=(
    '[μ]'        # single-quoted
    "[μ]"        # double-quoted
    $'[\u03bc]'  # C-escaped string

    # Not implementing this
    # https://www.gnu.org/software/bash/manual/html_node/Locale-Translation.html
    $"hello"
  )

  printf '%s\n' "${samples[@]}"
}
 | 
| 57 | 
 | 
# Show how bash treats a multi-byte UTF-8 character under the current locale:
# prints the locale settings, then the length and substrings of the string
# produced by unicode-char, then the substrings of the one from raw-char.
test-unicode() {
  locale  # displays state
  echo
  # LANG may be unset; default to empty so 'set -o nounset' does not abort
  # the demo on this line.
  echo "${LANG:-}"

  unicode-char

  # Declared separately from the assignment so that a failing command
  # substitution is not masked by the exit status of 'local'.
  local u
  u=$(unicode-char)
  # Quoted: the value is "[μ]", which unquoted would be expanded as a glob
  # pattern and could be replaced by a matching file name.
  echo "$u"

  # This changes bash behavior!

  #LANG=C
  echo "${#u}"  # three chars in a UTF-8 locale

  # OK bash respect utf-8 when doing string slicing.  Does it have its own
  # unicode support, or does it use libc?
  echo "${u:0}" "${u:1}" "${u:2}"

  u=$(raw-char)
  echo "${u:0}" "${u:1}" "${u:2}"
}
 | 
| 80 | 
 | 
# Show how JSON \uXXXX escapes (including \u0000) decode with Python's json
# module.
#
# Each JSON document is written with printf '%s', which emits the backslash
# escape literally, so the input text does not depend on which Python is
# installed.  The decoders use print(...) with a single argument, which both
# Python 2 and Python 3 accept.
json() {
  printf '%s\n' '"\u03bc"' | python -c '
import sys, json
print(json.loads(sys.stdin.read()))
'

  # \u0000 code point seems to be representable
  printf '%s\n' '"[\u0000]"' | python -c '
import sys, json
print(repr(json.loads(sys.stdin.read())))
'
  # Works in python3 too.
  printf '%s\n' '"[\u0000]"' | python3 -c '
import sys, json
print(repr(json.loads(sys.stdin.read())))
'
}
 | 
| 98 | 
 | 
# Right now the character is split into (Lit_Other '\xce') and
# (Lit_Other '\xbc').  This is fine for most purposes, although we could
# probably simplify this.
osh-literal() {
  local snippet='echo [μ]'

  bin/osh -n -c "$snippet"
  # This works fine
  bin/osh -c "$snippet"
}
 | 
| 106 | 
 | 
# Dispatch: run the command-line arguments as a command, so the first argument
# names the function to call and the remaining ones become its arguments.
"${@}"
 |