1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Usage:
|
4 | # demo/04-unicode.sh <function name>
|
5 | #
|
6 | # TODO: Test what happens if you read binary data into a $(command sub)
|
7 | # - internal NUL
|
8 | # - invalid utf-8 sequence
|
9 | #
|
10 | # It would be nice to move some of this into test/gold? It depends on the
|
11 | # locale.
|
12 |
|
# Strict mode: treat expansion of an unset variable as an error, make a
# pipeline fail when any of its stages fails, and exit on an unhandled
# non-zero status.
set -o nounset -o pipefail -o errexit
|
16 |
|
17 | # https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
|
18 | #
|
19 | # See doc/unicode.txt.
|
20 |
|
# Print the Greek small letter mu (U+03BC) wrapped in square brackets, encoded
# as UTF-8 and followed by a newline.
#
# The bytes 0xCE 0xBC are the UTF-8 encoding of U+03BC.  They are written with
# the printf builtin instead of a Python 2 one-liner: `print u"..."` is a
# SyntaxError under Python 3, and the call fails outright when no `python`
# binary is installed.  The bytes written to stdout are the same.
unicode-char() {
  printf '[\xce\xbc]\n'
}
|
24 |
|
25 | # http://stackoverflow.com/questions/602912/how-do-you-echo-a-4-digit-unicode-character-in-bash
|
# Emit U+2620 three different ways so the results can be compared by eye.
echo-char() {
  local utf8_bytes="\xE2\x98\xA0"
  local code_point='\u2620'

  # Hex escapes for the three bytes that encode U+2620 in UTF-8.
  echo -e "$utf8_bytes"

  # Earlier experiment, left disabled:
  #echo -e "\x03\xbc"

  # Woah, the bash builtin understands \uXXXX!  Interesting.  It is not
  # documented in "help echo" though.
  echo -e "$code_point"

  # The external GNU echo does not have it.
  /bin/echo -e "$code_point"
}
|
38 |
|
# Print "[μ]" where μ is a raw UTF-8 character typed directly into this source
# file, outside any quotes.
#
# Use vim to put utf-8 in this source file:
# 1. i to enter Insert mode
# 2. Ctrl-V
# 3. u
# 4. 03bc -- 4 digits of hex
raw-char() {
  # The brackets are backslash-escaped because a bare [μ] is a glob bracket
  # expression: if the current directory held a file whose name matched it,
  # the shell would print that file name instead of the literal text.  The μ
  # itself stays unquoted on purpose.
  echo \[μ\]
}
|
47 |
|
# Print the same bracketed mu through each of the shell's quoting forms, one
# result per line, followed by a locale-translated string.
quoted-chars() {
  local -a samples=(
    '[μ]'        # single-quoted raw character
    "[μ]"        # double-quoted raw character
    $'[\u03bc]'  # C-escaped string

    # Locale translation.  Not implementing this:
    # https://www.gnu.org/software/bash/manual/html_node/Locale-Translation.html
    $"hello"
  )

  printf '%s\n' "${samples[@]}"
}
|
57 |
|
# Show how the current locale affects bash's view of a multi-byte string:
# prints the locale settings, then the string produced by unicode-char, its
# length, and three substrings of it; finally the same substrings for the
# string produced by raw-char.  Everything goes to stdout.
test-unicode() {
  locale  # displays state
  echo
  # Fall back to an empty string: a bare $LANG aborts the script under
  # `set -o nounset` when LANG is not set in the environment.
  echo "${LANG-}"

  unicode-char

  # Declared separately from the assignment so that a failing command
  # substitution is not masked by the exit status of `local`.
  local u
  u=$(unicode-char)
  echo "$u"

  # This changes bash behavior!

  #LANG=C
  echo "${#u}"  # three chars (in a UTF-8 locale)

  # OK bash respects utf-8 when doing string slicing.  Does it have its own
  # unicode support, or does it use libc?
  # The expansions are quoted so that "[μ]" is never treated as a glob pattern.
  echo "${u:0}" "${u:1}" "${u:2}"

  u=$(raw-char)
  echo "${u:0}" "${u:1}" "${u:2}"
}
|
80 |
|
# Feed JSON documents containing \uXXXX escapes into Python's json module and
# print what it decodes them to.  The print-statement syntax in the one-liners
# handed to `python` means that command is expected to be a Python 2
# interpreter; only the last decoder runs under python3.
json() {
  # Producers: each prints a JSON string literal.
  local emit_mu='print "\"\u03bc\""'
  local emit_nul='print "\"[\u0000]\""'

  # Consumers: each decodes stdin as JSON.
  local show_value='
import sys, json
print json.loads(sys.stdin.read())
'
  local show_repr_py2='
import sys, json
print repr(json.loads(sys.stdin.read()))
'
  local show_repr_py3='
import sys, json
print(repr(json.loads(sys.stdin.read())))
'

  python -c "$emit_mu" | python -c "$show_value"

  # The \u0000 code point seems to be representable
  python -c "$emit_nul" | python -c "$show_repr_py2"

  # Works in python3 too.
  python -c "$emit_nul" | python3 -c "$show_repr_py3"
}
|
98 |
|
99 | # Right now it's split into (Lit_Other '\xce') and (Lit_Other '\xbc'). This is
|
100 | # fine for most purposes, although we could probably simplify this.
|
# Hand a one-line program containing a raw UTF-8 character to bin/osh twice:
# first with -n, then without it.  bin/osh is a relative path, so this must be
# run from the directory that contains bin/.
osh-literal() {
  local snippet='echo [μ]'

  # NOTE(review): -n presumably makes osh show how it parsed the snippet
  # instead of running it -- confirm against the osh flag documentation.
  bin/osh -n -c "$snippet"

  # This works fine
  bin/osh -c "$snippet"
}
|
106 |
|
107 | "$@"
|