OILS / spec / unicode.test.sh View on Github | oilshell.org

196 lines, 67 significant
1## oils_failures_allowed: 0
2## compare_shells: bash mksh zsh
3
4#### OSH source code doesn't have to be valid Unicode (like other shells)
5
6# Should YSH be different? It would be nice.
7# We would have to validate all Lit_Chars tokens, and the like.
8#
9# The logical place to put that would be in osh/word_parse.py where we read
10# single and double quoted strings. Although there might be a global lexer
11# hack for Id.Lit_Chars tokens. Would that catch here docs though?
12
13# Test all the lexing contexts
14cat >unicode.sh << 'EOF'
15echo μ 'μ' "μ" $'μ'
16EOF
17
18# Show that all lexer modes accept Unicode sequences.
19#
20# TODO: here docs are a lexing context too, and are not checked yet.
21
22#$SH -n unicode.sh
23
24$SH unicode.sh
25
26# Strip the first byte (0xce) of every mu, leaving a lone 0xbc byte that is not valid UTF-8
27sed 's/\xce//g' unicode.sh > not-unicode.sh
28
29echo --
30$SH not-unicode.sh | od -A n -t x1
31
32## STDOUT:
33μ μ μ μ
34--
35 bc 20 bc 20 bc 20 bc 0a
36## END
37
38
39# dash and ash don't support $''
40
41#### Unicode escapes \u03bc \U000003bc in $'', echo -e, printf
42
43case $SH in dash|ash) exit ;; esac
44
45echo $'\u03bc \U000003bc'
46
47echo -e '\u03bc \U000003bc'
48
49printf '\u03bc \U000003bc\n'
50
51## STDOUT:
52μ μ
53μ μ
54μ μ
55## END
56
57## N-I dash/ash STDOUT:
58## END
59
60#### Max code point U+10ffff can be escaped with $'' printf echo -e
61
62case $SH in dash|ash) exit ;; esac
63
64py-repr() {
65 python2 -c 'import sys; print repr(sys.argv[1])' "$@"
66}
67
68py-repr $'\U0010ffff'
69py-repr $(echo -e '\U0010ffff')
70py-repr $(printf '\U0010ffff')
71
72## STDOUT:
73'\xf4\x8f\xbf\xbf'
74'\xf4\x8f\xbf\xbf'
75'\xf4\x8f\xbf\xbf'
76## END
77
78## N-I dash/ash STDOUT:
79## END
80
81# mksh emits U+FFFD (the Unicode replacement char) for $'' and echo -e, but not for printf
82
83## BUG mksh STDOUT:
84'\xef\xbf\xbd'
85'\xef\xbf\xbd'
86'\xf4\x8f\xbf\xbf'
87## END
88
89#### $'' does NOT check that 0x110000 is too big at parse time
90
91py-repr() {
92 python2 -c 'import sys; print repr(sys.argv[1])' "$@"
93}
94
95py-repr $'\U00110000'
96
97## STDOUT:
98'\xf4\x90\x80\x80'
99## END
100
101## BUG mksh STDOUT:
102'\xef\xbf\xbd'
103## END
104
105#### $'' does not check for surrogate range at parse time
106
107py-repr() {
108 python2 -c 'import sys; print repr(sys.argv[1])' "$@"
109}
110
111py-repr $'\udc00'
112
113py-repr $'\U0000dc00'
114
115## STDOUT:
116'\xed\xb0\x80'
117'\xed\xb0\x80'
118## END
119
120## OK zsh status: 1
121## OK zsh STDOUT:
122## END
123
124
125#### printf / echo -e do NOT check max code point at runtime
126case $SH in mksh) exit ;; esac
127
128py-repr() {
129 python2 -c 'import sys; print repr(sys.argv[1])' "$@"
130}
131
132e="$(echo -e '\U00110000')"
133echo status=$?
134py-repr "$e"
135
136p="$(printf '\U00110000')"
137echo status=$?
138py-repr "$p"
139
140## STDOUT:
141status=0
142'\xf4\x90\x80\x80'
143status=0
144'\xf4\x90\x80\x80'
145## END
146
147## BUG mksh STDOUT:
148## END
149
150#### printf / echo -e do NOT check surrogates at runtime
151case $SH in mksh) exit ;; esac
152
153py-repr() {
154 python2 -c 'import sys; print repr(sys.argv[1])' "$@"
155}
156
157e="$(echo -e '\udc00')"
158echo status=$?
159py-repr "$e"
160
161e="$(echo -e '\U0000dc00')"
162echo status=$?
163py-repr "$e"
164
165p="$(printf '\udc00')"
166echo status=$?
167py-repr "$p"
168
169p="$(printf '\U0000dc00')"
170echo status=$?
171py-repr "$p"
172
173## STDOUT:
174status=0
175'\xed\xb0\x80'
176status=0
177'\xed\xb0\x80'
178status=0
179'\xed\xb0\x80'
180status=0
181'\xed\xb0\x80'
182## END
183
184## BUG zsh STDOUT:
185status=0
186''
187status=0
188''
189status=0
190''
191status=0
192''
193## END
194
195## BUG mksh STDOUT:
196## END