data_lang/quoting-survey.sh

OILS / data_lang / quoting-survey.sh View on Github | oilshell.org

216 lines, 92 significant

1	#!/usr/bin/env bash
2	#
3	# What do Unix tools do with "bad" filenames?
4	#
5	# - Those with invalid unicode
6	# - Those with terminal escape sequences
7	#
8	# Usage:
9	# data_lang/quoting-survey.sh <function name>
10
# Strict mode: an unset variable is an error, a pipeline fails when any stage
# fails, and the script stops at the first unguarded failing command.
set -o nounset -o pipefail -o errexit
14
15	# We already know:
16	# - bash ${#len} operator is very broken
17
18	# in bash it could be %Q. Or maybe it's %Q everywhere?
19	# in mycpp, we can translate %r calling repr() to qsn::encode()
20
21
22	# Summary:
23	#
24	# These tools do UTF-8 error decoding, but they use a funny shell-like format:
25	#
26	# - GNU coreutils: ls, cp, stat
27	# - GNU tar
28	# - zsh in error message, and in printf %q
29	# - bash and mksh in printf %q only, but not in error message
30	# - GNU findutils: find, but NOT xargs
31	#
32	# Surprise: not grep
33	#
34	# TODO: automate this a bit?
35	# - You can validate their stdout and stderr?
36	# - Look for the literal escape sequence.
37
38
39	# TODO: What about the one that changes the title?
40
# Terminal escape sequences used as "hostile" filename / argument content.
# None of these values is reassigned anywhere in this file, so they are all
# marked readonly, consistent with DIR.
readonly BOLD=$'\x1b[1m'
readonly RESET=$'\x1b[0;0m'

# A mix of valid and invalid utf-8.
# \xce\xbc is the UTF-8 encoding of U+03BC (Greek mu); a lone \xce is a lead
# byte with no continuation byte, i.e. invalid on its own.
# NOTE: the names read backwards relative to the byte layout:
#   char_then_byte = stray byte, then mu
#   byte_then_char = mu, then stray byte
readonly char_then_byte=$'\xce\xce\xbc'
readonly byte_then_char=$'\xce\xbc\xce'

# Scratch directory (relative to the current working directory) in which the
# badly named files are created.
readonly DIR=_tmp/qsn-demo
49
# Print a section header so the output of each surveyed tool is easy to find.
#
# Arguments: $@ - words of the heading; joined with the first character of
#                 IFS (a space by default).
# Outputs:   a blank line, '=====', the heading, and a blank line on stdout.
banner() {
  # printf instead of echo: a heading such as '-n' or '-e' would otherwise be
  # consumed as an echo option and never printed.
  printf '\n=====\n%s\n\n' "$*"
}
56
# Create the scratch directory, enter it, and create the badly named files.
#
# - Make files whose names mix an invalid byte with a utf-8 char
# - Make a file whose name is a terminal escape sequence
#
# Globals: DIR, BOLD, char_then_byte, byte_then_char (read)
# Outputs: pushd prints the directory stack on stdout.
# Note:    there is no matching popd; the working directory stays inside $DIR
#          after this returns, and the callers run their commands there.
setup-bad-files() {
  # All expansions are quoted so the names reach mkdir/pushd/touch as single
  # words, regardless of IFS or glob characters in the values.
  mkdir -p "$DIR"
  pushd "$DIR"
  touch -- "$BOLD" "$char_then_byte" "$byte_then_char"
}
66
# Survey how common tools display filenames and arguments that contain invalid
# utf-8 or terminal escape sequences.  The output is meant to be read by eye.
#
# Globals: BOLD, RESET, char_then_byte, byte_then_char (read)
# Side effects:
#   - calls setup-bad-files, so the working directory changes to the scratch
#     directory and the badly named files exist there
#   - leaves a background 'bash -c' process that sleeps for 2 seconds
# Commands that are expected to fail are followed by '|| true' so that errexit
# does not stop the survey.
test-programs() {
  echo "$BOLD Hello $RESET World"

  # does approximate decoding
  printf '%q\n' "$char_then_byte"
  printf '%q\n' "$byte_then_char"

  setup-bad-files
  # ls doesn't print these by default, that's good

  # Hm this also does approximate decoding
  banner 'ls'
  ls
  echo
  ls --escape
  echo
  # Test out error message
  # It's basically correct, but ugly.  There are too many segments, and
  # there's an unnecessary leading ''.
  # J8 is shorter and more consistent.

  # No file named "$RESET" was created, so this exercises the error path.
  ls -- "$RESET" || true

  # same
  banner 'cp'
  cp -- "$RESET" /tmp || true

  # weird output but it ultimately understands it
  banner 'stat'
  stat *

  # Hm also understands utf-8
  banner 'find'
  find
  # This prints it raw
  #find -print0

  # xargs --verbose messes up!  Makes it bold.  It also understands less
  # unicode.
  if false; then
    banner 'xargs'
    echo * | xargs --verbose -n 1 -- true
  fi

  # prints bytes, no unicode
  banner 'strace'
  strace -- true "$BOLD" "$char_then_byte" "$byte_then_char"

  # it does understand mu
  banner 'ps'
  # Start a short-lived process whose argv contains the bad strings, then look
  # at how ps renders it.
  bash -c "true zzmagic $BOLD $char_then_byte $byte_then_char; sleep 2" &
  ps aux | grep zzmagic
}
120
# Survey how tools, interpreters and shells render a bad filename inside their
# own error messages (for example "cannot open <name>").
#
# Globals: BOLD, char_then_byte, byte_then_char (read)
# Side effects:
#   - calls setup-bad-files, so the working directory changes to the scratch
#     directory
#   - writes out.tar there and removes the badly named files again (rm)
# Commands that are expected to fail are followed by '|| true' so that errexit
# does not stop the survey.  Invocations with "$BOLD" that are commented out
# are the ones noted as printing the raw terminal sequence.
test-errors() {
  # also prints it
  setup-bad-files

  # GOOD
  banner 'tar'
  tar -f "$BOLD" || true
  tar --create "$BOLD" "$byte_then_char" "$char_then_byte" > out.tar
  tar --list < out.tar

  banner 'rm'
  # works
  rm -f -v -- "$BOLD" "$byte_then_char" "$char_then_byte"

  # From here on the files no longer exist, so each tool reports an error
  # that includes the name.

  banner 'grep'
  # BUG
  #grep z "$BOLD"
  grep z "$byte_then_char" || true
  grep z "$char_then_byte" || true

  # python doesn't print it somehow?
  banner 'python'
  # BUG: Python prints terminal sequences
  #python "$BOLD" || true
  python "$byte_then_char" || true
  python "$char_then_byte" || true

  # BUG: Lua prints terminal sequences
  # So coreutils does it right!
  banner 'lua'
  #lua "$BOLD" || true
  lua "$byte_then_char" || true
  lua "$char_then_byte" || true

  # BUG: prints it
  banner 'awk'
  #awk -F "$BOLD" || true
  awk -F "$byte_then_char" || true
  awk -F "$char_then_byte" || true

  # BUG
  banner 'ruby'
  #ruby "$BOLD" || true
  ruby "$byte_then_char" || true
  ruby "$char_then_byte" || true

  # BUG
  banner 'perl'
  #perl "$BOLD" || true
  perl "$byte_then_char" || true
  perl "$char_then_byte" || true

  # BUG
  # But it's a little smarter about mu cases
  banner 'nodejs'
  #nodejs "$BOLD" || true
  nodejs "$byte_then_char" || true
  nodejs "$char_then_byte" || true

  # shells:

  # BUG
  banner 'bash'
  #bash "$BOLD" || true
  bash "$byte_then_char" || true
  bash "$char_then_byte" || true

  banner 'dash'
  #dash "$BOLD" || true

  # zsh actually escapes it!
  banner 'zsh'
  zsh "$BOLD" || true
  zsh "$byte_then_char" || true
  zsh "$char_then_byte" || true

  # BUG
  banner 'mksh'
  #mksh "$BOLD" || true
}
201
# Run the busybox 'ls' and 'find' applets in the scratch directory to see how
# they display the badly named files.
#
# Side effect: calls setup-bad-files, so the working directory changes to the
# scratch directory.
test-busybox() {
  setup-bad-files

  # Observations recorded so far:
  #   ls   - displays ?? -- doesn't understand unicode
  #   find - BUG: prints it literally
  local applet
  for applet in ls find; do
    banner "busybox $applet"
    busybox "$applet"
  done

  #reset
}
215
# Entry point: the command line is "<function name> [args...]"; run it as is.
# With no arguments there is nothing to run and the script ends successfully.
if (( $# > 0 )); then
  "$@"
fi