osh/split.py

OILS / osh / split.py View on Github | oilshell.org

311 lines, 159 significant

1	"""
2	split.py - Word Splitting
3
4	Nice blog post on the complexity/corner cases/differing intuition of splitting
5	strings:
6
7	https://chriszetter.com/blog/2017/10/29/splitting-strings/
8
9	python-dev doesn't want to touch it anymore!
10
11	Other possible splitters:
12
13	- AwkSplitter -- how does this compare to awk -F?
14	- RegexSplitter
15	- CsvSplitter
16	- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
17	a pure slice, but neither is IFS splitting because of backslashes.
18	- Perl?
19	- does perl have a split context?
20
21	with SPLIT_REGEX = / digit+ / {
22	echo $#
23	echo $len(argv)
24	echo $1 $2
25	echo @argv
26	}
27	"""
28
29	from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30	state_i)
31	from _devbuild.gen.value_asdl import (value, value_e, value_t)
32	from mycpp.mylib import log
33	from core import pyutil
34	from frontend import consts
35	from mycpp import mylib
36	from mycpp.mylib import tagswitch
37
38	from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
39	if TYPE_CHECKING:
40	from core.state import Mem
41	from _devbuild.gen.runtime_asdl import span_t
42	Span = Tuple[span_t, int]
43
# Separator characters used when the IFS variable is undefined: space, tab
# and newline.
DEFAULT_IFS = ' \t\n'
45
46
47	def _SpansToParts(s, spans):
48	# type: (str, List[Span]) -> List[str]
49	"""Helper for SplitForWordEval."""
50	parts = [] # type: List[mylib.BufWriter]
51	start_index = 0
52
53	# If the last span was black, and we get a backslash, set join_next to merge
54	# two black spans.
55	join_next = False
56	last_span_was_black = False
57
58	for span_type, end_index in spans:
59	if span_type == span_e.Black:
60	if len(parts) and join_next:
61	parts[-1].write(s[start_index:end_index])
62	join_next = False
63	else:
64	buf = mylib.BufWriter()
65	buf.write(s[start_index:end_index])
66	parts.append(buf)
67
68	last_span_was_black = True
69
70	elif span_type == span_e.Backslash:
71	if last_span_was_black:
72	join_next = True
73	last_span_was_black = False
74
75	else:
76	last_span_was_black = False
77
78	start_index = end_index
79
80	result = [buf.getvalue() for buf in parts]
81	return result
82
83
class SplitContext(object):
    """Front end for field splitting, driven by the value of IFS.

    The IFS variable is looked up each time it's needed, and one splitter is
    cached per distinct IFS string, so different IFS values can be in effect
    at different times.  For example:

        echo $x          # uses default shell IFS
        IFS=':' myfunc   # new splitter
        echo $x          # uses default shell IFS again.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache of IFS value -> splitter instance.  Each splitter holds the
        # IFS characters partitioned into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (str) -> IfsSplitter
        """Return the splitter for an IFS string, creating it if necessary.

        Args:
          ifs: The IFS string to use.  If None (the default; note that the
            type comment only says str), the IFS variable is read from
            self.mem, and DEFAULT_IFS is used when it's undefined.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        if ifs is None:
            # The variable is looked up with dynamic scope.
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        cached = self.splitters.get(ifs)
        if cached is not None:
            return cached

        # Cache miss.  Partition the IFS characters into whitespace (space,
        # tab, newline -- happens to be the same set as DEFAULT_IFS) and
        # everything else.
        white = mylib.BufWriter()
        other = mylib.BufWriter()
        for ch in ifs:
            if ch in ' \t\n':
                white.write(ch)
            else:
                # TODO: \ not supported
                other.write(ch)

        splitter = IfsSplitter(white.getvalue(), other.getvalue())

        # NOTE: The cache key is the literal IFS string, so IFS=$' \t' and
        # IFS=$'\t ' get separate but equivalent splitters.  Most programs
        # probably don't do that, and everything should work in any case.
        self.splitters[ifs] = splitter
        return splitter

    def GetJoinChar(self):
        # type: () -> str
        """Return the string used to join array elements into a single field.

        This is ' ' if IFS is undefined, '' if IFS is the empty string, and
        otherwise the first character of IFS.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable,
        # or by a <space> if IFS is unset. If IFS is set to a null string, this
        # is not equivalent to unsetting it; its first character does not
        # exist, so the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                ifs = val.s
                if len(ifs) == 0:
                    return ''
                return ifs[0]
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape `s` using the splitter for the current IFS."""
        return self._GetSplitter().Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split `s` into a list of fields, as used by word evaluation.

        Also used by the explicit shSplit() function.

        Args:
          s: string to split
          ifs: IFS string to split with; None means consult the IFS variable.
        """
        splitter = self._GetSplitter(ifs=ifs)
        # True: backslash escapes are allowed.
        spans = splitter.Split(s, True)
        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape, do_split):
        # type: (str, bool, bool) -> List[Span]
        """Compute the spans of `line`; unlike SplitForWordEval, returns spans.

        Args:
          line: string to split
          allow_escape: passed through to the splitter's Split()
          do_split: if False, IFS='' behavior is forced, no matter what the
            IFS variable is set to.
        """
        if do_split:
            # None: use the default splitter, consulting $IFS
            ifs = None  # type: Optional[str]
        else:
            # '': forces IFS='' behavior
            ifs = ''

        splitter = self._GetSplitter(ifs=ifs)
        return splitter.Split(line, allow_escape)
200
201
202	class _BaseSplitter(object):
203
204	def __init__(self, escape_chars):
205	# type: (str) -> None
206	self.escape_chars = escape_chars + '\\' # Backslash is always escaped
207
208	def Escape(self, s):
209	# type: (str) -> str
210	# Note the characters here are DYNAMIC, unlike other usages of
211	# BackslashEscape().
212	return pyutil.BackslashEscape(s, self.escape_chars)
213
214
class IfsSplitter(_BaseSplitter):
    """Splitter configured with IFS whitespace and IFS non-whitespace chars.

    Both sets of characters are delimiters; all of them (plus backslash, via
    the base class) are the characters that Escape() escapes.
    """

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        all_ifs_chars = ifs_whitespace + ifs_other
        _BaseSplitter.__init__(self, all_ifs_chars)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Compute the spans of `s` by running the IFS state machine.

        Args:
          s: string to split
          allow_escape: False for read -r; then a backslash is not special and
            is classified like any other non-IFS byte.

        Returns:
          List of (runtime.span, end_index) pairs

        Raises:
          AssertionError: on an invalid state transition or unknown action.

        TODO: This should be (frag, do_split) pairs, to avoid the
        double-escaping issue when IFS contains a backslash.
        """
        white = self.ifs_whitespace
        gray = self.ifs_other

        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        n = len(s)
        if n == 0:
            return spans  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the
        # input".  This can't really be handled by the state machine.
        pos = 0
        while pos < n and mylib.ByteInSet(mylib.ByteAt(s, pos), white):
            pos += 1

        # The skipped prefix becomes an ignored span.
        if pos != 0:
            spans.append((span_e.Delim, pos))

        # String is ONLY whitespace.  Return now, so the loop below doesn't
        # add a final span.
        if pos == n:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            if pos > n:
                raise AssertionError()  # shouldn't happen

            if pos == n:
                # One more iteration for the end of string
                ch = char_kind_i.Sentinel
            else:
                byte = mylib.ByteAt(s, pos)

                # IFS membership is tested before backslash.
                if mylib.ByteInSet(byte, white):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, gray):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            next_state, emit = consts.IfsEdge(state, ch)
            if next_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', pos, byte,
                    ch, state, next_state, emit)

            # Every emitted span ends at the current position.
            if emit != emit_i.Nothing:
                if emit == emit_i.Part:
                    spans.append((span_e.Black, pos))
                elif emit == emit_i.Delim:
                    spans.append((span_e.Delim, pos))  # ignored delimiter
                elif emit == emit_i.Empty:
                    spans.append((span_e.Delim, pos))  # ignored delimiter
                    # EMPTY part that is NOT ignored
                    spans.append((span_e.Black, pos))
                elif emit == emit_i.Escape:
                    spans.append((span_e.Backslash, pos))  # the backslash
                else:
                    raise AssertionError()

            state = next_state
            pos += 1

        return spans