OILS / osh / split.py View on Github | oilshell.org

311 lines, 159 significant
1"""
split.py - Word Splitting

Nice blog post on the complexity/corner cases/differing intuition of splitting
strings:

https://chriszetter.com/blog/2017/10/29/splitting-strings/

python-dev doesn't want to touch it anymore!

Other possible splitters:

- AwkSplitter -- how does this compare to awk -F?
- RegexSplitter
- CsvSplitter
- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
  a pure slice, but neither is IFS splitting because of backslashes.
- Perl?
  - does perl have a split context?

with SPLIT_REGEX = / digit+ / {
  echo $#
  echo $len(argv)
  echo $1 $2
  echo @argv
}
27"""
28
29from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30 state_i)
31from _devbuild.gen.value_asdl import (value, value_e, value_t)
32from mycpp.mylib import log
33from core import pyutil
34from frontend import consts
35from mycpp import mylib
36from mycpp.mylib import tagswitch
37
38from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
39if TYPE_CHECKING:
40 from core.state import Mem
41 from _devbuild.gen.runtime_asdl import span_t
42 Span = Tuple[span_t, int]
43
# IFS value used by SplitContext when $IFS is unset: space, tab, newline.
DEFAULT_IFS = ' \t\n'
45
46
47def _SpansToParts(s, spans):
48 # type: (str, List[Span]) -> List[str]
49 """Helper for SplitForWordEval."""
50 parts = [] # type: List[mylib.BufWriter]
51 start_index = 0
52
53 # If the last span was black, and we get a backslash, set join_next to merge
54 # two black spans.
55 join_next = False
56 last_span_was_black = False
57
58 for span_type, end_index in spans:
59 if span_type == span_e.Black:
60 if len(parts) and join_next:
61 parts[-1].write(s[start_index:end_index])
62 join_next = False
63 else:
64 buf = mylib.BufWriter()
65 buf.write(s[start_index:end_index])
66 parts.append(buf)
67
68 last_span_was_black = True
69
70 elif span_type == span_e.Backslash:
71 if last_span_was_black:
72 join_next = True
73 last_span_was_black = False
74
75 else:
76 last_span_was_black = False
77
78 start_index = end_index
79
80 result = [buf.getvalue() for buf in parts]
81 return result
82
83
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

    echo $x  # uses default shell IFS
    IFS=':' myfunc  # new splitter
    echo $x  # uses default shell IFS again.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        """
        Args:
          mem: used to look up the current value of $IFS
        """
        self.mem = mem
        # Cache: IFS value -> splitter instance.  Each splitter holds the IFS
        # value split into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (str) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: IFS value to split with.  None (the default) means look up $IFS,
               falling back to DEFAULT_IFS when it is unset.

        Returns:
          The cached IfsSplitter for that IFS value, created on first use.

        Raises:
          AssertionError: if $IFS is neither unset nor a string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Figure out what kind of splitter we should instantiate.

            # Partition the IFS characters into whitespace and everything else.
            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: \ not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise. IFS=$' \t' is
            # the same as IFS=$'\t '. But most programs probably don't do that, and
            # everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """For decaying arrays by joining, eg.

        "$@" -> $@. array

        Returns:
          ' ' if $IFS is unset, '' if $IFS is the empty string, otherwise the
          first character of $IFS.

        Raises:
          AssertionError: if $IFS is neither unset nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable, or
        # by a <space> if IFS is unset. If IFS is set to a null string, this is
        # not equivalent to unsetting it; its first character does not exist, so
        # the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars, using the splitter for the current $IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by word evaluation.

        Also used by the explicit shSplit() function.

        Args:
          s: string to split
          ifs: IFS value to use; None means consult $IFS

        Returns:
          The parts of `s`.  Backslash escapes are always honored here.
        """
        sp = self._GetSplitter(ifs=ifs)
        spans = sp.Split(s, True)
        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape, do_split):
        # type: (str, bool, bool) -> List[Span]
        """Compute spans for a line of input.

        Args:
          line: string to split
          allow_escape: passed through to the splitter's Split()
          do_split: if False, split as though IFS='' rather than consulting
                    $IFS

        Returns:
          The spans from the splitter (not the joined parts).
        """
        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self._GetSplitter(ifs=ifs)
        return sp.Split(line, allow_escape)
200
201
202class _BaseSplitter(object):
203
204 def __init__(self, escape_chars):
205 # type: (str) -> None
206 self.escape_chars = escape_chars + '\\' # Backslash is always escaped
207
208 def Escape(self, s):
209 # type: (str) -> str
210 # Note the characters here are DYNAMIC, unlike other usages of
211 # BackslashEscape().
212 return pyutil.BackslashEscape(s, self.escape_chars)
213
214
class IfsSplitter(_BaseSplitter):
    """Split a string when IFS has non-whitespace characters."""

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_whitespace: the IFS characters that are whitespace
          ifs_other: the remaining IFS characters
        """
        # Every IFS character is escaped by Escape().
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Compute the spans of `s`, using a state machine over its bytes.

        Args:
          s: string to split
          allow_escape: False for read -r, this means \\ doesn't do anything.

        Returns:
          List of (runtime.span, end_index) pairs

        Raises:
          AssertionError: on an invalid state transition or unknown action.

        TODO: This should be (frag, do_split) pairs, to avoid IFS='\\'
        double-escaping issue.
        """
        ws_chars = self.ifs_whitespace
        other_chars = self.ifs_other

        n = len(s)
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        if n == 0:
            return spans  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the input"
        # This can't really be handled by the state machine.

        i = 0
        while i < n and mylib.ByteInSet(mylib.ByteAt(s, i), ws_chars):
            i += 1

        # Append an ignored span.
        if i != 0:
            spans.append((span_e.Delim, i))

        # String is ONLY whitespace. We want to skip the last span after the
        # while loop.
        if i == n:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            # Classify the current byte, or the end of the string.
            if i < n:
                byte = mylib.ByteAt(s, i)

                if mylib.ByteInSet(byte, ws_chars):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, other_chars):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            elif i == n:
                ch = char_kind_i.Sentinel  # one more iteration for the end of string

            else:
                raise AssertionError()  # shouldn't happen

            new_state, action = consts.IfsEdge(state, ch)
            if new_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', i, byte, ch,
                    state, new_state, action)

            # Every emitted span ends at the current index i.
            if action == emit_i.Part:
                spans.append((span_e.Black, i))
            elif action == emit_i.Delim:
                spans.append((span_e.Delim, i))  # ignored delimiter
            elif action == emit_i.Empty:
                spans.append((span_e.Delim, i))  # ignored delimiter
                # EMPTY part that is NOT ignored
                spans.append((span_e.Black, i))
            elif action == emit_i.Escape:
                spans.append((span_e.Backslash, i))  # the backslash itself
            elif action == emit_i.Nothing:
                pass
            else:
                raise AssertionError()

            state = new_state
            i += 1

        return spans