| 1 | """
 | 
| 2 | split.py - Word Splitting
 | 
| 3 | 
 | 
| 4 | Nice blog post on the complexity/corner cases/differing intuition of splitting
 | 
| 5 | strings:
 | 
| 6 | 
 | 
| 7 | https://chriszetter.com/blog/2017/10/29/splitting-strings/
 | 
| 8 | 
 | 
| 9 | python-dev doesn't want to touch it anymore!
 | 
| 10 | 
 | 
| 11 | Other possible splitters:
 | 
| 12 | 
 | 
| 13 | - AwkSplitter -- how does this compare to awk -F?
 | 
| 14 | - RegexSplitter
 | 
| 15 | - CsvSplitter
 | 
| 16 | - TSV2Splitter -- Data is transformed because of # \u0065 in JSON.  So it's not
 | 
| 17 |   a pure slice, but neither is IFS splitting because of backslashes.
 | 
| 18 | - Perl?
 | 
| 19 |   - does Perl have a split context?
 | 
| 20 | 
 | 
| 21 | with SPLIT_REGEX = / digit+ / {
 | 
| 22 |   echo $#
 | 
| 23 |   echo $len(argv)
 | 
| 24 |   echo $1 $2
 | 
| 25 |   echo @argv
 | 
| 26 | }
 | 
| 27 | """
 | 
| 28 | 
 | 
| 29 | from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
 | 
| 30 |                                         state_i)
 | 
| 31 | from _devbuild.gen.value_asdl import (value, value_e, value_t)
 | 
| 32 | from mycpp.mylib import log
 | 
| 33 | from core import pyutil
 | 
| 34 | from frontend import consts
 | 
| 35 | from mycpp import mylib
 | 
| 36 | from mycpp.mylib import tagswitch
 | 
| 37 | 
 | 
| 38 | from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
 | 
| 39 | if TYPE_CHECKING:
 | 
| 40 |     from core.state import Mem
 | 
| 41 |     from _devbuild.gen.runtime_asdl import span_t
 | 
| 42 |     Span = Tuple[span_t, int]
 | 
| 43 | 
 | 
| 44 | DEFAULT_IFS = ' \t\n'
 | 
| 45 | 
 | 
| 46 | 
 | 
def _SpansToParts(s, spans):
    # type: (str, List[Span]) -> List[str]
    """Turn the spans computed for `s` into the list of field strings.

    Helper for SplitForWordEval.

    Each span is a (span type, end index) pair and covers the text from the
    end of the previous span up to (not including) its own end index.

    - A Black span contributes its text to the output.
    - A Backslash span contributes no text.  When it directly follows a Black
      span, the next Black span is appended to the current field instead of
      starting a new one.
    - Any other span type is skipped.
    """
    fields = []  # type: List[mylib.BufWriter]
    prev_end = 0

    # glue_next is armed by a Backslash span that directly follows a Black
    # span; it makes the next Black span extend the last field.
    glue_next = False
    prev_was_black = False

    for kind, span_end in spans:
        if kind == span_e.Black:
            text = s[prev_end:span_end]
            if glue_next and len(fields):
                fields[-1].write(text)
                glue_next = False
            else:
                writer = mylib.BufWriter()
                writer.write(text)
                fields.append(writer)
            prev_was_black = True
        else:
            # Only a backslash right after black text joins two black spans.
            if kind == span_e.Backslash and prev_was_black:
                glue_next = True
            prev_was_black = False

        prev_end = span_end

    result = []  # type: List[str]
    for field in fields:
        result.append(field.getvalue())
    return result
 | 
| 82 | 
 | 
| 83 | 
 | 
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

    echo $x  # uses default shell IFS
    IFS=':' myfunc  # new splitter
    echo $x  # uses default shell IFS again.

    The IFS variable is looked up (with dynamic scope) every time a splitter
    is requested, and one IfsSplitter is cached per distinct IFS string.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        """
        Args:
          mem: variable store, queried for the current value of IFS.
        """
        self.mem = mem
        # Cache of splitters keyed by the raw IFS string.  Each splitter is
        # built from that string split into (ifs_whitespace, ifs_other).
        self.splitters = {
        }  # type: Dict[str, IfsSplitter]  # aka IFS value -> splitter instance

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: IFS string to split on.  When None, the IFS variable is looked
               up with dynamic scope; an undefined IFS means DEFAULT_IFS.

        Returns:
          The IfsSplitter for that IFS string, created on first use and then
          reused from self.splitters.

        Raises:
          AssertionError: if the IFS variable is neither undefined nor a
          string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            # UP_val keeps the un-narrowed reference so that `val` can be
            # re-bound to the concrete value.Str type inside the Str case.
            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Cache miss: partition the IFS characters into the whitespace set
            # and everything else, which is what IfsSplitter takes.

            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: \ not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.  IFS=$' \t' is
            # the same as IFS=$'\t '.  But most programs probably don't do that, and
            # everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """Return the separator used when an array decays to a single string.

        Returns:
          ' ' if IFS is undefined, the first character of IFS if it is a
          non-empty string, and '' if IFS is the empty string.

        Raises:
          AssertionError: if the IFS variable is neither undefined nor a
          string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable, or
        # by a <space> if IFS is unset. If IFS is set to a null string, this is
        # not equivalent to unsetting it; its first character does not exist, so
        # the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        # Same narrowing pattern as in _GetSplitter: keep the general
        # reference, then cast inside the matching case.
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        # Not reached in Python: every case above returns or raises.  Per its
        # message, this exists to satisfy -Wreturn-type in the C++ build.
        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars.

        Delegates to the splitter for the current value of the IFS variable,
        so the set of escaped characters follows IFS.
        """
        sp = self._GetSplitter()
        return sp.Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by word evaluation.

        Also used by the explicit shSplit() function.

        Args:
          s: string to split
          ifs: IFS string to use; None means consult the IFS variable.

        Returns:
          The fields of `s`, as strings.
        """
        sp = self._GetSplitter(ifs=ifs)
        # allow_escape is always True here.
        spans = sp.Split(s, True)
        # Debug tracing, disabled by default.
        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape, do_split):
        # type: (str, bool, bool) -> List[Span]
        """Compute the spans of `line`, without turning them into strings.

        Args:
          line: string to split
          allow_escape: passed through to IfsSplitter.Split
          do_split: if False, split as though IFS were the empty string,
                    regardless of the IFS variable.

        Returns:
          The list of spans produced by IfsSplitter.Split.
        """

        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self._GetSplitter(ifs=ifs)
        return sp.Split(line, allow_escape)
 | 
| 200 | 
 | 
| 201 | 
 | 
class _BaseSplitter(object):
    """Base for splitters: owns the set of characters that Escape() escapes."""

    def __init__(self, escape_chars):
        # type: (str) -> None
        # The backslash itself always needs escaping, in addition to whatever
        # characters the subclass asks for.
        backslash = '\\'
        self.escape_chars = escape_chars + backslash

    def Escape(self, s):
        # type: (str) -> str
        """Backslash-escape every occurrence in `s` of a char in escape_chars.

        Unlike other callers of BackslashEscape(), the character set here is
        DYNAMIC: it is whatever this instance was constructed with.
        """
        to_escape = self.escape_chars
        return pyutil.BackslashEscape(s, to_escape)
 | 
| 213 | 
 | 
| 214 | 
 | 
class IfsSplitter(_BaseSplitter):
    """Split a string on IFS whitespace and IFS non-whitespace characters.

    The two character sets are kept separately because Split() classifies
    them differently (DE_White vs. DE_Gray) before consulting the state
    machine.
    """

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_whitespace: the IFS characters treated as whitespace
          ifs_other: the remaining IFS characters
        """
        # Every IFS character (plus backslash, added by the base class) is
        # escaped by Escape().
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        r"""Compute the spans of `s` by running it through the IFS state machine.

        Args:
          s: string to split
          allow_escape: False for read -r, this means \ doesn't do anything.

        Returns:
          List of (span_e, end_index) pairs.  Each span ends just before
          end_index; it starts where the previous span ended (or at 0).

        Raises:
          AssertionError: if the state machine reports an invalid transition
          or an unknown action.

        TODO: This should be (frag, do_split) pairs, to avoid IFS='\'
        double-escaping issue.
        """
        ws_chars = self.ifs_whitespace
        other_chars = self.ifs_other

        n = len(s)
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        if n == 0:
            return spans  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the input"
        # This can't really be handled by the state machine.

        i = 0
        while i < n and mylib.ByteInSet(mylib.ByteAt(s, i), ws_chars):
            i += 1

        # Append an ignored span.
        if i != 0:
            spans.append((span_e.Delim, i))

        # String is ONLY whitespace.  We want to skip the last span after the
        # while loop.
        if i == n:
            return spans

        # Main loop: classify the byte at i, ask the transition table for the
        # next state and an action, emit spans for that action, then advance.
        # It runs until the table yields state_i.Done.
        state = state_i.Start
        while state != state_i.Done:
            if i < n:
                byte = mylib.ByteAt(s, i)

                # Order matters: an IFS character is a delimiter even if it
                # is a backslash; backslash is only special with allow_escape.
                if mylib.ByteInSet(byte, ws_chars):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, other_chars):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            elif i == n:
                ch = char_kind_i.Sentinel  # one more iteration for the end of string

            else:
                raise AssertionError()  # shouldn't happen

            new_state, action = consts.IfsEdge(state, ch)
            if new_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            # Debug tracing, disabled by default.
            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', i, byte, ch,
                    state, new_state, action)

            # Spans are closed at the CURRENT index i, i.e. they end just
            # before the byte that triggered the action.
            if action == emit_i.Part:
                spans.append((span_e.Black, i))
            elif action == emit_i.Delim:
                spans.append((span_e.Delim, i))  # ignored delimiter
            elif action == emit_i.Empty:
                spans.append((span_e.Delim, i))  # ignored delimiter
                # EMPTY part that is NOT ignored
                # (zero-length: it has the same end index as the Delim above)
                spans.append((span_e.Black, i))
            elif action == emit_i.Escape:
                spans.append((span_e.Backslash, i))  # \
            elif action == emit_i.Nothing:
                pass
            else:
                raise AssertionError()

            state = new_state
            i += 1

        return spans
 |