1 | """
|
2 | string_ops.py - String library functions that can be exposed with a saner syntax.
|
3 |
|
4 | OSH:
|
5 |
|
6 | local y=${x//a*/b}
|
7 |
|
8 | YSH:
|
9 |
|
10 | var y = x => sub('a*', 'b', :ALL)
|
11 |
|
12 | Pass x => sub('a*', 'b', :ALL) => var y
|
13 | """
|
14 |
|
from _devbuild.gen.id_kind_asdl import Id
from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
from core import pyutil
from core import ui
from core import error
from core.error import e_die, e_strict
from mycpp.mylib import log
from mycpp import mylib
from osh import glob_

import libc
import fastfunc

from typing import List, Tuple

_ = log

# Error types returned by fastfunc.Utf8DecodeOne
# Derived from Utf8Error enum from data_lang/utf8.h
UTF8_ERR_OVERLONG = -1  # Encodes a codepoint in more bytes than necessary
UTF8_ERR_SURROGATE = -2  # Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF)
UTF8_ERR_TOO_LARGE = -3  # Encodes a value greater than the max codepoint U+10FFFF
UTF8_ERR_BAD_ENCODING = -4  # Encoding doesn't conform to the UTF-8 bit patterns
UTF8_ERR_TRUNCATED_BYTES = -5  # It looks like there is another codepoint, but it has been truncated


def Utf8Error_str(error):
    # type: (int) -> str
    if error == UTF8_ERR_OVERLONG:
        return "UTF-8 decode: Overlong"
    if error == UTF8_ERR_SURROGATE:
        return "UTF-8 decode: Surrogate range"
    if error == UTF8_ERR_TOO_LARGE:
        return "UTF-8 decode: Integer too large"
    if error == UTF8_ERR_BAD_ENCODING:
        return "UTF-8 decode: Bad encoding"
    if error == UTF8_ERR_TRUNCATED_BYTES:
        return "UTF-8 decode: Truncated bytes"

    raise AssertionError(0)


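# Illustrative example for DecodeUtf8Char below, assuming the
# fastfunc.Utf8DecodeOne contract used in this file (it returns a
# (codepoint, bytes_read) pair, with a negative codepoint on error):
#
#   DecodeUtf8Char('\xce\xbc', 0)   # => 0x3BC, since 0xCE 0xBC encodes U+03BC
#   DecodeUtf8Char('\xff', 0)       # => raises error.Expr (bad encoding)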
def DecodeUtf8Char(s, start):
    # type: (str, int) -> int
    """Given a string and start index, decode the Unicode char immediately
    following the start index. The start location is in bytes and should be
    found using a function like NextUtf8Char or PreviousUtf8Char.

    If the codepoint is invalid, we raise an `error.Expr`. (This is different
    from {Next,Previous}Utf8Char, which raise an `error.Strict` on encoding
    errors.)
    """
    codepoint_or_error, _bytes_read = fastfunc.Utf8DecodeOne(s, start)
    if codepoint_or_error < 0:
        raise error.Expr(
            "%s at offset %d in string of %d bytes" %
            (Utf8Error_str(codepoint_or_error), start, len(s)), loc.Missing)
    return codepoint_or_error


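# Illustrative example for NextUtf8Char below: in 'a\xce\xbc' ('a' followed by
# the 2-byte encoding of U+03BC), NextUtf8Char(s, 0) == 1 and
# NextUtf8Char(s, 1) == 3, i.e. the offset just past the 2-byte character,
# which here is also the end of the string.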
def NextUtf8Char(s, i):
    # type: (str, int) -> int
    """Given a string and a byte offset, returns the byte position after the
    character at this position. Usually this is the position of the next
    character, but for the last character in the string, it's the position just
    past the end of the string.

    Validates UTF-8.
    """
    codepoint_or_error, bytes_read = fastfunc.Utf8DecodeOne(s, i)
    if codepoint_or_error < 0:
        e_strict(
            "%s at offset %d in string of %d bytes" %
            (Utf8Error_str(codepoint_or_error), i, len(s)), loc.Missing)
    return i + bytes_read


_INVALID_START = 'Invalid start of UTF-8 sequence'


def _Utf8CharLen(starting_byte):
    # type: (int) -> int
    if (starting_byte >> 7) == 0b0:
        return 1
    elif (starting_byte >> 5) == 0b110:
        return 2
    elif (starting_byte >> 4) == 0b1110:
        return 3
    elif (starting_byte >> 3) == 0b11110:
        return 4
    else:
        e_strict(_INVALID_START, loc.Missing)


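# Illustrative example for PreviousUtf8Char below: in 'a\xce\xbc' ('a' followed
# by the 2-byte sequence for U+03BC), PreviousUtf8Char(s, 3) skips the
# continuation byte 0xBC, stops on the start byte 0xCE at offset 1, checks that
# a 2-byte start matches the 2 bytes walked, and returns 1.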
def PreviousUtf8Char(s, i):
    # type: (str, int) -> int
    """Given a string and a byte offset, returns the position of the character
    before that offset. To start (find the first byte of the last character),
    pass len(s) for the initial value of i.

    Validates UTF-8.
    """
    # All bytes in a valid UTF-8 string have one of the following formats:
    #
    #   0xxxxxxx (1-byte char)
    #   110xxxxx (start of 2-byte char)
    #   1110xxxx (start of 3-byte char)
    #   11110xxx (start of 4-byte char)
    #   10xxxxxx (continuation byte)
    #
    # Any byte that starts with 10... MUST be a continuation byte,
    # otherwise it must be the start of a character (or just invalid
    # data).
    #
    # Walking backward, we stop at the first non-continuation byte
    # found. We try to interpret it as a valid UTF-8 character starting
    # byte, and check that it indicates the correct length, based on how
    # far we've moved from the original byte. Possible problems:
    #   * byte we stopped on does not have a valid value (e.g., 11111111)
    #   * start byte indicates more or fewer continuation bytes than we've seen
    #   * no start byte at beginning of array
    #
    # Note that because we are going backward, on malformed input, we
    # won't error out in the same place as when parsing the string
    # forwards as normal.
    orig_i = i

    while i > 0:
        i -= 1
        byte_as_int = mylib.ByteAt(s, i)
        if (byte_as_int >> 6) != 0b10:
            offset = orig_i - i
            if offset != _Utf8CharLen(byte_as_int):
                # Leaving a generic error for now, but if we want to, it's not
                # hard to calculate the position where things go wrong. Note
                # that offset might be more than 4, for an invalid utf-8 string.
                e_strict(_INVALID_START, loc.Missing)
            return i

    e_strict(_INVALID_START, loc.Missing)


def CountUtf8Chars(s):
    # type: (str) -> int
    """Returns the number of UTF-8 characters in the byte string 's'.

    TODO: Raise an exception rather than returning a string, so we can set the
    exit code of the command to 1?

        $ echo ${#bad}
        Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
        $ echo $?
        1
    """
    num_chars = 0
    num_bytes = len(s)
    i = 0
    while i < num_bytes:
        i = NextUtf8Char(s, i)
        num_chars += 1
    return num_chars


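# Illustrative example for AdvanceUtf8Chars below, as used for shell slicing
# like ${s:0:2}: with s = 'a\xce\xbcc' ('a', U+03BC, 'c'),
# AdvanceUtf8Chars(s, 2, 0) == 3, the byte offset just past the first two
# runes. Offsets past the end are clamped rather than raising, matching
# bash/zsh behavior noted in the loop below.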
def AdvanceUtf8Chars(s, num_chars, byte_offset):
    # type: (str, int, int) -> int
    """Starting from a byte offset, advance by N UTF-8 runes.

    Returns a byte offset.

    Used for shell slicing.
    """
    num_bytes = len(s)
    i = byte_offset  # current byte position

    for _ in xrange(num_chars):
        # Neither bash nor zsh checks out of bounds for slicing, either for
        # begin or length.
        if i >= num_bytes:
            return i
            #raise RuntimeError('Out of bounds')

        i = NextUtf8Char(s, i)

    return i


# Limited Unicode codepoints for whitespace characters.
# Oils intentionally does not include characters from <USP>, as that set
# depends on the version of the Unicode standard used.
#
# See the discussion on the original pull request which added this list:
#
#   https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
#
# See also the Mozilla JavaScript documentation, and the note on how
# changes to the standard affected JavaScript:
#
#   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space

SPACES = [
    0x0009,  # Horizontal tab (\t)
    0x000A,  # Newline (\n)
    0x000B,  # Vertical tab (\v)
    0x000C,  # Form feed (\f)
    0x000D,  # Carriage return (\r)
    0x0020,  # Normal space
    0x00A0,  # No-break space <NBSP>
    0xFEFF,  # Zero-width no-break space <ZWNBSP>
]


def _IsSpace(codepoint):
    # type: (int) -> bool
    return codepoint in SPACES


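# Illustrative example for the two functions below: with s = '\xc2\xa0x '
# (a no-break space U+00A0, then 'x', then an ASCII space),
# StartsWithWhitespaceByteRange(s) == (0, 2) and
# EndsWithWhitespaceByteRange(s) == (3, 4); both are half-open byte ranges.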
def StartsWithWhitespaceByteRange(s):
    # type: (str) -> Tuple[int, int]
    """Returns the range of 's' which has leading whitespace characters.

    If 's' has no leading whitespace, a valid but empty range is returned.

    The returned range is given as byte positions, and is a half-open range
    "[start, end)" which is returned as a tuple.

    Used for shell functions like 'trimStart' to match then trim whitespace.
    """
    len_s = len(s)
    i = 0
    while i < len_s:
        codepoint = DecodeUtf8Char(s, i)
        if not _IsSpace(codepoint):
            break

        try:
            i = NextUtf8Char(s, i)
        except error.Strict:
            assert False, "DecodeUtf8Char should have caught any encoding errors"

    start = 0
    end = i
    return (start, end)


def EndsWithWhitespaceByteRange(s):
    # type: (str) -> Tuple[int, int]
    """Returns the range of 's' which has trailing whitespace characters.

    If 's' has no trailing whitespace, a valid but empty range is returned.

    The returned range is given as byte positions, and is a half-open range
    "[start, end)" which is returned as a tuple.

    Used for shell functions like 'trimEnd' to match then trim whitespace.
    """
    len_s = len(s)
    i = len_s
    while i > 0:
        # TODO: Gracefully handle surrogate pairs and overlong encodings when
        # finding the start of each character.
        prev = PreviousUtf8Char(s, i)

        codepoint = DecodeUtf8Char(s, prev)
        if not _IsSpace(codepoint):
            break

        i = prev

    start = i
    end = len_s
    return (start, end)


# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
#     regexec in a loop. fnmatch() does NOT give the positions of matches.
#
# (2) Strip -- % %% # ##
#
#     a. Fast path for constant strings.
#     b. Convert to POSIX extended regex, to see if it matches at ALL. If it
#        doesn't match, short circuit out? We can't do this with fnmatch.
#     c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
#
#        - #  shortest prefix - [:1], [:2], [:3] until it matches
#        - ## longest prefix - [:-1], [:-2], [:-3]. Works because fnmatch does
#          not match prefixes, it matches EXACTLY.
#        - %  shortest suffix - [-1:] [-2:] [-3:] ...
#        - %% longest suffix - [1:] [2:] [3:]
#
# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
# mksh. Dash doesn't implement it.

# TODO:
# - Unicode support: Convert the pattern, string, and replacement to unicode,
#   then convert the result back at the end.
# - Compile time errors for [[:space:]] ?


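# Worked example of the four strip operators (standard shell semantics, which
# DoUnarySuffixOp below implements via startswith/endswith or fnmatch):
#
#   v=aabbccdd
#   echo ${v#*b}    # bccdd  - strip shortest prefix matching *b
#   echo ${v##*b}   # ccdd   - strip longest prefix matching *b
#   echo ${v%c*}    # aabbc  - strip shortest suffix matching c*
#   echo ${v%%c*}   # aabb   - strip longest suffix matching c*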
def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
    # type: (str, Token, str, bool) -> str
    """Helper for ${x#prefix} and family."""

    id_ = op_tok.id

    # Fast path for constant strings.
    # TODO: Should be LooksLikeExtendedGlob!
    if not is_extglob and not glob_.LooksLikeGlob(arg):
        # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
        # reverse it. NOTE: We also do this check in Globber.Expand(). It would
        # be nice to somehow store the original string rather than
        # escaping/unescaping.
        arg = glob_.GlobUnescape(arg)

        if id_ in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
            # explicit check for non-empty arg (len for mycpp)
            if len(arg) and s.startswith(arg):
                return s[len(arg):]
            else:
                return s

        elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
            # need explicit check for non-empty arg (len for mycpp)
            if len(arg) and s.endswith(arg):
                return s[:-len(arg)]
            else:
                return s

        # These operators take glob arguments; we don't implement that obscure case.
        elif id_ == Id.VOp1_Comma:  # Only lowercase the first letter
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            if len(s):
                return s[0].lower() + s[1:]
            else:
                return s

        elif id_ == Id.VOp1_DComma:
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            return s.lower()

        elif id_ == Id.VOp1_Caret:  # Only uppercase the first letter
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            if len(s):
                return s[0].upper() + s[1:]
            else:
                return s

        elif id_ == Id.VOp1_DCaret:
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            return s.upper()

        else:  # e.g. ^ ^^ , ,,
            raise AssertionError(id_)

    # For patterns, do fnmatch() in a loop.
    #
    # TODO:
    # - Another potential fast path:
    #   v=aabbccdd
    #   echo ${v#*b}  # strip shortest prefix
    #
    #   If the whole thing doesn't match '*b*', then no test can succeed. So we
    #   can fail early. Conversely echo ${v%%c*} and '*c*'.
    #
    #   (Although honestly this whole construct is nuts and should be deprecated.)

    n = len(s)

    if id_ == Id.VOp1_Pound:  # shortest prefix
        # 'abcd': match '', 'a', 'ab', 'abc', ...
        i = 0
        while True:
            assert i <= n
            #log('Matching pattern %r with %r', arg, s[:i])
            if libc.fnmatch(arg, s[:i]):
                return s[i:]
            if i >= n:
                break
            i = NextUtf8Char(s, i)
        return s

    elif id_ == Id.VOp1_DPound:  # longest prefix
        # 'abcd': match 'abcd', 'abc', 'ab', 'a', ...
        i = n
        while True:
            assert i >= 0
            #log('Matching pattern %r with %r', arg, s[:i])
            if libc.fnmatch(arg, s[:i]):
                return s[i:]
            if i == 0:
                break
            i = PreviousUtf8Char(s, i)
        return s

    elif id_ == Id.VOp1_Percent:  # shortest suffix
        # 'abcd': match '', 'd', 'cd', 'bcd', ...
        i = n
        while True:
            assert i >= 0
            #log('Matching pattern %r with %r', arg, s[i:])
            if libc.fnmatch(arg, s[i:]):
                return s[:i]
            if i == 0:
                break
            i = PreviousUtf8Char(s, i)
        return s

    elif id_ == Id.VOp1_DPercent:  # longest suffix
        # 'abcd': match 'abcd', 'bcd', 'cd', 'd', ...
        i = 0
        while True:
            assert i <= n
            #log('Matching pattern %r with %r', arg, s[i:])
            if libc.fnmatch(arg, s[i:]):
                return s[:i]
            if i >= n:
                break
            i = NextUtf8Char(s, i)
        return s

    else:
        raise NotImplementedError(ui.PrettyId(id_))


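# Illustrative example for _AllMatchPositions below, assuming
# libc.regex_first_group_match returns the (start, end) byte positions of the
# first group's match at or after the given position, or None (the contract
# relied on in the loop):
#
#   _AllMatchPositions('aXbXc', '(X)')  # => [(1, 2), (3, 4)]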
def _AllMatchPositions(s, regex):
    # type: (str, str) -> List[Tuple[int, int]]
    """Returns a list of all (start, end) match positions of the regex against
    s.

    (If there are no matches, it returns the empty list.)
    """
    matches = []  # type: List[Tuple[int, int]]
    pos = 0
    n = len(s)
    while pos < n:  # needed to prevent infinite loop in (.*) case
        m = libc.regex_first_group_match(regex, s, pos)
        if m is None:
            break
        matches.append(m)
        start, end = m
        pos = end  # advance position
    return matches


def _PatSubAll(s, regex, replace_str):
    # type: (str, str, str) -> str
    parts = []  # type: List[str]
    prev_end = 0
    for start, end in _AllMatchPositions(s, regex):
        parts.append(s[prev_end:start])
        parts.append(replace_str)
        prev_end = end
    parts.append(s[prev_end:])
    return ''.join(parts)


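# GlobReplacer below implements ${s/pat/repl} and friends. The replace_mode on
# the parsed suffix_op selects the variant (standard shell semantics):
#
#   ${s/X/y}    # replace the first match
#   ${s//X/y}   # Lit_Slash: replace all matches
#   ${s/#X/y}   # Lit_Pound: anchor the match at the start ('^' is prepended)
#   ${s/%X/y}   # Lit_Percent: anchor the match at the end ('$' is appended)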
class GlobReplacer(object):

    def __init__(self, regex, replace_str, slash_tok):
        # type: (str, str, Token) -> None

        # TODO: It would be nice to cache the compilation of the regex here,
        # instead of just the string. That would require more sophisticated use of
        # the Python/C API in libc.c, which we might want to avoid.
        self.regex = regex
        self.replace_str = replace_str
        self.slash_tok = slash_tok

    def __repr__(self):
        # type: () -> str
        return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)

    def Replace(self, s, op):
        # type: (str, suffix_op.PatSub) -> str

        regex = '(%s)' % self.regex  # make it a group

        if op.replace_mode == Id.Lit_Slash:
            # Avoid infinite loop when replacing all copies of empty string
            if len(self.regex) == 0:
                return s

            try:
                # loop over matches
                return _PatSubAll(s, regex, self.replace_str)
            except RuntimeError as e:
                # Not sure if this is possible since we convert from glob:
                # libc.regex_first_group_match raises RuntimeError on regex syntax
                # error.
                msg = e.message  # type: str
                e_die('Error matching regex %r: %s' % (regex, msg),
                      self.slash_tok)

        if op.replace_mode == Id.Lit_Pound:
            regex = '^' + regex
        elif op.replace_mode == Id.Lit_Percent:
            regex = regex + '$'

        m = libc.regex_first_group_match(regex, s, 0)
        #log('regex = %r, s = %r, match = %r', regex, s, m)
        if m is None:
            return s
        start, end = m
        return s[:start] + self.replace_str + s[end:]


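# Illustrative example for ShellQuoteB below, assuming pyutil.BackslashEscape
# prefixes each character from the given set with a backslash (newlines and
# carriage returns can't be escaped this way, so they become markers):
#
#   ShellQuoteB('foo bar!')   # => 'foo\\ bar\\!', i.e. foo\ bar\!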
def ShellQuoteB(s):
    # type: (str) -> str
    """Quote by adding backslashes.

    Used for autocompletion, so it's friendlier for display on the
    command line. We use the strategy above for other use cases.
    """
    # There's no way to escape a newline! Bash prints ^J for some reason, but
    # we're more explicit. This will happen if there's a newline on a file
    # system or a completion plugin returns a newline.

    # NOTE: tabs CAN be escaped with \.
    s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')

    # ~ for home dir
    # ! for history
    # * [] ? for glob
    # {} for brace expansion
    # space because it separates words
    return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')
|