OILS / osh / string_ops.py View on Github | oilshell.org

545 lines, 253 significant
1"""
2string_ops.py - String library functions that can be exposed with a saner syntax.
3
4OSH:
5
6 local y=${x//a*/b}
7
8YSH:
9
10 var y = x => sub('a*', 'b', :ALL)
11
12 Pass x => sub('a*', 'b', :ALL) => var y
13"""
14
15from _devbuild.gen.id_kind_asdl import Id
16from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
17from core import pyutil
18from core import ui
19from core import error
20from core.error import e_die, e_strict
21from mycpp.mylib import log
22from mycpp import mylib
23from osh import glob_
24
25import libc
26import fastfunc
27
28from typing import List, Tuple
29
30_ = log
31
32# Error types returned by fastfunc.Utf8DecodeOne
33# Derived from Utf8Error enum from data_lang/utf8.h
34UTF8_ERR_OVERLONG = -1 # Encodes a codepoint in more bytes than necessary
35UTF8_ERR_SURROGATE = -2 # Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF)
36UTF8_ERR_TOO_LARGE = -3 # Encodes a value greater than the max codepoint U+10FFFF
37UTF8_ERR_BAD_ENCODING = -4 # Encoding doesn't conform to the UTF-8 bit patterns
38UTF8_ERR_TRUNCATED_BYTES = -5 # It looks like there is another codepoint, but it has been truncated
39
40
41def Utf8Error_str(error):
42 # type: (int) -> str
43 if error == UTF8_ERR_OVERLONG:
44 return "UTF-8 decode: Overlong"
45 if error == UTF8_ERR_SURROGATE:
46 return "UTF-8 decode: Surrogate range"
47 if error == UTF8_ERR_TOO_LARGE:
48 return "UTF-8 decode: Integer too large"
49 if error == UTF8_ERR_BAD_ENCODING:
50 return "UTF-8 decode: Bad encoding"
51 if error == UTF8_ERR_TRUNCATED_BYTES:
52 return "UTF-8 decode: Truncated bytes"
53
54 raise AssertionError(0)
55
56
57def DecodeUtf8Char(s, start):
58 # type: (str, int) -> int
59 """Given a string and start index, decode the Unicode char immediately
60 following the start index. The start location is in bytes and should be
61 found using a function like NextUtf8Char or PreviousUtf8Char.
62
63 If the codepoint in invalid, we raise an `error.Expr`. (This is different
64 from {Next,Previous}Utf8Char which raises an `error.Strict` on encoding
65 errors.)
66 """
67 codepoint_or_error, _bytes_read = fastfunc.Utf8DecodeOne(s, start)
68 if codepoint_or_error < 0:
69 raise error.Expr(
70 "%s at offset %d in string of %d bytes" %
71 (Utf8Error_str(codepoint_or_error), start, len(s)), loc.Missing)
72 return codepoint_or_error
73
74
75def NextUtf8Char(s, i):
76 # type: (str, int) -> int
77 """Given a string and a byte offset, returns the byte position after the
78 character at this position. Usually this is the position of the next
79 character, but for the last character in the string, it's the position just
80 past the end of the string.
81
82 Validates UTF-8.
83 """
84 codepoint_or_error, bytes_read = fastfunc.Utf8DecodeOne(s, i)
85 if codepoint_or_error < 0:
86 e_strict(
87 "%s at offset %d in string of %d bytes" %
88 (Utf8Error_str(codepoint_or_error), i, len(s)), loc.Missing)
89 return i + bytes_read
90
91
92_INVALID_START = 'Invalid start of UTF-8 sequence'
93
94
95def _Utf8CharLen(starting_byte):
96 # type: (int) -> int
97 if (starting_byte >> 7) == 0b0:
98 return 1
99 elif (starting_byte >> 5) == 0b110:
100 return 2
101 elif (starting_byte >> 4) == 0b1110:
102 return 3
103 elif (starting_byte >> 3) == 0b11110:
104 return 4
105 else:
106 e_strict(_INVALID_START, loc.Missing)
107
108
109def PreviousUtf8Char(s, i):
110 # type: (str, int) -> int
111 """Given a string and a byte offset, returns the position of the character
112 before that offset. To start (find the first byte of the last character),
113 pass len(s) for the initial value of i.
114
115 Validates UTF-8.
116 """
117 # All bytes in a valid UTF-8 string have one of the following formats:
118 #
119 # 0xxxxxxx (1-byte char)
120 # 110xxxxx (start of 2-byte char)
121 # 1110xxxx (start of 3-byte char)
122 # 11110xxx (start of 4-byte char)
123 # 10xxxxxx (continuation byte)
124 #
125 # Any byte that starts with 10... MUST be a continuation byte,
126 # otherwise it must be the start of a character (or just invalid
127 # data).
128 #
129 # Walking backward, we stop at the first non-continuaton byte
130 # found. We try to interpret it as a valid UTF-8 character starting
131 # byte, and check that it indicates the correct length, based on how
132 # far we've moved from the original byte. Possible problems:
133 # * byte we stopped on does not have a valid value (e.g., 11111111)
134 # * start byte indicates more or fewer continuation bytes than we've seen
135 # * no start byte at beginning of array
136 #
137 # Note that because we are going backward, on malformed input, we
138 # won't error out in the same place as when parsing the string
139 # forwards as normal.
140 orig_i = i
141
142 while i > 0:
143 i -= 1
144 byte_as_int = mylib.ByteAt(s, i)
145 if (byte_as_int >> 6) != 0b10:
146 offset = orig_i - i
147 if offset != _Utf8CharLen(byte_as_int):
148 # Leaving a generic error for now, but if we want to, it's not
149 # hard to calculate the position where things go wrong. Note
150 # that offset might be more than 4, for an invalid utf-8 string.
151 e_strict(_INVALID_START, loc.Missing)
152 return i
153
154 e_strict(_INVALID_START, loc.Missing)
155
156
157def CountUtf8Chars(s):
158 # type: (str) -> int
159 """Returns the number of utf-8 characters in the byte string 's'.
160
161 TODO: Raise exception rather than returning a string, so we can set the exit
162 code of the command to 1 ?
163
164 $ echo ${#bad}
165 Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
166 $ echo $?
167 1
168 """
169 num_chars = 0
170 num_bytes = len(s)
171 i = 0
172 while i < num_bytes:
173 i = NextUtf8Char(s, i)
174 num_chars += 1
175 return num_chars
176
177
178def AdvanceUtf8Chars(s, num_chars, byte_offset):
179 # type: (str, int, int) -> int
180 """Starting from byte offset, advance by N UTF-8 runes
181
182 Returns a byte offset.
183
184 Used for shell slicing.
185 """
186 num_bytes = len(s)
187 i = byte_offset # current byte position
188
189 for _ in xrange(num_chars):
190 # Neither bash or zsh checks out of bounds for slicing. Either begin or
191 # length.
192 if i >= num_bytes:
193 return i
194 #raise RuntimeError('Out of bounds')
195
196 i = NextUtf8Char(s, i)
197
198 return i
199
200
201# Limited Unicode codepoints for whitespace characters.
202# Oils intentionally does not include characters from <USP>, as that set
203# depends on the version of the Unicode standard used.
204#
205# See discussion on the original pull request which added this list here:
206#
207# https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
208#
209# See also the Mozilla Javascript documentation, and the note on how
210# changes to the standard affected Javascript:
211#
212# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space
213
214SPACES = [
215 0x0009, # Horizontal tab (\t)
216 0x000A, # Newline (\n)
217 0x000B, # Vertical tab (\v)
218 0x000C, # Form feed (\f)
219 0x000D, # Carriage return (\r)
220 0x0020, # Normal space
221 0x00A0, # No-break space <NBSP>
222 0xFEFF, # Zero-width no-break space <ZWNBSP>
223]
224
225
226def _IsSpace(codepoint):
227 # type: (int) -> bool
228 return codepoint in SPACES
229
230
231def StartsWithWhitespaceByteRange(s):
232 # type: (str) -> Tuple[int, int]
233 """Returns the range of 's' which has leading whitespace characters.
234
235 If 's' has no leading whitespace, an valid but empty range is returned.
236
237 The returned range is given as byte positions, and is a half-open range
238 "[start, end)" which is returned as a tuple.
239
240 Used for shell functions like 'trimStart' to match then trim whitespace.
241 """
242 len_s = len(s)
243 i = 0
244 while i < len_s:
245 codepoint = DecodeUtf8Char(s, i)
246 if not _IsSpace(codepoint):
247 break
248
249 try:
250 i = NextUtf8Char(s, i)
251 except error.Strict:
252 assert False, "DecodeUtf8Char should have caught any encoding errors"
253
254 start = 0
255 end = i
256 return (start, end)
257
258
259def EndsWithWhitespaceByteRange(s):
260 # type: (str) -> Tuple[int, int]
261 """Returns the range of 's' which has trailing whitespace characters.
262
263 If 's' has no leading whitespace, an valid but empty range is returned.
264
265 The returned range is given as byte positions, and is a half-open range
266 "[start, end)" which is returned as a tuple.
267
268 Used for shell functions like 'trimEnd' to match then trim whitespace.
269 """
270 len_s = len(s)
271 i = len_s
272 while i > 0:
273 # TODO: Gracefully handle surrogate pairs and overlong encodings when
274 # finding the start of each character.
275 prev = PreviousUtf8Char(s, i)
276
277 codepoint = DecodeUtf8Char(s, prev)
278 if not _IsSpace(codepoint):
279 break
280
281 i = prev
282
283 start = i
284 end = len_s
285 return (start, end)
286
287
288# Implementation without Python regex:
289#
290# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
291# regexec. in a loop. fnmatch() does NOT given positions of matches.
292#
293# (2) Strip -- % %% # ## -
294#
295# a. Fast path for constant strings.
296# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
297# doesn't match, short circuit out? We can't do this with fnmatch.
298# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
299#
300# - # shortest prefix - [:1], [:2], [:3] until it matches
301# - ## longest prefix - [:-1] [:-2], [:3]. Works because fnmatch does not
302# match prefixes, it matches EXACTLY.
303# - % shortest suffix - [-1:] [-2:] [-3:] ...
304# - %% longest suffix - [1:] [2:] [3:]
305#
306# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
307# mksh. Dash doesn't implement it.
308
309# TODO:
310# - Unicode support: Convert both pattern, string, and replacement to unicode,
311# then the result back at the end.
312# - Compile time errors for [[:space:]] ?
313
314
315def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
316 # type: (str, Token, str, bool) -> str
317 """Helper for ${x#prefix} and family."""
318
319 id_ = op_tok.id
320
321 # Fast path for constant strings.
322 # TODO: Should be LooksLikeExtendedGlob!
323 if not is_extglob and not glob_.LooksLikeGlob(arg):
324 # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
325 # reverse it. NOTE: We also do this check in Globber.Expand(). It would
326 # be nice to somehow store the original string rather than
327 # escaping/unescaping.
328 arg = glob_.GlobUnescape(arg)
329
330 if id_ in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix
331 # explicit check for non-empty arg (len for mycpp)
332 if len(arg) and s.startswith(arg):
333 return s[len(arg):]
334 else:
335 return s
336
337 elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix
338 # need explicit check for non-empty arg (len for mycpp)
339 if len(arg) and s.endswith(arg):
340 return s[:-len(arg)]
341 else:
342 return s
343
344 # These operators take glob arguments, we don't implement that obscure case.
345 elif id_ == Id.VOp1_Comma: # Only lowercase the first letter
346 if arg != '':
347 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
348 if len(s):
349 return s[0].lower() + s[1:]
350 else:
351 return s
352
353 elif id_ == Id.VOp1_DComma:
354 if arg != '':
355 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
356 return s.lower()
357
358 elif id_ == Id.VOp1_Caret: # Only uppercase the first letter
359 if arg != '':
360 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
361 if len(s):
362 return s[0].upper() + s[1:]
363 else:
364 return s
365
366 elif id_ == Id.VOp1_DCaret:
367 if arg != '':
368 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
369 return s.upper()
370
371 else: # e.g. ^ ^^ , ,,
372 raise AssertionError(id_)
373
374 # For patterns, do fnmatch() in a loop.
375 #
376 # TODO:
377 # - Another potential fast path:
378 # v=aabbccdd
379 # echo ${v#*b} # strip shortest prefix
380 #
381 # If the whole thing doesn't match '*b*', then no test can succeed. So we
382 # can fail early. Conversely echo ${v%%c*} and '*c*'.
383 #
384 # (Although honestly this whole construct is nuts and should be deprecated.)
385
386 n = len(s)
387
388 if id_ == Id.VOp1_Pound: # shortest prefix
389 # 'abcd': match '', 'a', 'ab', 'abc', ...
390 i = 0
391 while True:
392 assert i <= n
393 #log('Matching pattern %r with %r', arg, s[:i])
394 if libc.fnmatch(arg, s[:i]):
395 return s[i:]
396 if i >= n:
397 break
398 i = NextUtf8Char(s, i)
399 return s
400
401 elif id_ == Id.VOp1_DPound: # longest prefix
402 # 'abcd': match 'abc', 'ab', 'a'
403 i = n
404 while True:
405 assert i >= 0
406 #log('Matching pattern %r with %r', arg, s[:i])
407 if libc.fnmatch(arg, s[:i]):
408 return s[i:]
409 if i == 0:
410 break
411 i = PreviousUtf8Char(s, i)
412 return s
413
414 elif id_ == Id.VOp1_Percent: # shortest suffix
415 # 'abcd': match 'abcd', 'abc', 'ab', 'a'
416 i = n
417 while True:
418 assert i >= 0
419 #log('Matching pattern %r with %r', arg, s[:i])
420 if libc.fnmatch(arg, s[i:]):
421 return s[:i]
422 if i == 0:
423 break
424 i = PreviousUtf8Char(s, i)
425 return s
426
427 elif id_ == Id.VOp1_DPercent: # longest suffix
428 # 'abcd': match 'abc', 'bc', 'c', ...
429 i = 0
430 while True:
431 assert i <= n
432 #log('Matching pattern %r with %r', arg, s[:i])
433 if libc.fnmatch(arg, s[i:]):
434 return s[:i]
435 if i >= n:
436 break
437 i = NextUtf8Char(s, i)
438 return s
439
440 else:
441 raise NotImplementedError(ui.PrettyId(id_))
442
443
444def _AllMatchPositions(s, regex):
445 # type: (str, str) -> List[Tuple[int, int]]
446 """Returns a list of all (start, end) match positions of the regex against
447 s.
448
449 (If there are no matches, it returns the empty list.)
450 """
451 matches = [] # type: List[Tuple[int, int]]
452 pos = 0
453 n = len(s)
454 while pos < n: # needed to prevent infinite loop in (.*) case
455 m = libc.regex_first_group_match(regex, s, pos)
456 if m is None:
457 break
458 matches.append(m)
459 start, end = m
460 pos = end # advance position
461 return matches
462
463
464def _PatSubAll(s, regex, replace_str):
465 # type: (str, str, str) -> str
466 parts = [] # type: List[str]
467 prev_end = 0
468 for start, end in _AllMatchPositions(s, regex):
469 parts.append(s[prev_end:start])
470 parts.append(replace_str)
471 prev_end = end
472 parts.append(s[prev_end:])
473 return ''.join(parts)
474
475
476class GlobReplacer(object):
477
478 def __init__(self, regex, replace_str, slash_tok):
479 # type: (str, str, Token) -> None
480
481 # TODO: It would be nice to cache the compilation of the regex here,
482 # instead of just the string. That would require more sophisticated use of
483 # the Python/C API in libc.c, which we might want to avoid.
484 self.regex = regex
485 self.replace_str = replace_str
486 self.slash_tok = slash_tok
487
488 def __repr__(self):
489 # type: () -> str
490 return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)
491
492 def Replace(self, s, op):
493 # type: (str, suffix_op.PatSub) -> str
494
495 regex = '(%s)' % self.regex # make it a group
496
497 if op.replace_mode == Id.Lit_Slash:
498 # Avoid infinite loop when replacing all copies of empty string
499 if len(self.regex) == 0:
500 return s
501
502 try:
503 # loop over matches
504 return _PatSubAll(s, regex, self.replace_str)
505 except RuntimeError as e:
506 # Not sure if this is possible since we convert from glob:
507 # libc.regex_first_group_match raises RuntimeError on regex syntax
508 # error.
509 msg = e.message # type: str
510 e_die('Error matching regex %r: %s' % (regex, msg),
511 self.slash_tok)
512
513 if op.replace_mode == Id.Lit_Pound:
514 regex = '^' + regex
515 elif op.replace_mode == Id.Lit_Percent:
516 regex = regex + '$'
517
518 m = libc.regex_first_group_match(regex, s, 0)
519 #log('regex = %r, s = %r, match = %r', regex, s, m)
520 if m is None:
521 return s
522 start, end = m
523 return s[:start] + self.replace_str + s[end:]
524
525
526def ShellQuoteB(s):
527 # type: (str) -> str
528 """Quote by adding backslashes.
529
530 Used for autocompletion, so it's friendlier for display on the
531 command line. We use the strategy above for other use cases.
532 """
533 # There's no way to escape a newline! Bash prints ^J for some reason, but
534 # we're more explicit. This will happen if there's a newline on a file
535 # system or a completion plugin returns a newline.
536
537 # NOTE: tabs CAN be escaped with \.
538 s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')
539
540 # ~ for home dir
541 # ! for history
542 # * [] ? for glob
543 # {} for brace expansion
544 # space because it separates words
545 return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')