| 1 | """
 | 
| 2 | string_ops.py - String library functions that can be exposed with a saner syntax.
 | 
| 3 | 
 | 
| 4 | OSH:
 | 
| 5 | 
 | 
| 6 |     local y=${x//a*/b}
 | 
| 7 | 
 | 
| 8 | YSH:
 | 
| 9 | 
 | 
| 10 |     var y = x => sub('a*', 'b', :ALL)
 | 
| 11 | 
 | 
| 12 |     Pass x => sub('a*', 'b', :ALL) => var y
 | 
| 13 | """
 | 
| 14 | 
 | 
| 15 | from _devbuild.gen.id_kind_asdl import Id
 | 
| 16 | from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
 | 
| 17 | from core import pyutil
 | 
| 18 | from core import ui
 | 
| 19 | from core import error
 | 
| 20 | from core.error import e_die, e_strict
 | 
| 21 | from mycpp.mylib import log
 | 
| 22 | from mycpp import mylib
 | 
| 23 | from osh import glob_
 | 
| 24 | 
 | 
| 25 | import libc
 | 
| 26 | import fastfunc
 | 
| 27 | 
 | 
| 28 | from typing import List, Tuple
 | 
| 29 | 
 | 
| 30 | _ = log
 | 
| 31 | 
 | 
| 32 | # Error types returned by fastfunc.Utf8DecodeOne
 | 
| 33 | # Derived from Utf8Error enum from data_lang/utf8.h
 | 
| 34 | UTF8_ERR_OVERLONG = -1  # Encodes a codepoint in more bytes than necessary
 | 
| 35 | UTF8_ERR_SURROGATE = -2  # Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF)
 | 
| 36 | UTF8_ERR_TOO_LARGE = -3  # Encodes a value greater than the max codepoint U+10FFFF
 | 
| 37 | UTF8_ERR_BAD_ENCODING = -4  # Encoding doesn't conform to the UTF-8 bit patterns
 | 
| 38 | UTF8_ERR_TRUNCATED_BYTES = -5  # It looks like there is another codepoint, but it has been truncated
 | 
| 39 | 
 | 
| 40 | 
 | 
| 41 | def Utf8Error_str(error):
 | 
| 42 |     # type: (int) -> str
 | 
| 43 |     if error == UTF8_ERR_OVERLONG:
 | 
| 44 |         return "UTF-8 decode: Overlong"
 | 
| 45 |     if error == UTF8_ERR_SURROGATE:
 | 
| 46 |         return "UTF-8 decode: Surrogate range"
 | 
| 47 |     if error == UTF8_ERR_TOO_LARGE:
 | 
| 48 |         return "UTF-8 decode: Integer too large"
 | 
| 49 |     if error == UTF8_ERR_BAD_ENCODING:
 | 
| 50 |         return "UTF-8 decode: Bad encoding"
 | 
| 51 |     if error == UTF8_ERR_TRUNCATED_BYTES:
 | 
| 52 |         return "UTF-8 decode: Truncated bytes"
 | 
| 53 | 
 | 
| 54 |     raise AssertionError(0)
 | 
| 55 | 
 | 
| 56 | 
 | 
| 57 | def DecodeUtf8Char(s, start):
 | 
| 58 |     # type: (str, int) -> int
 | 
| 59 |     """Given a string and start index, decode the Unicode char immediately
 | 
| 60 |     following the start index. The start location is in bytes and should be
 | 
| 61 |     found using a function like NextUtf8Char or PreviousUtf8Char.
 | 
| 62 | 
 | 
| 63 |     If the codepoint in invalid, we raise an `error.Expr`. (This is different
 | 
| 64 |     from {Next,Previous}Utf8Char which raises an `error.Strict` on encoding
 | 
| 65 |     errors.)
 | 
| 66 |     """
 | 
| 67 |     codepoint_or_error, _bytes_read = fastfunc.Utf8DecodeOne(s, start)
 | 
| 68 |     if codepoint_or_error < 0:
 | 
| 69 |         raise error.Expr(
 | 
| 70 |             "%s at offset %d in string of %d bytes" %
 | 
| 71 |             (Utf8Error_str(codepoint_or_error), start, len(s)), loc.Missing)
 | 
| 72 |     return codepoint_or_error
 | 
| 73 | 
 | 
| 74 | 
 | 
| 75 | def NextUtf8Char(s, i):
 | 
| 76 |     # type: (str, int) -> int
 | 
| 77 |     """Given a string and a byte offset, returns the byte position after the
 | 
| 78 |     character at this position.  Usually this is the position of the next
 | 
| 79 |     character, but for the last character in the string, it's the position just
 | 
| 80 |     past the end of the string.
 | 
| 81 | 
 | 
| 82 |     Validates UTF-8.
 | 
| 83 |     """
 | 
| 84 |     codepoint_or_error, bytes_read = fastfunc.Utf8DecodeOne(s, i)
 | 
| 85 |     if codepoint_or_error < 0:
 | 
| 86 |         e_strict(
 | 
| 87 |             "%s at offset %d in string of %d bytes" %
 | 
| 88 |             (Utf8Error_str(codepoint_or_error), i, len(s)), loc.Missing)
 | 
| 89 |     return i + bytes_read
 | 
| 90 | 
 | 
| 91 | 
 | 
| 92 | _INVALID_START = 'Invalid start of UTF-8 sequence'
 | 
| 93 | 
 | 
| 94 | 
 | 
| 95 | def _Utf8CharLen(starting_byte):
 | 
| 96 |     # type: (int) -> int
 | 
| 97 |     if (starting_byte >> 7) == 0b0:
 | 
| 98 |         return 1
 | 
| 99 |     elif (starting_byte >> 5) == 0b110:
 | 
| 100 |         return 2
 | 
| 101 |     elif (starting_byte >> 4) == 0b1110:
 | 
| 102 |         return 3
 | 
| 103 |     elif (starting_byte >> 3) == 0b11110:
 | 
| 104 |         return 4
 | 
| 105 |     else:
 | 
| 106 |         e_strict(_INVALID_START, loc.Missing)
 | 
| 107 | 
 | 
| 108 | 
 | 
| 109 | def PreviousUtf8Char(s, i):
 | 
| 110 |     # type: (str, int) -> int
 | 
| 111 |     """Given a string and a byte offset, returns the position of the character
 | 
| 112 |     before that offset.  To start (find the first byte of the last character),
 | 
| 113 |     pass len(s) for the initial value of i.
 | 
| 114 | 
 | 
| 115 |     Validates UTF-8.
 | 
| 116 |     """
 | 
| 117 |     # All bytes in a valid UTF-8 string have one of the following formats:
 | 
| 118 |     #
 | 
| 119 |     #   0xxxxxxx (1-byte char)
 | 
| 120 |     #   110xxxxx (start of 2-byte char)
 | 
| 121 |     #   1110xxxx (start of 3-byte char)
 | 
| 122 |     #   11110xxx (start of 4-byte char)
 | 
| 123 |     #   10xxxxxx (continuation byte)
 | 
| 124 |     #
 | 
| 125 |     # Any byte that starts with 10... MUST be a continuation byte,
 | 
| 126 |     # otherwise it must be the start of a character (or just invalid
 | 
| 127 |     # data).
 | 
| 128 |     #
 | 
| 129 |     # Walking backward, we stop at the first non-continuaton byte
 | 
| 130 |     # found.  We try to interpret it as a valid UTF-8 character starting
 | 
| 131 |     # byte, and check that it indicates the correct length, based on how
 | 
| 132 |     # far we've moved from the original byte.  Possible problems:
 | 
| 133 |     #   * byte we stopped on does not have a valid value (e.g., 11111111)
 | 
| 134 |     #   * start byte indicates more or fewer continuation bytes than we've seen
 | 
| 135 |     #   * no start byte at beginning of array
 | 
| 136 |     #
 | 
| 137 |     # Note that because we are going backward, on malformed input, we
 | 
| 138 |     # won't error out in the same place as when parsing the string
 | 
| 139 |     # forwards as normal.
 | 
| 140 |     orig_i = i
 | 
| 141 | 
 | 
| 142 |     while i > 0:
 | 
| 143 |         i -= 1
 | 
| 144 |         byte_as_int = mylib.ByteAt(s, i)
 | 
| 145 |         if (byte_as_int >> 6) != 0b10:
 | 
| 146 |             offset = orig_i - i
 | 
| 147 |             if offset != _Utf8CharLen(byte_as_int):
 | 
| 148 |                 # Leaving a generic error for now, but if we want to, it's not
 | 
| 149 |                 # hard to calculate the position where things go wrong.  Note
 | 
| 150 |                 # that offset might be more than 4, for an invalid utf-8 string.
 | 
| 151 |                 e_strict(_INVALID_START, loc.Missing)
 | 
| 152 |             return i
 | 
| 153 | 
 | 
| 154 |     e_strict(_INVALID_START, loc.Missing)
 | 
| 155 | 
 | 
| 156 | 
 | 
| 157 | def CountUtf8Chars(s):
 | 
| 158 |     # type: (str) -> int
 | 
| 159 |     """Returns the number of utf-8 characters in the byte string 's'.
 | 
| 160 | 
 | 
| 161 |     TODO: Raise exception rather than returning a string, so we can set the exit
 | 
| 162 |     code of the command to 1 ?
 | 
| 163 | 
 | 
| 164 |     $ echo ${#bad}
 | 
| 165 |     Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
 | 
| 166 |     $ echo $?
 | 
| 167 |     1
 | 
| 168 |     """
 | 
| 169 |     num_chars = 0
 | 
| 170 |     num_bytes = len(s)
 | 
| 171 |     i = 0
 | 
| 172 |     while i < num_bytes:
 | 
| 173 |         i = NextUtf8Char(s, i)
 | 
| 174 |         num_chars += 1
 | 
| 175 |     return num_chars
 | 
| 176 | 
 | 
| 177 | 
 | 
| 178 | def AdvanceUtf8Chars(s, num_chars, byte_offset):
 | 
| 179 |     # type: (str, int, int) -> int
 | 
| 180 |     """Starting from byte offset, advance by N UTF-8 runes
 | 
| 181 | 
 | 
| 182 |     Returns a byte offset.
 | 
| 183 | 
 | 
| 184 |     Used for shell slicing.
 | 
| 185 |     """
 | 
| 186 |     num_bytes = len(s)
 | 
| 187 |     i = byte_offset  # current byte position
 | 
| 188 | 
 | 
| 189 |     for _ in xrange(num_chars):
 | 
| 190 |         # Neither bash or zsh checks out of bounds for slicing.  Either begin or
 | 
| 191 |         # length.
 | 
| 192 |         if i >= num_bytes:
 | 
| 193 |             return i
 | 
| 194 |             #raise RuntimeError('Out of bounds')
 | 
| 195 | 
 | 
| 196 |         i = NextUtf8Char(s, i)
 | 
| 197 | 
 | 
| 198 |     return i
 | 
| 199 | 
 | 
| 200 | 
 | 
| 201 | # Limited Unicode codepoints for whitespace characters.
 | 
| 202 | # Oils intentionally does not include characters from <USP>, as that set
 | 
| 203 | # depends on the version of the Unicode standard used.
 | 
| 204 | #
 | 
| 205 | # See discussion on the original pull request which added this list here:
 | 
| 206 | #
 | 
| 207 | #   https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
 | 
| 208 | #
 | 
| 209 | # See also the Mozilla Javascript documentation, and the note on how
 | 
| 210 | # changes to the standard affected Javascript:
 | 
| 211 | #
 | 
| 212 | #     https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space
 | 
| 213 | 
 | 
| 214 | SPACES = [
 | 
| 215 |     0x0009,  # Horizontal tab (\t)
 | 
| 216 |     0x000A,  # Newline (\n)
 | 
| 217 |     0x000B,  # Vertical tab (\v)
 | 
| 218 |     0x000C,  # Form feed (\f)
 | 
| 219 |     0x000D,  # Carriage return (\r)
 | 
| 220 |     0x0020,  # Normal space
 | 
| 221 |     0x00A0,  # No-break space <NBSP>
 | 
| 222 |     0xFEFF,  # Zero-width no-break space <ZWNBSP>
 | 
| 223 | ]
 | 
| 224 | 
 | 
| 225 | 
 | 
| 226 | def _IsSpace(codepoint):
 | 
| 227 |     # type: (int) -> bool
 | 
| 228 |     return codepoint in SPACES
 | 
| 229 | 
 | 
| 230 | 
 | 
| 231 | def StartsWithWhitespaceByteRange(s):
 | 
| 232 |     # type: (str) -> Tuple[int, int]
 | 
| 233 |     """Returns the range of 's' which has leading whitespace characters.
 | 
| 234 | 
 | 
| 235 |     If 's' has no leading whitespace, an valid but empty range is returned.
 | 
| 236 | 
 | 
| 237 |     The returned range is given as byte positions, and is a half-open range
 | 
| 238 |     "[start, end)" which is returned as a tuple.
 | 
| 239 | 
 | 
| 240 |     Used for shell functions like 'trimStart' to match then trim whitespace.
 | 
| 241 |     """
 | 
| 242 |     len_s = len(s)
 | 
| 243 |     i = 0
 | 
| 244 |     while i < len_s:
 | 
| 245 |         codepoint = DecodeUtf8Char(s, i)
 | 
| 246 |         if not _IsSpace(codepoint):
 | 
| 247 |             break
 | 
| 248 | 
 | 
| 249 |         try:
 | 
| 250 |             i = NextUtf8Char(s, i)
 | 
| 251 |         except error.Strict:
 | 
| 252 |             assert False, "DecodeUtf8Char should have caught any encoding errors"
 | 
| 253 | 
 | 
| 254 |     start = 0
 | 
| 255 |     end = i
 | 
| 256 |     return (start, end)
 | 
| 257 | 
 | 
| 258 | 
 | 
| 259 | def EndsWithWhitespaceByteRange(s):
 | 
| 260 |     # type: (str) -> Tuple[int, int]
 | 
| 261 |     """Returns the range of 's' which has trailing whitespace characters.
 | 
| 262 | 
 | 
| 263 |     If 's' has no leading whitespace, an valid but empty range is returned.
 | 
| 264 | 
 | 
| 265 |     The returned range is given as byte positions, and is a half-open range
 | 
| 266 |     "[start, end)" which is returned as a tuple.
 | 
| 267 | 
 | 
| 268 |     Used for shell functions like 'trimEnd' to match then trim whitespace.
 | 
| 269 |     """
 | 
| 270 |     len_s = len(s)
 | 
| 271 |     i = len_s
 | 
| 272 |     while i > 0:
 | 
| 273 |         # TODO: Gracefully handle surrogate pairs and overlong encodings when
 | 
| 274 |         # finding the start of each character.
 | 
| 275 |         prev = PreviousUtf8Char(s, i)
 | 
| 276 | 
 | 
| 277 |         codepoint = DecodeUtf8Char(s, prev)
 | 
| 278 |         if not _IsSpace(codepoint):
 | 
| 279 |             break
 | 
| 280 | 
 | 
| 281 |         i = prev
 | 
| 282 | 
 | 
| 283 |     start = i
 | 
| 284 |     end = len_s
 | 
| 285 |     return (start, end)
 | 
| 286 | 
 | 
| 287 | 
 | 
| 288 | # Implementation without Python regex:
 | 
| 289 | #
 | 
| 290 | # (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
 | 
| 291 | # regexec.  in a loop.  fnmatch() does NOT given positions of matches.
 | 
| 292 | #
 | 
| 293 | # (2) Strip -- % %% # ## -
 | 
| 294 | #
 | 
| 295 | # a. Fast path for constant strings.
 | 
| 296 | # b. Convert to POSIX extended regex, to see if it matches at ALL.  If it
 | 
| 297 | # doesn't match, short circuit out?  We can't do this with fnmatch.
 | 
| 298 | # c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
 | 
| 299 | #
 | 
| 300 | # - # shortest prefix - [:1], [:2], [:3] until it matches
 | 
| 301 | # - ## longest prefix - [:-1] [:-2], [:3].  Works because fnmatch does not
 | 
| 302 | #                       match prefixes, it matches EXACTLY.
 | 
| 303 | # - % shortest suffix - [-1:] [-2:] [-3:] ...
 | 
| 304 | # - %% longest suffix - [1:] [2:] [3:]
 | 
| 305 | #
 | 
| 306 | # See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
 | 
| 307 | # mksh.  Dash doesn't implement it.
 | 
| 308 | 
 | 
| 309 | # TODO:
 | 
| 310 | # - Unicode support: Convert both pattern, string, and replacement to unicode,
 | 
| 311 | #   then the result back at the end.
 | 
| 312 | # - Compile time errors for [[:space:]] ?
 | 
| 313 | 
 | 
| 314 | 
 | 
| 315 | def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
 | 
| 316 |     # type: (str, Token, str, bool) -> str
 | 
| 317 |     """Helper for ${x#prefix} and family."""
 | 
| 318 | 
 | 
| 319 |     id_ = op_tok.id
 | 
| 320 | 
 | 
| 321 |     # Fast path for constant strings.
 | 
| 322 |     # TODO: Should be LooksLikeExtendedGlob!
 | 
| 323 |     if not is_extglob and not glob_.LooksLikeGlob(arg):
 | 
| 324 |         # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[).  So
 | 
| 325 |         # reverse it.  NOTE: We also do this check in Globber.Expand().  It would
 | 
| 326 |         # be nice to somehow store the original string rather than
 | 
| 327 |         # escaping/unescaping.
 | 
| 328 |         arg = glob_.GlobUnescape(arg)
 | 
| 329 | 
 | 
| 330 |         if id_ in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
 | 
| 331 |             # explicit check for non-empty arg (len for mycpp)
 | 
| 332 |             if len(arg) and s.startswith(arg):
 | 
| 333 |                 return s[len(arg):]
 | 
| 334 |             else:
 | 
| 335 |                 return s
 | 
| 336 | 
 | 
| 337 |         elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
 | 
| 338 |             # need explicit check for non-empty arg (len for mycpp)
 | 
| 339 |             if len(arg) and s.endswith(arg):
 | 
| 340 |                 return s[:-len(arg)]
 | 
| 341 |             else:
 | 
| 342 |                 return s
 | 
| 343 | 
 | 
| 344 |         # These operators take glob arguments, we don't implement that obscure case.
 | 
| 345 |         elif id_ == Id.VOp1_Comma:  # Only lowercase the first letter
 | 
| 346 |             if arg != '':
 | 
| 347 |                 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
 | 
| 348 |             if len(s):
 | 
| 349 |                 return s[0].lower() + s[1:]
 | 
| 350 |             else:
 | 
| 351 |                 return s
 | 
| 352 | 
 | 
| 353 |         elif id_ == Id.VOp1_DComma:
 | 
| 354 |             if arg != '':
 | 
| 355 |                 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
 | 
| 356 |             return s.lower()
 | 
| 357 | 
 | 
| 358 |         elif id_ == Id.VOp1_Caret:  # Only uppercase the first letter
 | 
| 359 |             if arg != '':
 | 
| 360 |                 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
 | 
| 361 |             if len(s):
 | 
| 362 |                 return s[0].upper() + s[1:]
 | 
| 363 |             else:
 | 
| 364 |                 return s
 | 
| 365 | 
 | 
| 366 |         elif id_ == Id.VOp1_DCaret:
 | 
| 367 |             if arg != '':
 | 
| 368 |                 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
 | 
| 369 |             return s.upper()
 | 
| 370 | 
 | 
| 371 |         else:  # e.g. ^ ^^ , ,,
 | 
| 372 |             raise AssertionError(id_)
 | 
| 373 | 
 | 
| 374 |     # For patterns, do fnmatch() in a loop.
 | 
| 375 |     #
 | 
| 376 |     # TODO:
 | 
| 377 |     # - Another potential fast path:
 | 
| 378 |     #   v=aabbccdd
 | 
| 379 |     #   echo ${v#*b}  # strip shortest prefix
 | 
| 380 |     #
 | 
| 381 |     # If the whole thing doesn't match '*b*', then no test can succeed.  So we
 | 
| 382 |     # can fail early.  Conversely echo ${v%%c*} and '*c*'.
 | 
| 383 |     #
 | 
| 384 |     # (Although honestly this whole construct is nuts and should be deprecated.)
 | 
| 385 | 
 | 
| 386 |     n = len(s)
 | 
| 387 | 
 | 
| 388 |     if id_ == Id.VOp1_Pound:  # shortest prefix
 | 
| 389 |         # 'abcd': match '', 'a', 'ab', 'abc', ...
 | 
| 390 |         i = 0
 | 
| 391 |         while True:
 | 
| 392 |             assert i <= n
 | 
| 393 |             #log('Matching pattern %r with %r', arg, s[:i])
 | 
| 394 |             if libc.fnmatch(arg, s[:i]):
 | 
| 395 |                 return s[i:]
 | 
| 396 |             if i >= n:
 | 
| 397 |                 break
 | 
| 398 |             i = NextUtf8Char(s, i)
 | 
| 399 |         return s
 | 
| 400 | 
 | 
| 401 |     elif id_ == Id.VOp1_DPound:  # longest prefix
 | 
| 402 |         # 'abcd': match 'abc', 'ab', 'a'
 | 
| 403 |         i = n
 | 
| 404 |         while True:
 | 
| 405 |             assert i >= 0
 | 
| 406 |             #log('Matching pattern %r with %r', arg, s[:i])
 | 
| 407 |             if libc.fnmatch(arg, s[:i]):
 | 
| 408 |                 return s[i:]
 | 
| 409 |             if i == 0:
 | 
| 410 |                 break
 | 
| 411 |             i = PreviousUtf8Char(s, i)
 | 
| 412 |         return s
 | 
| 413 | 
 | 
| 414 |     elif id_ == Id.VOp1_Percent:  # shortest suffix
 | 
| 415 |         # 'abcd': match 'abcd', 'abc', 'ab', 'a'
 | 
| 416 |         i = n
 | 
| 417 |         while True:
 | 
| 418 |             assert i >= 0
 | 
| 419 |             #log('Matching pattern %r with %r', arg, s[:i])
 | 
| 420 |             if libc.fnmatch(arg, s[i:]):
 | 
| 421 |                 return s[:i]
 | 
| 422 |             if i == 0:
 | 
| 423 |                 break
 | 
| 424 |             i = PreviousUtf8Char(s, i)
 | 
| 425 |         return s
 | 
| 426 | 
 | 
| 427 |     elif id_ == Id.VOp1_DPercent:  # longest suffix
 | 
| 428 |         # 'abcd': match 'abc', 'bc', 'c', ...
 | 
| 429 |         i = 0
 | 
| 430 |         while True:
 | 
| 431 |             assert i <= n
 | 
| 432 |             #log('Matching pattern %r with %r', arg, s[:i])
 | 
| 433 |             if libc.fnmatch(arg, s[i:]):
 | 
| 434 |                 return s[:i]
 | 
| 435 |             if i >= n:
 | 
| 436 |                 break
 | 
| 437 |             i = NextUtf8Char(s, i)
 | 
| 438 |         return s
 | 
| 439 | 
 | 
| 440 |     else:
 | 
| 441 |         raise NotImplementedError(ui.PrettyId(id_))
 | 
| 442 | 
 | 
| 443 | 
 | 
| 444 | def _AllMatchPositions(s, regex):
 | 
| 445 |     # type: (str, str) -> List[Tuple[int, int]]
 | 
| 446 |     """Returns a list of all (start, end) match positions of the regex against
 | 
| 447 |     s.
 | 
| 448 | 
 | 
| 449 |     (If there are no matches, it returns the empty list.)
 | 
| 450 |     """
 | 
| 451 |     matches = []  # type: List[Tuple[int, int]]
 | 
| 452 |     pos = 0
 | 
| 453 |     n = len(s)
 | 
| 454 |     while pos < n:  # needed to prevent infinite loop in (.*) case
 | 
| 455 |         m = libc.regex_first_group_match(regex, s, pos)
 | 
| 456 |         if m is None:
 | 
| 457 |             break
 | 
| 458 |         matches.append(m)
 | 
| 459 |         start, end = m
 | 
| 460 |         pos = end  # advance position
 | 
| 461 |     return matches
 | 
| 462 | 
 | 
| 463 | 
 | 
| 464 | def _PatSubAll(s, regex, replace_str):
 | 
| 465 |     # type: (str, str, str) -> str
 | 
| 466 |     parts = []  # type: List[str]
 | 
| 467 |     prev_end = 0
 | 
| 468 |     for start, end in _AllMatchPositions(s, regex):
 | 
| 469 |         parts.append(s[prev_end:start])
 | 
| 470 |         parts.append(replace_str)
 | 
| 471 |         prev_end = end
 | 
| 472 |     parts.append(s[prev_end:])
 | 
| 473 |     return ''.join(parts)
 | 
| 474 | 
 | 
| 475 | 
 | 
| 476 | class GlobReplacer(object):
 | 
| 477 | 
 | 
| 478 |     def __init__(self, regex, replace_str, slash_tok):
 | 
| 479 |         # type: (str, str, Token) -> None
 | 
| 480 | 
 | 
| 481 |         # TODO: It would be nice to cache the compilation of the regex here,
 | 
| 482 |         # instead of just the string.  That would require more sophisticated use of
 | 
| 483 |         # the Python/C API in libc.c, which we might want to avoid.
 | 
| 484 |         self.regex = regex
 | 
| 485 |         self.replace_str = replace_str
 | 
| 486 |         self.slash_tok = slash_tok
 | 
| 487 | 
 | 
| 488 |     def __repr__(self):
 | 
| 489 |         # type: () -> str
 | 
| 490 |         return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)
 | 
| 491 | 
 | 
| 492 |     def Replace(self, s, op):
 | 
| 493 |         # type: (str, suffix_op.PatSub) -> str
 | 
| 494 | 
 | 
| 495 |         regex = '(%s)' % self.regex  # make it a group
 | 
| 496 | 
 | 
| 497 |         if op.replace_mode == Id.Lit_Slash:
 | 
| 498 |             # Avoid infinite loop when replacing all copies of empty string
 | 
| 499 |             if len(self.regex) == 0:
 | 
| 500 |                 return s
 | 
| 501 | 
 | 
| 502 |             try:
 | 
| 503 |                 # loop over matches
 | 
| 504 |                 return _PatSubAll(s, regex, self.replace_str)
 | 
| 505 |             except RuntimeError as e:
 | 
| 506 |                 # Not sure if this is possible since we convert from glob:
 | 
| 507 |                 # libc.regex_first_group_match raises RuntimeError on regex syntax
 | 
| 508 |                 # error.
 | 
| 509 |                 msg = e.message  # type: str
 | 
| 510 |                 e_die('Error matching regex %r: %s' % (regex, msg),
 | 
| 511 |                       self.slash_tok)
 | 
| 512 | 
 | 
| 513 |         if op.replace_mode == Id.Lit_Pound:
 | 
| 514 |             regex = '^' + regex
 | 
| 515 |         elif op.replace_mode == Id.Lit_Percent:
 | 
| 516 |             regex = regex + '$'
 | 
| 517 | 
 | 
| 518 |         m = libc.regex_first_group_match(regex, s, 0)
 | 
| 519 |         #log('regex = %r, s = %r, match = %r', regex, s, m)
 | 
| 520 |         if m is None:
 | 
| 521 |             return s
 | 
| 522 |         start, end = m
 | 
| 523 |         return s[:start] + self.replace_str + s[end:]
 | 
| 524 | 
 | 
| 525 | 
 | 
| 526 | def ShellQuoteB(s):
 | 
| 527 |     # type: (str) -> str
 | 
| 528 |     """Quote by adding backslashes.
 | 
| 529 | 
 | 
| 530 |     Used for autocompletion, so it's friendlier for display on the
 | 
| 531 |     command line. We use the strategy above for other use cases.
 | 
| 532 |     """
 | 
| 533 |     # There's no way to escape a newline!  Bash prints ^J for some reason, but
 | 
| 534 |     # we're more explicit.  This will happen if there's a newline on a file
 | 
| 535 |     # system or a completion plugin returns a newline.
 | 
| 536 | 
 | 
| 537 |     # NOTE: tabs CAN be escaped with \.
 | 
| 538 |     s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')
 | 
| 539 | 
 | 
| 540 |     # ~ for home dir
 | 
| 541 |     # ! for history
 | 
| 542 |     # * [] ? for glob
 | 
| 543 |     # {} for brace expansion
 | 
| 544 |     # space because it separates words
 | 
| 545 |     return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')
 |