OILS / builtin / method_str.py View on Github | oilshell.org

514 lines, 302 significant
1"""YSH Str methods"""
2
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import loc_t, loc
6from _devbuild.gen.runtime_asdl import scope_e
7from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
8 eggex_ops_t, RegexMatch, LeftName)
9from builtin import pure_ysh
10from core import error
11from core import state
12from core import vm
13from frontend import typed_args
14from mycpp import mops
15from mycpp.mylib import log, tagswitch
16from osh import string_ops
17from ysh import expr_eval
18from ysh import regex_translate
19from ysh import val_ops
20
21import libc
22from libc import REG_NOTBOL
23
24from typing import cast, Any, List, Optional, Tuple
25
# NOTE(review): presumably this keeps the 'log' import referenced (so it is
# not reported as unused) while leaving it handy for debugging -- confirm.
_ = log
27
28
29def _StrMatchStart(s, p):
30 # type: (str, str) -> Tuple[bool, int, int]
31 """Returns the range of bytes in 's' that match string pattern `p`. the
32 pattern matches if 's' starts with all the characters in 'p'.
33
34 The returned match result is the tuple "(matched, begin, end)". 'matched'
35 is true if the pattern matched. 'begin' and 'end' give the half-open range
36 "[begin, end)" of byte indices from 's' for the match, and are a valid but
37 empty range if 'match' is false.
38
39 Used for shell functions like 'trimStart' when trimming a prefix string.
40 """
41 if s.startswith(p):
42 return (True, 0, len(p))
43 else:
44 return (False, 0, 0)
45
46
47def _StrMatchEnd(s, p):
48 # type: (str, str) -> Tuple[bool, int, int]
49 """Returns a match result for the bytes in 's' that match string pattern
50 `p`. the pattern matches if 's' ends with all the characters in 'p'.
51
52 The returned match result is the tuple "(matched, begin, end)". 'matched'
53 is true if the pattern matched. 'begin' and 'end' give the half-open range
54 "[begin, end)" of byte indices from 's' for the match, and are a valid but
55 empty range if 'match' is false.
56
57 Used for shell functions like 'trimEnd' when trimming a suffix string.
58 """
59 len_s = len(s)
60 if s.endswith(p):
61 return (True, len_s - len(p), len_s)
62 else:
63 return (False, len_s, len_s)
64
65
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' for the POSIX ERE 'ere' and return "(matched, begin, end)".

    The compile flags come from 'p.canonical_flags'; the pattern text itself
    is taken from 'ere', which the callers in this file derive from 'p' and
    anchor.  On a match, 'begin' and 'end' are the first two indices reported
    by libc.regex_search (the whole match; capture groups are ignored).  On no
    match the empty range "[empty_p, empty_p)" is returned.
    """
    indices = libc.regex_search(ere,
                                regex_translate.LibcFlags(p.canonical_flags),
                                s, 0)
    if indices is not None:
        return (True, indices[0], indices[1])
    return (False, empty_p, empty_p)
78
79
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[0, 0)" if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)
    if len(ere):
        # Group before anchoring.  In an ERE the anchor binds tighter than
        # '|', so a bare '^' prefix would anchor only the first alternative of
        # a pattern like 'a|b' and let 'b' match anywhere in the string.  The
        # extra group is harmless here because only the whole-match indices
        # are read; a '^' the pattern already carries stays valid inside it.
        ere = '^(' + ere + ')'
    else:
        # Avoid emitting an empty group '()' for an empty pattern.
        ere = '^'
    return _EggexMatchCommon(s, p, ere, 0)
99
100
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the end of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[len(s), len(s))" if 'matched' is false.

    Used for shell functions like 'trimEnd' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)
    if len(ere):
        # Always group and anchor.  Checking whether the text already ends in
        # '$' is not reliable: an escaped literal dollar also ends in '$', and
        # leaving that pattern unanchored makes the search report the leftmost
        # match anywhere in 's'.  Grouping also keeps the anchor from binding
        # to only the last alternative of a pattern like 'a|b'.  The extra
        # group is harmless because only the whole-match indices are read.
        ere = '(' + ere + ')$'
    else:
        # Avoid emitting an empty group '()' for an empty pattern.
        ere = '$'
    return _EggexMatchCommon(s, p, ere, len(s))
110
111
# Bit flags naming the end(s) of a string that an operation is anchored to.
# They are tested with '&' by the callables below; Trim also accepts the
# combination START | END, while HasAffix accepts exactly one of them.
START = 0b01
END = 0b10
114
115
class HasAffix(vm._Callable):
    """Implements `startsWith()` and `endsWith()`.

    The anchor given to the constructor (START or END) selects which end of
    the string the pattern is tested against.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)   # => bool
        string => startsWith(pattern_eggex) # => bool
        string => endsWith(pattern_str)     # => bool
        string => endsWith(pattern_eggex)   # => bool

        Raises error.TypeErr when the pattern is neither a Str nor an Eggex.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        text = rd.PosStr()
        pattern_val = rd.PosValue()

        # Exactly one of these is filled in by the switch below.
        literal = None  # type: str
        eggex = None  # type: value.Eggex
        with tagswitch(pattern_val) as case:
            if case(value_e.Str):
                literal = cast(value.Str, pattern_val).s
            elif case(value_e.Eggex):
                eggex = cast(value.Eggex, pattern_val)
            else:
                raise error.TypeErr(pattern_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0
        matched = False
        try:
            if literal is None:
                assert eggex is not None
                if at_start:
                    matched, _, _ = _EggexMatchStart(text, eggex)
                else:
                    matched, _, _ = _EggexMatchEnd(text, eggex)
            elif at_start:
                matched, _, _ = _StrMatchStart(text, literal)
            else:
                matched, _, _ = _StrMatchEnd(text, literal)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(matched)
165
166
class Trim(vm._Callable):
    """Implements `trimStart()`, `trimEnd()`, and `trim()`.

    The anchor given to the constructor selects the end(s) to trim: START,
    END, or START | END for both.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Raises error.TypeErr when a pattern is given that is neither a Str nor
        an Eggex.  An error.Strict raised while matching is re-raised as
        error.Expr.
        """
        text = rd.PosStr()
        pattern_val = rd.OptionalValue()

        # At most one of these is filled in; both stay None when the caller
        # passed no pattern.
        literal = None  # type: str
        eggex = None  # type: value.Eggex
        if pattern_val:
            with tagswitch(pattern_val) as case:
                if case(value_e.Str):
                    literal = cast(value.Str, pattern_val).s
                elif case(value_e.Eggex):
                    eggex = cast(value.Eggex, pattern_val)
                else:
                    raise error.TypeErr(pattern_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # [start, end) is the slice that survives; it begins as the whole
        # string and is narrowed from each requested side.
        start = 0
        end = len(text)
        try:
            if literal is not None:
                if trim_left:
                    _, _, start = _StrMatchStart(text, literal)
                if trim_right:
                    _, end, _ = _StrMatchEnd(text, literal)
            elif eggex is not None:
                if trim_left:
                    _, _, start = _EggexMatchStart(text, eggex)
                if trim_right:
                    _, end, _ = _EggexMatchEnd(text, eggex)
            else:
                # No pattern given: use the whitespace byte ranges computed by
                # string_ops.
                if trim_left:
                    _, start = string_ops.StartsWithWhitespaceByteRange(text)
                if trim_right:
                    end, _ = string_ops.EndsWithWhitespaceByteRange(text)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(text[start:end])
229
230
class Upper(vm._Callable):
    """Implements the Str method that upper-cases its receiver."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Return a value.Str holding the receiver converted with upper()."""
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        converted = text.upper()
        return value.Str(converted)
245
246
class Lower(vm._Callable):
    """Implements the Str method that lower-cases its receiver."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Return a value.Str holding the receiver converted with lower()."""
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        converted = text.lower()
        return value.Str(converted)
261
262
# Selectors for SearchMatch(which_method).  LEFT_MATCH anchors the pattern
# with '^' at the search position; any other value (SEARCH) leaves the pattern
# unanchored.
SEARCH = 0
LEFT_MATCH = 1
265
266
class SearchMatch(vm._Callable):
    """Regex search over a Str, selected by SEARCH or LEFT_MATCH.

    With LEFT_MATCH the pattern is anchored with '^' so it has to match at
    the position where the search starts; otherwise the pattern may match
    anywhere from that position on.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern may be an Eggex or a Str holding an ERE.  Returns a
        RegexMatch on success and value.Null when nothing matches.  Raises
        error.TypeErr when the pattern is neither an Eggex nor a Str.
        """
        text = rd.PosStr()

        pattern_val = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pattern_val) as case:
            if case(value_e.Eggex):
                eggex = cast(value.Eggex, pattern_val)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex)
                cflags = regex_translate.LibcFlags(eggex.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex.convert_funcs, eggex.convert_toks,
                    eggex.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain Str is used as the ERE text as-is, without any
                # capture metadata.
                ere = cast(value.Str, pattern_val).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pattern_val, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, text, eflags, pos)
        if indices is not None:
            return RegexMatch(text, indices, capture)

        return value.Null
322
323
class ctx_EvalReplace(object):
    """For $0, $1, $2, $3, ... replacements in Str => replace()

    Constructing the object binds the values; leaving the 'with' block
    removes them again.
    """

    def __init__(self, mem, arg0, argv):
        # type: (state.Mem, str, Optional[List[str]]) -> None
        self.mem = mem

        # argv will be None for Str => replace(Str, Expr); in that case no
        # frame is pushed, and none is popped on exit.
        self.pushed_argv = False
        if argv is not None:
            mem.argv_stack.append(state._ArgFrame(argv))
            self.pushed_argv = True

        # $0 needs to have lexical scoping, so it is stored with the other
        # locals.  "0" cannot be parsed as an lvalue, so storing arg0 under
        # that name is safe.
        assert mem.GetValue("0", scope_e.LocalOnly).tag() == value_e.Undef
        self.lval = LeftName("0", loc.Missing)
        mem.SetLocalName(self.lval, value.Str(arg0))

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value_, traceback):
        # type: (Any, Any, Any) -> None
        # Undo the constructor: reset $0 to Undef, then drop the argv frame if
        # one was pushed.
        self.mem.SetLocalName(self.lval, value.Undef)
        if self.pushed_argv:
            self.mem.argv_stack.pop()
353
354
class Replace(vm._Callable):
    """Implements `Str => replace()` for Str and Eggex patterns."""

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression and return its Str result.

        Raises error.TypeErr, blamed on 'blame_loc', when the expression
        evaluates to anything other than a Str.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        While a substitution expression is evaluated, $0 holds the matched
        text.  For Eggex patterns $1, $2, ... hold the capture groups and named
        captures are bound under their names.

        Raises error.TypeErr when the pattern is not an Eggex or Str, or when
        the substitution is not a Str or Expr.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with ctx_EvalReplace(self.mem, string_val.s, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            n = len(string)
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < n:
                # Once the search resumes past the start of the string, '^'
                # must no longer match at 'pos'.  Otherwise an anchored
                # pattern would match again after every replacement.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        # $0 cannot be named
                        arg0 = val_str
                        continue

                    argv.append(val_str)

                    # The per-group lists have no entry for group 0, so group
                    # N lives at index N - 1 (same as convert_funcs above).
                    name = eggex_val.capture_names[group - 1]
                    if name is not None:
                        named_vars.append((name, val))

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with ctx_EvalReplace(self.mem, arg0, argv):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                if start == end and pos < n:
                    # Zero-width match: searching again from the same position
                    # would find the same empty match forever.  Copy one
                    # character through unchanged to guarantee progress,
                    # stepping over UTF-8 continuation bytes (0b10xxxxxx) so
                    # that a multi-byte character is never split.
                    pos += 1
                    while pos < n and (ord(string[pos]) & 0xC0) == 0x80:
                        pos += 1
                    parts.append(string[end:pos])

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()