1 | """
|
2 | string_ops.py - String library functions that can be exposed with a saner syntax.
|
3 |
|
4 | OSH:
|
5 |
|
6 | local y=${x//a*/b}
|
7 |
|
8 | YSH:
|
9 |
|
10 | var y = x => sub('a*', 'b', :ALL)
|
11 |
|
12 | Pass x => sub('a*', 'b', :ALL) => var y
|
13 | """
|
14 |
|
from _devbuild.gen.id_kind_asdl import Id
from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
from core import pyutil
from core import ui
from core import error
from core.error import e_die, e_strict
from mycpp.mylib import log
from mycpp import mylib
from osh import glob_

import libc
import fastfunc

from typing import List, Tuple

_ = log

# Error types returned by fastfunc.Utf8DecodeOne
# Derived from Utf8Error enum from data_lang/utf8.h
UTF8_ERR_OVERLONG = -1  # Encodes a codepoint in more bytes than necessary
UTF8_ERR_SURROGATE = -2  # Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF)
UTF8_ERR_TOO_LARGE = -3  # Encodes a value greater than the max codepoint U+10FFFF
UTF8_ERR_BAD_ENCODING = -4  # Encoding doesn't conform to the UTF-8 bit patterns
UTF8_ERR_TRUNCATED_BYTES = -5  # It looks like there is another codepoint, but it has been truncated


def Utf8Error_str(error):
    # type: (int) -> str
    if error == UTF8_ERR_OVERLONG:
        return "UTF-8 decode: Overlong"
    if error == UTF8_ERR_SURROGATE:
        return "UTF-8 decode: Surrogate range"
    if error == UTF8_ERR_TOO_LARGE:
        return "UTF-8 decode: Integer too large"
    if error == UTF8_ERR_BAD_ENCODING:
        return "UTF-8 decode: Bad encoding"
    if error == UTF8_ERR_TRUNCATED_BYTES:
        return "UTF-8 decode: Truncated bytes"

    raise AssertionError(0)


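# Illustrative example for DecodeUtf8Char below, assuming the
# fastfunc.Utf8DecodeOne contract used in this file (it returns a
# (codepoint, bytes_read) pair, with a negative codepoint on error):
#
#   DecodeUtf8Char('\xce\xbc', 0)   # => 0x3BC, since 0xCE 0xBC encodes U+03BC
#   DecodeUtf8Char('\xff', 0)       # => raises error.Expr (bad encoding)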
def DecodeUtf8Char(s, start):
    # type: (str, int) -> int
    """Given a string and start index, decode the Unicode char immediately
    following the start index. The start location is in bytes and should be
    found using a function like NextUtf8Char or PreviousUtf8Char.

    If the codepoint is invalid, we raise an `error.Expr`. (This is different
    from {Next,Previous}Utf8Char, which raise an `error.Strict` on encoding
    errors.)
    """
    codepoint_or_error, _bytes_read = fastfunc.Utf8DecodeOne(s, start)
    if codepoint_or_error < 0:
        raise error.Expr(
            "%s at offset %d in string of %d bytes" %
            (Utf8Error_str(codepoint_or_error), start, len(s)), loc.Missing)
    return codepoint_or_error


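# Illustrative example for NextUtf8Char below: in 'a\xce\xbc' ('a' followed by
# the 2-byte encoding of U+03BC), NextUtf8Char(s, 0) == 1 and
# NextUtf8Char(s, 1) == 3, i.e. the offset just past the 2-byte character,
# which here is also the end of the string.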
def NextUtf8Char(s, i):
    # type: (str, int) -> int
    """Given a string and a byte offset, returns the byte position after the
    character at this position. Usually this is the position of the next
    character, but for the last character in the string, it's the position just
    past the end of the string.

    Validates UTF-8.
    """
    codepoint_or_error, bytes_read = fastfunc.Utf8DecodeOne(s, i)
    if codepoint_or_error < 0:
        e_strict(
            "%s at offset %d in string of %d bytes" %
            (Utf8Error_str(codepoint_or_error), i, len(s)), loc.Missing)
    return i + bytes_read


_INVALID_START = 'Invalid start of UTF-8 sequence'


def _Utf8CharLen(starting_byte):
    # type: (int) -> int
    if (starting_byte >> 7) == 0b0:
        return 1
    elif (starting_byte >> 5) == 0b110:
        return 2
    elif (starting_byte >> 4) == 0b1110:
        return 3
    elif (starting_byte >> 3) == 0b11110:
        return 4
    else:
        e_strict(_INVALID_START, loc.Missing)


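# Illustrative example for PreviousUtf8Char below: in 'a\xce\xbc' ('a' followed
# by the 2-byte sequence for U+03BC), PreviousUtf8Char(s, 3) skips the
# continuation byte 0xBC, stops on the start byte 0xCE at offset 1, checks that
# a 2-byte start matches the 2 bytes walked, and returns 1.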
def PreviousUtf8Char(s, i):
    # type: (str, int) -> int
    """Given a string and a byte offset, returns the position of the character
    before that offset. To start (find the first byte of the last character),
    pass len(s) for the initial value of i.

    Validates UTF-8.
    """
    # All bytes in a valid UTF-8 string have one of the following formats:
    #
    #   0xxxxxxx (1-byte char)
    #   110xxxxx (start of 2-byte char)
    #   1110xxxx (start of 3-byte char)
    #   11110xxx (start of 4-byte char)
    #   10xxxxxx (continuation byte)
    #
    # Any byte that starts with 10... MUST be a continuation byte,
    # otherwise it must be the start of a character (or just invalid
    # data).
    #
    # Walking backward, we stop at the first non-continuation byte
    # found. We try to interpret it as a valid UTF-8 character starting
    # byte, and check that it indicates the correct length, based on how
    # far we've moved from the original byte. Possible problems:
    #   * byte we stopped on does not have a valid value (e.g., 11111111)
    #   * start byte indicates more or fewer continuation bytes than we've seen
    #   * no start byte at beginning of array
    #
    # Note that because we are going backward, on malformed input, we
    # won't error out in the same place as when parsing the string
    # forwards as normal.
    orig_i = i

    while i > 0:
        i -= 1
        byte_as_int = mylib.ByteAt(s, i)
        if (byte_as_int >> 6) != 0b10:
            offset = orig_i - i
            if offset != _Utf8CharLen(byte_as_int):
                # Leaving a generic error for now, but if we want to, it's not
                # hard to calculate the position where things go wrong. Note
                # that offset might be more than 4, for an invalid utf-8 string.
                e_strict(_INVALID_START, loc.Missing)
            return i

    e_strict(_INVALID_START, loc.Missing)


def CountUtf8Chars(s):
    # type: (str) -> int
    """Returns the number of UTF-8 characters in the byte string 's'.

    TODO: Raise an exception rather than returning a string, so we can set the
    exit code of the command to 1?

        $ echo ${#bad}
        Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
        $ echo $?
        1
    """
    num_chars = 0
    num_bytes = len(s)
    i = 0
    while i < num_bytes:
        i = NextUtf8Char(s, i)
        num_chars += 1
    return num_chars


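# Illustrative example for AdvanceUtf8Chars below, as used for shell slicing
# like ${s:0:2}: with s = 'a\xce\xbcc' ('a', U+03BC, 'c'),
# AdvanceUtf8Chars(s, 2, 0) == 3, the byte offset just past the first two
# runes. Offsets past the end are clamped rather than raising, matching
# bash/zsh behavior noted in the loop below.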
def AdvanceUtf8Chars(s, num_chars, byte_offset):
    # type: (str, int, int) -> int
    """Starting from a byte offset, advance by N UTF-8 runes.

    Returns a byte offset.

    Used for shell slicing.
    """
    num_bytes = len(s)
    i = byte_offset  # current byte position

    for _ in xrange(num_chars):
        # Neither bash nor zsh checks out of bounds for slicing, either for
        # begin or length.
        if i >= num_bytes:
            return i
            #raise RuntimeError('Out of bounds')

        i = NextUtf8Char(s, i)

    return i


# Limited Unicode codepoints for whitespace characters.
# Oils intentionally does not include characters from <USP>, as that set
# depends on the version of the Unicode standard used.
#
# See the discussion on the original pull request which added this list:
#
#   https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
#
# See also the Mozilla JavaScript documentation, and the note on how
# changes to the standard affected JavaScript:
#
#   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space

SPACES = [
    0x0009,  # Horizontal tab (\t)
    0x000A,  # Newline (\n)
    0x000B,  # Vertical tab (\v)
    0x000C,  # Form feed (\f)
    0x000D,  # Carriage return (\r)
    0x0020,  # Normal space
    0x00A0,  # No-break space <NBSP>
    0xFEFF,  # Zero-width no-break space <ZWNBSP>
]


def _IsSpace(codepoint):
    # type: (int) -> bool
    return codepoint in SPACES


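# Illustrative example for the two functions below: with s = '\xc2\xa0x '
# (a no-break space U+00A0, then 'x', then an ASCII space),
# StartsWithWhitespaceByteRange(s) == (0, 2) and
# EndsWithWhitespaceByteRange(s) == (3, 4); both are half-open byte ranges.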
def StartsWithWhitespaceByteRange(s):
    # type: (str) -> Tuple[int, int]
    """Returns the range of 's' which has leading whitespace characters.

    If 's' has no leading whitespace, a valid but empty range is returned.

    The returned range is given as byte positions, and is a half-open range
    "[start, end)" which is returned as a tuple.

    Used for shell functions like 'trimStart' to match then trim whitespace.
    """
    len_s = len(s)
    i = 0
    while i < len_s:
        codepoint = DecodeUtf8Char(s, i)
        if not _IsSpace(codepoint):
            break

        try:
            i = NextUtf8Char(s, i)
        except error.Strict:
            assert False, "DecodeUtf8Char should have caught any encoding errors"

    start = 0
    end = i
    return (start, end)


def EndsWithWhitespaceByteRange(s):
    # type: (str) -> Tuple[int, int]
    """Returns the range of 's' which has trailing whitespace characters.

    If 's' has no trailing whitespace, a valid but empty range is returned.

    The returned range is given as byte positions, and is a half-open range
    "[start, end)" which is returned as a tuple.

    Used for shell functions like 'trimEnd' to match then trim whitespace.
    """
    len_s = len(s)
    i = len_s
    while i > 0:
        # TODO: Gracefully handle surrogate pairs and overlong encodings when
        # finding the start of each character.
        prev = PreviousUtf8Char(s, i)

        codepoint = DecodeUtf8Char(s, prev)
        if not _IsSpace(codepoint):
            break

        i = prev

    start = i
    end = len_s
    return (start, end)


# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
#     regexec in a loop. fnmatch() does NOT give the positions of matches.
#
# (2) Strip -- % %% # ##
#
#     a. Fast path for constant strings.
#     b. Convert to POSIX extended regex, to see if it matches at ALL. If it
#        doesn't match, short circuit out? We can't do this with fnmatch.
#     c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
#
#        - #  shortest prefix - [:1], [:2], [:3] until it matches
#        - ## longest prefix - [:-1], [:-2], [:-3]. Works because fnmatch does
#          not match prefixes, it matches EXACTLY.
#        - %  shortest suffix - [-1:] [-2:] [-3:] ...
#        - %% longest suffix - [1:] [2:] [3:]
#
# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
# mksh. Dash doesn't implement it.

# TODO:
# - Unicode support: Convert the pattern, string, and replacement to unicode,
#   then convert the result back at the end.
# - Compile time errors for [[:space:]] ?


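# Worked example of the four strip operators (standard shell semantics, which
# DoUnarySuffixOp below implements via startswith/endswith or fnmatch):
#
#   v=aabbccdd
#   echo ${v#*b}    # bccdd  - strip shortest prefix matching *b
#   echo ${v##*b}   # ccdd   - strip longest prefix matching *b
#   echo ${v%c*}    # aabbc  - strip shortest suffix matching c*
#   echo ${v%%c*}   # aabb   - strip longest suffix matching c*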
def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
    # type: (str, Token, str, bool) -> str
    """Helper for ${x#prefix} and family."""

    id_ = op_tok.id

    # Fast path for constant strings.
    # TODO: Should be LooksLikeExtendedGlob!
    if not is_extglob and not glob_.LooksLikeGlob(arg):
        # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
        # reverse it. NOTE: We also do this check in Globber.Expand(). It would
        # be nice to somehow store the original string rather than
        # escaping/unescaping.
        arg = glob_.GlobUnescape(arg)

        if id_ in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
            # explicit check for non-empty arg (len for mycpp)
            if len(arg) and s.startswith(arg):
                return s[len(arg):]
            else:
                return s

        elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
            # need explicit check for non-empty arg (len for mycpp)
            if len(arg) and s.endswith(arg):
                return s[:-len(arg)]
            else:
                return s

        # These operators take glob arguments; we don't implement that obscure case.
        elif id_ == Id.VOp1_Comma:  # Only lowercase the first letter
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            if len(s):
                return s[0].lower() + s[1:]
            else:
                return s

        elif id_ == Id.VOp1_DComma:
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            return s.lower()

        elif id_ == Id.VOp1_Caret:  # Only uppercase the first letter
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            if len(s):
                return s[0].upper() + s[1:]
            else:
                return s

        elif id_ == Id.VOp1_DCaret:
            if arg != '':
                e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
            return s.upper()

        else:  # e.g. ^ ^^ , ,,
            raise AssertionError(id_)

    # For patterns, do fnmatch() in a loop.
    #
    # TODO:
    # - Another potential fast path:
    #   v=aabbccdd
    #   echo ${v#*b}  # strip shortest prefix
    #
    #   If the whole thing doesn't match '*b*', then no test can succeed. So we
    #   can fail early. Conversely echo ${v%%c*} and '*c*'.
    #
    #   (Although honestly this whole construct is nuts and should be deprecated.)

    n = len(s)

    if id_ == Id.VOp1_Pound:  # shortest prefix
        # 'abcd': match '', 'a', 'ab', 'abc', ...
        i = 0
        while True:
            assert i <= n
            #log('Matching pattern %r with %r', arg, s[:i])
            if libc.fnmatch(arg, s[:i]):
                return s[i:]
            if i >= n:
                break
            i = NextUtf8Char(s, i)
        return s

    elif id_ == Id.VOp1_DPound:  # longest prefix
        # 'abcd': match 'abcd', 'abc', 'ab', 'a', ...
        i = n
        while True:
            assert i >= 0
            #log('Matching pattern %r with %r', arg, s[:i])
            if libc.fnmatch(arg, s[:i]):
                return s[i:]
            if i == 0:
                break
            i = PreviousUtf8Char(s, i)
        return s

    elif id_ == Id.VOp1_Percent:  # shortest suffix
        # 'abcd': match '', 'd', 'cd', 'bcd', ...
        i = n
        while True:
            assert i >= 0
            #log('Matching pattern %r with %r', arg, s[i:])
            if libc.fnmatch(arg, s[i:]):
                return s[:i]
            if i == 0:
                break
            i = PreviousUtf8Char(s, i)
        return s

    elif id_ == Id.VOp1_DPercent:  # longest suffix
        # 'abcd': match 'abcd', 'bcd', 'cd', 'd', ...
        i = 0
        while True:
            assert i <= n
            #log('Matching pattern %r with %r', arg, s[i:])
            if libc.fnmatch(arg, s[i:]):
                return s[:i]
            if i >= n:
                break
            i = NextUtf8Char(s, i)
        return s

    else:
        raise NotImplementedError(ui.PrettyId(id_))


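# Illustrative example for _AllMatchPositions below, assuming
# libc.regex_first_group_match returns the (start, end) byte positions of the
# first group's match at or after the given position, or None (the contract
# relied on in the loop):
#
#   _AllMatchPositions('aXbXc', '(X)')  # => [(1, 2), (3, 4)]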
def _AllMatchPositions(s, regex):
    # type: (str, str) -> List[Tuple[int, int]]
    """Returns a list of all (start, end) match positions of the regex against
    s.

    (If there are no matches, it returns the empty list.)
    """
    matches = []  # type: List[Tuple[int, int]]
    pos = 0
    n = len(s)
    while pos < n:  # needed to prevent infinite loop in (.*) case
        m = libc.regex_first_group_match(regex, s, pos)
        if m is None:
            break
        matches.append(m)
        start, end = m
        pos = end  # advance position
    return matches


def _PatSubAll(s, regex, replace_str):
    # type: (str, str, str) -> str
    parts = []  # type: List[str]
    prev_end = 0
    for start, end in _AllMatchPositions(s, regex):
        parts.append(s[prev_end:start])
        parts.append(replace_str)
        prev_end = end
    parts.append(s[prev_end:])
    return ''.join(parts)


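# GlobReplacer below implements ${s/pat/repl} and friends. The replace_mode on
# the parsed suffix_op selects the variant (standard shell semantics):
#
#   ${s/X/y}    # replace the first match
#   ${s//X/y}   # Lit_Slash: replace all matches
#   ${s/#X/y}   # Lit_Pound: anchor the match at the start ('^' is prepended)
#   ${s/%X/y}   # Lit_Percent: anchor the match at the end ('$' is appended)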
class GlobReplacer(object):

    def __init__(self, regex, replace_str, slash_tok):
        # type: (str, str, Token) -> None

        # TODO: It would be nice to cache the compilation of the regex here,
        # instead of just the string. That would require more sophisticated use of
        # the Python/C API in libc.c, which we might want to avoid.
        self.regex = regex
        self.replace_str = replace_str
        self.slash_tok = slash_tok

    def __repr__(self):
        # type: () -> str
        return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)

    def Replace(self, s, op):
        # type: (str, suffix_op.PatSub) -> str

        regex = '(%s)' % self.regex  # make it a group

        if op.replace_mode == Id.Lit_Slash:
            # Avoid infinite loop when replacing all copies of empty string
            if len(self.regex) == 0:
                return s

            try:
                # loop over matches
                return _PatSubAll(s, regex, self.replace_str)
            except RuntimeError as e:
                # Not sure if this is possible since we convert from glob:
                # libc.regex_first_group_match raises RuntimeError on regex syntax
                # error.
                msg = e.message  # type: str
                e_die('Error matching regex %r: %s' % (regex, msg),
                      self.slash_tok)

        if op.replace_mode == Id.Lit_Pound:
            regex = '^' + regex
        elif op.replace_mode == Id.Lit_Percent:
            regex = regex + '$'

        m = libc.regex_first_group_match(regex, s, 0)
        #log('regex = %r, s = %r, match = %r', regex, s, m)
        if m is None:
            return s
        start, end = m
        return s[:start] + self.replace_str + s[end:]


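# Illustrative example for ShellQuoteB below, assuming pyutil.BackslashEscape
# prefixes each character from the given set with a backslash (newlines and
# carriage returns can't be escaped this way, so they become markers):
#
#   ShellQuoteB('foo bar!')   # => 'foo\\ bar\\!', i.e. foo\ bar\!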
def ShellQuoteB(s):
    # type: (str) -> str
    """Quote by adding backslashes.

    Used for autocompletion, so it's friendlier for display on the
    command line. We use the strategy above for other use cases.
    """
    # There's no way to escape a newline! Bash prints ^J for some reason, but
    # we're more explicit. This will happen if there's a newline on a file
    # system or a completion plugin returns a newline.

    # NOTE: tabs CAN be escaped with \.
    s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')

    # ~ for home dir
    # ! for history
    # * [] ? for glob
    # {} for brace expansion
    # space because it separates words
    return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')
|