1 | #!/usr/bin/env python2
|
2 | """
|
3 | braces.py - Implementation of {andy,bob}@example.com
|
4 |
|
5 | NOTE: bash implements brace expansion in the braces.c file (835 lines). It
|
6 | uses goto!
|
7 |
|
8 | Possible optimization flags for Compound:
|
9 | - has Lit_LBrace, LitRBrace -- set during word_parse phase
|
10 | - if it has both, then do BraceDetect
|
11 | - has BracedTuple -- set during BraceDetect
|
12 | - if it does, then do the expansion
|
13 | - has Lit_Star, ?, [ ] -- globbing?
|
14 | - but after expansion do you still have those flags?
|
15 | """
|
16 | from __future__ import print_function
|
17 |
|
18 | from _devbuild.gen.id_kind_asdl import Id, Id_t
|
19 | from _devbuild.gen.syntax_asdl import (
|
20 | Token,
|
21 | CompoundWord,
|
22 | word,
|
23 | word_e,
|
24 | word_t,
|
25 | word_part,
|
26 | word_part_e,
|
27 | word_part_t,
|
28 | )
|
29 | from core.error import p_die
|
30 | from frontend import lexer
|
31 | from frontend import match
|
32 | from mycpp import mylib
|
33 | from mycpp.mylib import log, tagswitch
|
34 | from osh import word_
|
35 |
|
36 | from typing import List, Optional, cast, TYPE_CHECKING
|
37 | if TYPE_CHECKING:
|
38 | from frontend.match import SimpleLexer
|
39 |
|
40 | _ = log
|
41 |
|
# Step has to be strictly positive or negative, so we can use 0 for 'not
# specified'.  _RangeParser.Parse() fills in the real default (+1 or -1).
NO_STEP = 0
|
45 |
|
46 |
|
47 | # The brace language has no syntax errors! But we still need to abort the
|
48 | # parse. TODO: Should we expose a strict version later?
|
class _NotARange(Exception):
    """Signals that a brace body isn't a valid {x..y} range.

    Callers catch this and leave the word unexpanded -- it's control flow,
    not a user-visible error.
    """

    def __init__(self, s):
        # type: (str) -> None
        # The message is intentionally discarded; only the exception type
        # matters to callers.
        pass
|
54 |
|
55 |
|
class _RangeParser(object):
    """Grammar for ranges:

    step = Dots Int
    int_range = Int Dots Int step?
    char_range = Char Dots Char step?
    range = (int_range | char_range) Eof  # ensure no extra tokens!
    """

    def __init__(self, lexer, blame_tok):
        # type: (SimpleLexer, Token) -> None
        self.lexer = lexer
        self.blame_tok = blame_tok  # location for all p_die() errors

        # Current token; _Next() advances these.
        self.token_type = Id.Undefined_Tok
        self.token_val = ''

    def _Next(self):
        # type: () -> None
        """Move to the next token."""
        self.token_type, self.token_val = self.lexer.Next()

    def _Eat(self, token_type):
        # type: (Id_t) -> str
        """Consume a token of the given type and return its value.

        Raises _NotARange on mismatch, which aborts the parse (the brace
        language has no syntax errors).
        """
        if self.token_type != token_type:
            raise _NotARange('Expected %d, got %d' %
                             (token_type, self.token_type))
        val = self.token_val
        self._Next()
        return val

    def _ParseStep(self):
        # type: () -> int
        self._Next()  # past Dots
        step = int(self._Eat(Id.Range_Int))
        if step == 0:
            p_die("Step can't be 0", self.blame_tok)
        return step

    def _ParseRange(self, range_kind):
        # type: (Id_t) -> word_part.BracedRange
        start = self.token_val
        self._Next()  # past Char

        self._Eat(Id.Range_Dots)
        end = self._Eat(range_kind)

        if self.token_type == Id.Range_Dots:
            step = self._ParseStep()
        else:
            step = NO_STEP

        part = word_part.BracedRange(self.blame_tok, range_kind, start, end,
                                     step)
        return part

    def _CheckStep(self, part, start_num, end_num, kind_str):
        # type: (word_part.BracedRange, int, int, str) -> None
        """Validate an explicit step, or fill in the implied default.

        Compares integers because mycpp doesn't support < on strings!
        kind_str is 'integer' or 'character', used only in error messages.
        """
        if start_num < end_num:
            if part.step == NO_STEP:
                part.step = 1
            if part.step <= 0:  # 0 step is not allowed
                p_die(
                    'Invalid step %d for ascending %s range' %
                    (part.step, kind_str), self.blame_tok)
        elif start_num > end_num:
            if part.step == NO_STEP:
                part.step = -1
            if part.step >= 0:  # 0 step is not allowed
                p_die(
                    'Invalid step %d for descending %s range' %
                    (part.step, kind_str), self.blame_tok)
        else:
            # {1..1} or {a..a} singleton range is dumb but I suppose consistent
            part.step = 1

    def Parse(self):
        # type: () -> word_part.BracedRange
        self._Next()
        if self.token_type == Id.Range_Int:
            part = self._ParseRange(self.token_type)
            # Check step validity and fill in a default
            self._CheckStep(part, int(part.start), int(part.end), 'integer')

        elif self.token_type == Id.Range_Char:
            part = self._ParseRange(self.token_type)
            # Check step validity and fill in a default
            self._CheckStep(part, ord(part.start[0]), ord(part.end[0]),
                            'character')

            # Check matching cases
            upper1 = part.start.isupper()
            upper2 = part.end.isupper()
            if upper1 != upper2:
                p_die('Mismatched cases in character range', self.blame_tok)

        else:
            raise _NotARange('')

        # prevent unexpected trailing tokens
        self._Eat(Id.Eol_Tok)
        return part
|
177 |
|
178 |
|
def _RangePartDetect(tok):
    # type: (Token) -> Optional[word_part.BracedRange]
    """Parse the token and return a new word_part if it looks like a range.

    Returns None if the token text isn't a valid range expression.
    """
    lx = match.BraceRangeLexer(lexer.TokenVal(tok))
    p = _RangeParser(lx, tok)
    try:
        part = p.Parse()
    except _NotARange:  # exception value is never used
        return None
    return part
|
190 |
|
191 |
|
class _StackFrame(object):
    """State for one nesting level of {} during BraceDetect.

    A frame is pushed at Lit_LBrace and popped at the matching Lit_RBrace.
    """

    def __init__(self, cur_parts):
        # type: (List[word_part_t]) -> None
        self.cur_parts = cur_parts  # parts accumulated before the '{' (prefix)
        self.alt_part = word_part.BracedTuple([])  # collects the alternatives
        self.saw_comma = False  # {foo} without a comma is NOT an alternation
|
200 |
|
def BraceDetect(w):
    # type: (CompoundWord) -> Optional[word.BracedTree]
    """Return a new word if the input word looks like a brace expansion.

    e.g. {a,b} or {1..10..2}
    Do we want to accept {01..02} ?  zsh does make some attempt to do this too.

    NOTE: This is an iterative algorithm that uses a stack.  The grammar-based
    approach didn't seem natural.

    It's not LL(1) because of 'part*'.  And not LL(k) even?  Maybe it could be
    handled with an LR parser?  In any case the imperative algorithm with
    'early return' for a couple cases is fairly simple.

    Grammar:
      # an alternative is a literal, possibly empty, or another brace_expr

      part = <any part except Literal>
      alt = part* | brace_expr

      # a brace_expr is group of at least 2 braced and comma-separated
      # alternatives, with optional prefix and suffix.
      brace_expr = part* '{' alt ',' alt (',' alt)* '}' part*
    """
    # Errors:
    # }a{ - stack depth dips below 0
    # {a,b}{ - Stack depth doesn't end at 0
    # {a} - no comma, and also not a numeric range

    cur_parts = []  # type: List[word_part_t]
    stack = []  # type: List[_StackFrame]

    found = False

    for part in w.parts:
        append = True  # should 'part' pass through to cur_parts unchanged?
        UP_part = part
        if part.tag() == word_part_e.Literal:
            part = cast(Token, UP_part)
            id_ = part.id
            if id_ == Id.Lit_LBrace:
                # Save prefix parts.  Start new parts list.
                new_frame = _StackFrame(cur_parts)
                stack.append(new_frame)
                cur_parts = []  # clear
                append = False
                found = True  # assume found, but can early exit with None later

            elif id_ == Id.Lit_Comma:  # Append a new alternative.
                # NOTE: Should we allow this:
                # ,{a,b}
                # or force this:
                # \,{a,b}
                # ? We're forcing braces right now but not commas.
                if len(stack):
                    stack[-1].saw_comma = True
                    stack[-1].alt_part.words.append(CompoundWord(cur_parts))
                    cur_parts = []  # clear
                    append = False

            elif id_ == Id.Lit_RBrace:
                if len(stack) == 0:  # e.g. echo }a -- unbalanced }
                    return None  # do not expand ANYTHING because of invalid syntax

                # Detect {1..10} and {1..10..2}
                range_part = None  # type: Optional[word_part_t]
                # only allow {1..3}, not {a,1..3}
                if not stack[-1].saw_comma and len(cur_parts) == 1:
                    # It must be ONE part.  For example, -1..-100..-2 is
                    # initially lexed as a single Lit_Chars token.
                    part2 = cur_parts[0]
                    if part2.tag() == word_part_e.Literal:
                        tok = cast(Token, part2)
                        if tok.id == Id.Lit_Chars:
                            range_part = _RangePartDetect(tok)
                            if range_part:
                                frame = stack.pop()
                                cur_parts = frame.cur_parts
                                cur_parts.append(range_part)
                                append = False

                # It doesn't look like a range -- process it as the last
                # element in {a,b,c}
                if not range_part:
                    if not stack[-1].saw_comma:
                        # {foo} is not a real alternative
                        return None  # early return

                    stack[-1].alt_part.words.append(CompoundWord(cur_parts))

                    frame = stack.pop()
                    cur_parts = frame.cur_parts
                    cur_parts.append(frame.alt_part)
                    append = False

        if append:
            cur_parts.append(part)

    if len(stack) != 0:  # e.g. echo {a,b}{ -- unbalanced {
        return None

    if found:
        return word.BracedTree(cur_parts)
    else:
        return None
|
311 |
|
312 |
|
def BraceDetectAll(words):
    # type: (List[CompoundWord]) -> List[word_t]
    """Return a new list of words, possibly with BracedTree instances."""
    out = []  # type: List[word_t]
    for w in words:
        # The shortest possible brace expansion is {,}, which has 3 parts.
        # This heuristic prevents a lot of garbage from being created, since
        # otherwise nearly every word would be checked.  We could be even more
        # precise but this is cheap.
        if len(w.parts) < 3:
            out.append(w)
            continue

        brace_tree = BraceDetect(w)
        if brace_tree:
            out.append(brace_tree)
        else:
            out.append(w)
    return out
|
328 |
|
329 |
|
330 | def _LeadingZeros(s):
|
331 | # type: (str) -> int
|
332 | n = 0
|
333 | for c in s:
|
334 | if c == '0':
|
335 | n += 1
|
336 | else:
|
337 | break
|
338 | return n
|
339 |
|
340 |
|
341 | def _IntToString(i, width):
|
342 | # type: (int, int) -> str
|
343 | s = str(i)
|
344 | n = len(s)
|
345 | if n < width: # width might be 0
|
346 | # pad with zeros
|
347 | pad = '0' * (width - n)
|
348 | return pad + s
|
349 | else:
|
350 | return s
|
351 |
|
352 |
|
def _RangeStrings(part):
    # type: (word_part.BracedRange) -> List[str]
    """Evaluate a BracedRange into the list of strings it denotes."""

    if part.kind == Id.Range_Int:
        nums = []  # type: List[str]

        z1 = _LeadingZeros(part.start)
        z2 = _LeadingZeros(part.end)

        # Zero-pad only when an endpoint is zero-padded; the more-padded
        # endpoint determines the width.
        if z1 == 0 and z2 == 0:
            width = 0
        elif z1 < z2:
            width = len(part.end)
        else:
            width = len(part.start)

        n = int(part.start)
        end = int(part.end)
        step = part.step

        # do-while: the first value is always emitted, then we advance until
        # we pass 'end' in the direction of the step.
        while True:
            nums.append(_IntToString(n, width))
            n += step
            if step > 0:
                if n > end:
                    break
            else:
                if n < end:
                    break

        return nums

    else:  # Id.Range_Char
        chars = []  # type: List[str]

        n = ord(part.start)
        ord_end = ord(part.end)
        step = part.step

        # Same do-while shape as the integer case, over code points.
        while True:
            chars.append(chr(n))
            n += step
            if step > 0:
                if n > ord_end:
                    break
            else:
                if n < ord_end:
                    break

        return chars
|
408 |
|
409 |
|
def _ExpandPart(
        parts,  # type: List[word_part_t]
        first_alt_index,  # type: int
        suffixes,  # type: List[List[word_part_t]]
):
    # type: (...) -> List[List[word_part_t]]
    """Mutually recursive with _BraceExpand.

    Expands the single alternation at first_alt_index, producing
    prefix + alternative + suffix for every (alternative, suffix) pair.

    Args:
      parts: input parts
      first_alt_index: index of the first BracedTuple or BracedRange
      suffixes: List of suffixes to append.

    Returns:
      One list of parts per expanded word.
    """
    out = []  # type: List[List[word_part_t]]

    prefix = parts[:first_alt_index]
    expand_part = parts[first_alt_index]

    UP_part = expand_part
    with tagswitch(expand_part) as case:
        if case(word_part_e.BracedTuple):
            expand_part = cast(word_part.BracedTuple, UP_part)
            # Call _BraceExpand on each of the inner words too!
            expanded_alts = []  # type: List[List[word_part_t]]
            for w in expand_part.words:
                expanded_alts.extend(_BraceExpand(w.parts))

            # Cross product of alternatives and suffixes
            for alt_parts in expanded_alts:
                for suffix in suffixes:
                    out_parts = []  # type: List[word_part_t]
                    out_parts.extend(prefix)
                    out_parts.extend(alt_parts)
                    out_parts.extend(suffix)
                    out.append(out_parts)

        elif case(word_part_e.BracedRange):
            expand_part = cast(word_part.BracedRange, UP_part)
            # Not mutually recursive with _BraceExpand
            strs = _RangeStrings(expand_part)

            # Often prefix and suffixes are empty, but there's not that much to
            # optimize
            # log('prefix %s, suffixes %s, strs %s', prefix, suffixes, strs)

            for s in strs:
                for suffix in suffixes:
                    out_parts_ = []  # type: List[word_part_t]
                    out_parts_.extend(prefix)

                    # TODO: Does it help to preserve location info?
                    # t = Token(Id.Lit_Chars, expand_part.locs[0], s)
                    t = lexer.DummyToken(Id.Lit_Chars, s)

                    out_parts_.append(t)
                    out_parts_.extend(suffix)
                    out.append(out_parts_)

        else:
            raise AssertionError()

    return out
|
471 |
|
472 |
|
def _BraceExpand(parts):
    # type: (List[word_part_t]) -> List[List[word_part_t]]
    """Mutually recursive with _ExpandPart.

    Locates the first alternation (BracedTuple or BracedRange) and delegates
    to _ExpandPart.  Nested {} are handled inside _ExpandPart; adjacent {}
    are handled here by recursing on the tail first.
    """

    # manual GC point, because brace expansion is a separate stage that does a
    # bunch of computation outside the interpreter
    mylib.MaybeCollect()

    first_alt_index = -1
    num_alts = 0
    for i, part in enumerate(parts):
        if part.tag() in (word_part_e.BracedTuple, word_part_e.BracedRange):
            if first_alt_index == -1:
                first_alt_index = i
            num_alts += 1
            if num_alts == 2:
                break  # two is enough to decide which case applies

    if num_alts == 0:
        return [parts]  # nothing to expand

    # NOTE: There are TWO recursive calls here, not just one -- one for
    # nested {}, and one for adjacent {}.  This is hard to do iteratively.
    tail_parts = parts[first_alt_index + 1:]
    if num_alts == 1:
        # Only one alternation: the literal tail is the single suffix.
        return _ExpandPart(parts, first_alt_index, [tail_parts])
    else:
        # More alternations follow: expand the tail (recursive call) and use
        # each result as a suffix.
        suffixes = _BraceExpand(tail_parts)
        return _ExpandPart(parts, first_alt_index, suffixes)
|
506 |
|
507 |
|
def BraceExpandWords(words):
    # type: (List[word_t]) -> List[CompoundWord]
    """Expand each BracedTree into flat CompoundWord instances.

    Plain Compound words pass through unchanged.  Tilde detection is re-run
    on each expanded word, since expansion can create new leading tildes.
    """
    out = []  # type: List[CompoundWord]
    for w in words:
        UP_w = w
        with tagswitch(w) as case:
            if case(word_e.BracedTree):
                w = cast(word.BracedTree, UP_w)
                # Note: for the case of {1..100000}, this is a flat list of Token.
                # Would be nice to optimize, but we don't really know the structure
                # ahead of time
                parts_list = _BraceExpand(w.parts)
                for parts in parts_list:
                    expanded = CompoundWord(parts)

                    # Now do tilde detection on brace-expanded word
                    ti = word_.TildeDetect2(expanded)
                    if ti:
                        out.append(ti)
                    else:
                        out.append(expanded)

            elif case(word_e.Compound):
                w = cast(CompoundWord, UP_w)

                # Already did tilde detection before expansion
                out.append(w)

            else:
                raise AssertionError(w.tag())

    return out
|