OILS / osh / braces.py View on Github | oilshell.org

539 lines, 326 significant
1#!/usr/bin/env python2
2"""
3braces.py - Implementation of {andy,bob}@example.com
4
5NOTE: bash implements brace expansion in the braces.c file (835 lines). It
6uses goto!
7
8Possible optimization flags for Compound:
9- has Lit_LBrace, LitRBrace -- set during word_parse phase
10 - it if has both, then do BraceDetect
11- has BracedTuple -- set during BraceDetect
12 - if it does, then do the expansion
13- has Lit_Star, ?, [ ] -- globbing?
14 - but after expansion do you still have those flags?
15"""
16from __future__ import print_function
17
18from _devbuild.gen.id_kind_asdl import Id, Id_t
19from _devbuild.gen.syntax_asdl import (
20 Token,
21 CompoundWord,
22 word,
23 word_e,
24 word_t,
25 word_part,
26 word_part_e,
27 word_part_t,
28)
29from core.error import p_die
30from frontend import lexer
31from frontend import match
32from mycpp import mylib
33from mycpp.mylib import log, tagswitch
34from osh import word_
35
36from typing import List, Optional, cast, TYPE_CHECKING
37if TYPE_CHECKING:
38 from frontend.match import SimpleLexer
39
40_ = log
41
42# Step has to be strictly positive or negative, so we can use 0 for 'not
43# specified'.
44NO_STEP = 0
45
46
47# The brace language has no syntax errors! But we still need to abort the
48# parse. TODO: Should we expose a strict version later?
49class _NotARange(Exception):
50
51 def __init__(self, s):
52 # type: (str) -> None
53 pass
54
55
56class _RangeParser(object):
57 """Grammar for ranges:
58
59 step = Dots Int
60 int_range = Int Dots Int step?
61 char_range = Char Dots Char step?
62 range = (int_range | char_range) Eof # ensure no extra tokens!
63 """
64
65 def __init__(self, lexer, blame_tok):
66 # type: (SimpleLexer, Token) -> None
67 self.lexer = lexer
68 self.blame_tok = blame_tok
69
70 self.token_type = Id.Undefined_Tok
71 self.token_val = ''
72
73 def _Next(self):
74 # type: () -> None
75 """Move to the next token."""
76 self.token_type, self.token_val = self.lexer.Next()
77
78 def _Eat(self, token_type):
79 # type: (Id_t) -> str
80 if self.token_type != token_type:
81 raise _NotARange('Expected %d, got %d' %
82 (token_type, self.token_type))
83 val = self.token_val
84 self._Next()
85 return val
86
87 def _ParseStep(self):
88 # type: () -> int
89 self._Next() # past Dots
90 step = int(self._Eat(Id.Range_Int))
91 if step == 0:
92 p_die("Step can't be 0", self.blame_tok)
93 return step
94
95 def _ParseRange(self, range_kind):
96 # type: (Id_t) -> word_part.BracedRange
97 start = self.token_val
98 self._Next() # past Char
99
100 self._Eat(Id.Range_Dots)
101 end = self._Eat(range_kind)
102
103 if self.token_type == Id.Range_Dots:
104 step = self._ParseStep()
105 else:
106 step = NO_STEP
107
108 part = word_part.BracedRange(self.blame_tok, range_kind, start, end,
109 step)
110 return part
111
112 def Parse(self):
113 # type: () -> word_part.BracedRange
114 self._Next()
115 if self.token_type == Id.Range_Int:
116 part = self._ParseRange(self.token_type)
117
118 # Check step validity and fill in a default
119 start = int(part.start)
120 end = int(part.end)
121 if start < end:
122 if part.step == NO_STEP:
123 part.step = 1
124 if part.step <= 0: # 0 step is not allowed
125 p_die(
126 'Invalid step %d for ascending integer range' %
127 part.step, self.blame_tok)
128 elif start > end:
129 if part.step == NO_STEP:
130 part.step = -1
131 if part.step >= 0: # 0 step is not allowed
132 p_die(
133 'Invalid step %d for descending integer range' %
134 part.step, self.blame_tok)
135 else:
136 # {1..1} singleton range is dumb but I suppose consistent
137 part.step = 1
138
139 elif self.token_type == Id.Range_Char:
140 part = self._ParseRange(self.token_type)
141
142 # Compare integers because mycpp doesn't support < on strings!
143 start_num = ord(part.start[0])
144 end_num = ord(part.end[0])
145
146 # Check step validity and fill in a default
147 if start_num < end_num:
148 if part.step == NO_STEP:
149 part.step = 1
150 if part.step <= 0: # 0 step is not allowed
151 p_die(
152 'Invalid step %d for ascending character range' %
153 part.step, self.blame_tok)
154 elif start_num > end_num:
155 if part.step == NO_STEP:
156 part.step = -1
157 if part.step >= 0: # 0 step is not allowed
158 p_die(
159 'Invalid step %d for descending character range' %
160 part.step, self.blame_tok)
161 else:
162 # {a..a} singleton range is dumb but I suppose consistent
163 part.step = 1
164
165 # Check matching cases
166 upper1 = part.start.isupper()
167 upper2 = part.end.isupper()
168 if upper1 != upper2:
169 p_die('Mismatched cases in character range', self.blame_tok)
170
171 else:
172 raise _NotARange('')
173
174 # prevent unexpected trailing tokens
175 self._Eat(Id.Eol_Tok)
176 return part
177
178
179def _RangePartDetect(tok):
180 # type: (Token) -> Optional[word_part.BracedRange]
181 """Parse the token and return a new word_part if it looks like a range."""
182
183 lx = match.BraceRangeLexer(lexer.TokenVal(tok))
184 p = _RangeParser(lx, tok)
185 try:
186 part = p.Parse()
187 except _NotARange as e:
188 return None
189 return part
190
191
192class _StackFrame(object):
193
194 def __init__(self, cur_parts):
195 # type: (List[word_part_t]) -> None
196 self.cur_parts = cur_parts
197 self.alt_part = word_part.BracedTuple([])
198 self.saw_comma = False
199
200
201def BraceDetect(w):
202 # type: (CompoundWord) -> Optional[word.BracedTree]
203 """Return a new word if the input word looks like a brace expansion.
204
205 e.g. {a,b} or {1..10..2} (TODO)
206 Do we want to accept {01..02} ? zsh does make some attempt to do this too.
207
208 NOTE: This is an iterative algorithm that uses a stack. The grammar-based
209 approach didn't seem natural.
210
211 It's not LL(1) because of 'part*'. And not LL(k) even? Maybe it be handled
212 with an LR parser? In any case the imperative algorithm with 'early return'
213 for a couple cases is fairly simple.
214
215 Grammar:
216 # an alternative is a literal, possibly empty, or another brace_expr
217
218 part = <any part except Literal>
219 alt = part* | brace_expr
220
221 # a brace_expr is group of at least 2 braced and comma-separated
222 # alternatives, with optional prefix and suffix.
223 brace_expr = part* '{' alt ',' alt (',' alt)* '}' part*
224 """
225 # Errors:
226 # }a{ - stack depth dips below 0
227 # {a,b}{ - Stack depth doesn't end at 0
228 # {a} - no comma, and also not an numeric range
229
230 cur_parts = [] # type: List[word_part_t]
231 stack = [] # type: List[_StackFrame]
232
233 found = False
234
235 for i, part in enumerate(w.parts):
236 append = True
237 UP_part = part
238 if part.tag() == word_part_e.Literal:
239 part = cast(Token, UP_part)
240 id_ = part.id
241 if id_ == Id.Lit_LBrace:
242 # Save prefix parts. Start new parts list.
243 new_frame = _StackFrame(cur_parts)
244 stack.append(new_frame)
245 cur_parts = [] # clear
246 append = False
247 found = True # assume found, but can early exit with None later
248
249 elif id_ == Id.Lit_Comma: # Append a new alternative.
250 # NOTE: Should we allow this:
251 # ,{a,b}
252 # or force this:
253 # \,{a,b}
254 # ? We're forcing braces right now but not commas.
255 if len(stack):
256 stack[-1].saw_comma = True
257 stack[-1].alt_part.words.append(CompoundWord(cur_parts))
258 cur_parts = [] # clear
259 append = False
260
261 elif id_ == Id.Lit_RBrace:
262 if len(stack) == 0: # e.g. echo {a,b}{ -- unbalanced {
263 return None # do not expand ANYTHING because of invalid syntax
264
265 # Detect {1..10} and {1..10..2}
266
267 #log('stack[-1]: %s', stack[-1])
268 #log('cur_parts: %s', cur_parts)
269
270 range_part = None # type: Optional[word_part_t]
271 # only allow {1..3}, not {a,1..3}
272 if not stack[-1].saw_comma and len(cur_parts) == 1:
273 # It must be ONE part. For example, -1..-100..-2 is initially
274 # lexed as a single Lit_Chars token.
275 part2 = cur_parts[0]
276 if part2.tag() == word_part_e.Literal:
277 tok = cast(Token, part2)
278 if tok.id == Id.Lit_Chars:
279 range_part = _RangePartDetect(tok)
280 if range_part:
281 frame = stack.pop()
282 cur_parts = frame.cur_parts
283 cur_parts.append(range_part)
284 append = False
285
286 # It doesn't look like a range -- process it as the last element in
287 # {a,b,c}
288
289 if not range_part:
290 if not stack[
291 -1].saw_comma: # {foo} is not a real alternative
292 return None # early return
293
294 stack[-1].alt_part.words.append(CompoundWord(cur_parts))
295
296 frame = stack.pop()
297 cur_parts = frame.cur_parts
298 cur_parts.append(frame.alt_part)
299 append = False
300
301 if append:
302 cur_parts.append(part)
303
304 if len(stack) != 0:
305 return None
306
307 if found:
308 return word.BracedTree(cur_parts)
309 else:
310 return None
311
312
313def BraceDetectAll(words):
314 # type: (List[CompoundWord]) -> List[word_t]
315 """Return a new list of words, possibly with BracedTree instances."""
316 out = [] # type: List[word_t]
317 for w in words:
318 # The shortest possible brace expansion is {,}. This heuristic prevents
319 # a lot of garbage from being created, since otherwise nearly every word
320 # would be checked. We could be even more precise but this is cheap.
321 if len(w.parts) >= 3:
322 brace_tree = BraceDetect(w)
323 if brace_tree:
324 out.append(brace_tree)
325 continue
326 out.append(w)
327 return out
328
329
330def _LeadingZeros(s):
331 # type: (str) -> int
332 n = 0
333 for c in s:
334 if c == '0':
335 n += 1
336 else:
337 break
338 return n
339
340
341def _IntToString(i, width):
342 # type: (int, int) -> str
343 s = str(i)
344 n = len(s)
345 if n < width: # width might be 0
346 # pad with zeros
347 pad = '0' * (width - n)
348 return pad + s
349 else:
350 return s
351
352
353def _RangeStrings(part):
354 # type: (word_part.BracedRange) -> List[str]
355
356 if part.kind == Id.Range_Int:
357 nums = [] # type: List[str]
358
359 z1 = _LeadingZeros(part.start)
360 z2 = _LeadingZeros(part.end)
361
362 if z1 == 0 and z2 == 0:
363 width = 0
364 else:
365 if z1 < z2:
366 width = len(part.end)
367 else:
368 width = len(part.start)
369
370 n = int(part.start)
371 end = int(part.end)
372 step = part.step
373 if step > 0:
374 while True:
375 nums.append(_IntToString(n, width))
376 n += step
377 if n > end:
378 break
379 else:
380 while True:
381 nums.append(_IntToString(n, width))
382 n += step
383 if n < end:
384 break
385
386 return nums
387
388 else: # Id.Range_Char
389 chars = [] # type: List[str]
390
391 n = ord(part.start)
392 ord_end = ord(part.end)
393 step = part.step
394 if step > 0:
395 while True:
396 chars.append(chr(n))
397 n += step
398 if n > ord_end:
399 break
400 else:
401 while True:
402 chars.append(chr(n))
403 n += step
404 if n < ord_end:
405 break
406
407 return chars
408
409
410def _ExpandPart(
411 parts, # type: List[word_part_t]
412 first_alt_index, # type: int
413 suffixes, # type: List[List[word_part_t]]
414):
415 # type: (...) -> List[List[word_part_t]]
416 """Mutually recursive with _BraceExpand.
417
418 Args:
419 parts: input parts
420 first_alt_index: index of the first BracedTuple
421 suffixes: List of suffixes to append.
422 """
423 out = [] # type: List[List[word_part_t]]
424
425 prefix = parts[:first_alt_index]
426 expand_part = parts[first_alt_index]
427
428 UP_part = expand_part
429 with tagswitch(expand_part) as case:
430 if case(word_part_e.BracedTuple):
431 expand_part = cast(word_part.BracedTuple, UP_part)
432 # Call _BraceExpand on each of the inner words too!
433 expanded_alts = [] # type: List[List[word_part_t]]
434 for w in expand_part.words:
435 expanded_alts.extend(_BraceExpand(w.parts))
436
437 for alt_parts in expanded_alts:
438 for suffix in suffixes:
439 out_parts = [] # type: List[word_part_t]
440 out_parts.extend(prefix)
441 out_parts.extend(alt_parts)
442 out_parts.extend(suffix)
443 out.append(out_parts)
444
445 elif case(word_part_e.BracedRange):
446 expand_part = cast(word_part.BracedRange, UP_part)
447 # Not mutually recursive with _BraceExpand
448 strs = _RangeStrings(expand_part)
449
450 # Often prefix and suffixes are empty, but there's not that much to
451 # optimize
452 # log('prefix %s, suffixes %s, strs %s', prefix, suffixes, strs)
453
454 for s in strs:
455 for suffix in suffixes:
456 out_parts_ = [] # type: List[word_part_t]
457 out_parts_.extend(prefix)
458
459 # TODO: Does it help to preserve location info?
460 # t = Token(Id.Lit_Chars, expand_part.locs[0], s)
461 t = lexer.DummyToken(Id.Lit_Chars, s)
462
463 out_parts_.append(t)
464 out_parts_.extend(suffix)
465 out.append(out_parts_)
466
467 else:
468 raise AssertionError()
469
470 return out
471
472
473def _BraceExpand(parts):
474 # type: (List[word_part_t]) -> List[List[word_part_t]]
475 """Mutually recursive with _ExpandPart."""
476
477 # manual GC point, because brace expansion is a separate stage that does a
478 # bunch of computation outside the interpreter
479 mylib.MaybeCollect()
480
481 num_alts = 0
482 first_alt_index = -1
483 for i, part in enumerate(parts):
484 tag = part.tag()
485 if tag in (word_part_e.BracedTuple, word_part_e.BracedRange):
486 num_alts += 1
487 if num_alts == 1:
488 first_alt_index = i
489 elif num_alts == 2:
490 break # don't need to count anymore
491
492 # NOTE: There are TWO recursive calls here, not just one -- one for
493 # nested {}, and one for adjacent {}. This is hard to do iteratively.
494 if num_alts == 0:
495 return [parts]
496
497 elif num_alts == 1:
498 suffix = parts[first_alt_index + 1:]
499 return _ExpandPart(parts, first_alt_index, [suffix])
500
501 else:
502 # Now call it on the tail
503 tail_parts = parts[first_alt_index + 1:]
504 suffixes = _BraceExpand(tail_parts) # recursive call
505 return _ExpandPart(parts, first_alt_index, suffixes)
506
507
508def BraceExpandWords(words):
509 # type: (List[word_t]) -> List[CompoundWord]
510 out = [] # type: List[CompoundWord]
511 for w in words:
512 UP_w = w
513 with tagswitch(w) as case:
514 if case(word_e.BracedTree):
515 w = cast(word.BracedTree, UP_w)
516 # Note: for the case of {1..100000}, this is a flat list of Token.
517 # Would be nice to optimize, but we don't really know the structure
518 # ahead of time
519 parts_list = _BraceExpand(w.parts)
520 for parts in parts_list:
521 expanded = CompoundWord(parts)
522
523 # Now do tilde detection on brace-expanded word
524 ti = word_.TildeDetect2(expanded)
525 if ti:
526 out.append(ti)
527 else:
528 out.append(expanded)
529
530 elif case(word_e.Compound):
531 w = cast(CompoundWord, UP_w)
532
533 # Already did tilde detection before expansion
534 out.append(w)
535
536 else:
537 raise AssertionError(w.tag())
538
539 return out