OILS / opy / _regtest / src / osh / word_parse.py

1239 lines, 679 significant
#!/usr/bin/env python
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.
"""

from asdl import const

from osh.meta import Id, Kind, LookupKind
from core import braces
from core import word
from core import tdop
from core import util

from osh import arith_parse
from osh.meta import ast, types

word_part_e = ast.word_part_e
word_e = ast.word_e
lex_mode_e = types.lex_mode_e

p_die = util.p_die
log = util.log

# Substitutions can be nested, but which inner subs are allowed depends on the
# outer sub.  See _ReadLeftParts vs. _ReadDoubleQuotedLeftParts.

# lex_mode_e.OUTER
#   All subs and quotes are allowed --
#   $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()
#
# lex_mode_e.DQ
#   Var, Command, Arith, but no quotes
#   $v ${v}   $() ``   $(())
#   No process substitution.
#
# lex_mode_e.ARITH
#   Similar to DQ: Var, Command, Arith sub.  No process sub.  bash has no
#   quotes, but we are changing this in oil.  We are adding ALL FOUR kinds of
#   quotes, because we need those for associative array indexing.
46#
47# lex_mode_e.VS_ARG_UNQ
48# Like UNQUOTED, except we stop at }. Everything is allowed, even process
49# substitution.
50#
51# ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
52# ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
53#
54# But space is SIGNIFICANT. ${a:- b }
55# So you should NOT just read a bunch of words after :-, unless you also
56# preserve the space tokens between.
57# In other words, like DS_VS_ARG, except SINGLE Quotes allowed?
58#
59# lex_mode_e.VS_ARG_DQ
60# Can't be lex_mode_e.DQ because here we respect $' and $" tokens, while <(
61# token is not respected.
62#
63# Like VS_ARG_UNQ, but single quotes are NOT respected (they appear
64# literally), and process substitution is not respected (ditto).
65#
66# "" and $'' and $"" are respected, but not ''. I need a matrix for this.
67#
68# Like DQ, except nested "" and $'' and $"" are RESPECTED.
69#
#   It's weird that double quotes are allowed.  Not sure why that would be.
#   Unquoted is also allowed, so " a "b" c ", $'' and $"" are lame, because
#   they don't appear in the DQ context.  I think I should parse those but
#   DISALLOW.  You should always make $'' and $"" a separate var!

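# A rough sketch of that matrix (derived from the notes above; the dispatch
# in _ReadLeftParts and _ReadDoubleQuotedLeftParts below is authoritative):
#
#                  $v ${v}   $() ``   $(())   ''    ""    $'' $""   <() >()
#   OUTER          yes       yes      yes     yes   yes   yes       yes
#   DQ             yes       yes      yes     no    n/a   no        no
#   VS_ARG_UNQ     yes       yes      yes     yes   yes   yes       yes
#   VS_ARG_DQ      yes       yes      yes     no    yes   yes       no
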
class WordParser(object):

  def __init__(self, lexer, line_reader, lex_mode=lex_mode_e.OUTER):
    self.lexer = lexer
    self.line_reader = line_reader
    self.Reset(lex_mode=lex_mode)

  def _Peek(self):
    """Helper method."""
    if self.next_lex_mode is not None:
      self.prev_token = self.cur_token  # for completion
      self.cur_token = self.lexer.Read(self.next_lex_mode)
      self.token_kind = LookupKind(self.cur_token.id)
      self.token_type = self.cur_token.id

      self.next_lex_mode = None
    return self.cur_token

  def _Next(self, lex_mode):
    """Set the next lex state, but don't actually read a token.

    We need this for proper interactive parsing.
    """
    self.next_lex_mode = lex_mode

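  # Illustrative note (a sketch, not in the original file): _Next() only
  # schedules a lex mode; _Peek() performs the deferred read.  A typical
  # sequence inside this class is:
  #
  #   self._Next(lex_mode_e.ARITH)   # next token will be lexed in ARITH mode
  #   self._Peek()                   # reads it; sets cur_token, token_kind...
  #
  # Deferring the read like this is what makes interactive parsing work: no
  # token is pulled from the line reader before it is needed.
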
  def Reset(self, lex_mode=lex_mode_e.OUTER):
    """Called by interactive loop."""
    # For _Peek()
    self.prev_token = None  # for completion
    self.cur_token = None
    self.token_kind = Kind.Undefined
    self.token_type = Id.Undefined_Tok

    self.next_lex_mode = lex_mode

    # For newline.  TODO: I think we can do this iteratively, without member
    # state.
    self.cursor = None
    self.cursor_was_newline = False

    self.error_stack = []

  def AddErrorContext(self, msg, *args, **kwargs):
    err = util.ParseError(msg, *args, **kwargs)
    self.error_stack.append(err)

  def Error(self):
    return self.error_stack

  def _BadToken(self, msg, token):
    """
    Args:
      msg: format string with a single %s token
      token: Token
    """
    self.AddErrorContext(msg, token, token=token)

  def PrevToken(self):
    """Inspect state.  Used by completion.

    cur_token is usually Id.Op_Newline \n, so we need the previous one.
    """
    return self.prev_token

  def _ReadVarOpArg(self, arg_lex_mode, eof_type=Id.Undefined_Tok,
                    empty_ok=True):
    # NOTE: Operators like | and < are not treated as special, so ${a:- | >}
    # is valid, even when unquoted.
    self._Next(arg_lex_mode)
    self._Peek()

    w = self._ReadCompoundWord(
        lex_mode=arg_lex_mode, eof_type=eof_type, empty_ok=empty_ok)
    # This is for "${s:-}", ${s/a//}, etc.  It is analogous to
    # LooksLikeAssignment where we turn x= into x=''.  It has the same
    # potential problem of not having spids.
    #
    # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
    # return a CompoundWord with no parts, which is explicitly checked with a
    # custom error message.
    if not w.parts and arg_lex_mode == lex_mode_e.VS_ARG_DQ and empty_ok:
      w.parts.append(ast.EmptyPart())
    return w

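  # Illustrative example (a sketch, not in the original file): in the double
  # quoted arg mode, an empty operator argument still yields one part:
  #
  #   "${s:-}"  ->  the arg parses as CompoundWord([EmptyPart()])
  #
  # With empty_ok=False (the PatSub pattern), an empty arg instead parses as
  # a CompoundWord with no parts, which the caller checks and rejects.
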
  def _ReadSliceArg(self):
    """Read an arithmetic expression for either part of ${a : i+1 : i+2}."""
    anode = self._ReadArithExpr(do_next=False)
    return anode

  def _ReadSliceVarOp(self):
    """ VarOf ':' ArithExpr (':' ArithExpr )? """
    self._Next(lex_mode_e.ARITH)
    self._Peek()
    if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
      begin = None  # no beginning specified
    else:
      begin = self._ReadSliceArg()
      if not begin: return None
      #print('BEGIN', begin)
      #print('BVS2', self.cur_token)

    if self.token_type == Id.Arith_RBrace:
      return ast.Slice(begin, None)  # No length specified

    # Id.Arith_Colon is a pun for Id.VOp2_Colon
    elif self.token_type == Id.Arith_Colon:
      self._Next(lex_mode_e.ARITH)
      length = self._ReadSliceArg()
      if not length: return None

      #print('after colon', self.cur_token)
      return ast.Slice(begin, length)

    else:
      self.AddErrorContext("Unexpected token in slice: %s", self.cur_token)
      return None

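  # Illustrative examples (a sketch, not in the original file):
  #
  #   ${a:1:2}  ->  ast.Slice(begin=1, length=2)
  #   ${a:1}    ->  ast.Slice(begin=1, length=None)   # stopped at }
  #   ${a::2}   ->  ast.Slice(begin=None, length=2)   # leading : = no begin
  #
  # Both begin and length are full arithmetic expressions, as in
  # ${a : i+1 : i+2}.
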
  def _ReadPatSubVarOp(self, lex_mode):
    """
    Match  = ('/' | '#' | '%') WORD
    VarSub = ...
           | VarOf '/' Match '/' WORD
    """
    do_all = False
    do_prefix = False
    do_suffix = False

    pat = self._ReadVarOpArg(lex_mode, eof_type=Id.Lit_Slash, empty_ok=False)
    if not pat: return None

    if len(pat.parts) == 1:
      ok, s, quoted = word.StaticEval(pat)
      if ok and s == '/' and not quoted:  # Looks like ${a////c}, read again
        self._Next(lex_mode)
        self._Peek()
        p = ast.LiteralPart(self.cur_token)
        pat.parts.append(p)

    if len(pat.parts) == 0:
      self._BadToken("Pattern must not be empty: %r", token=self.cur_token)
      return None
    else:
      first_part = pat.parts[0]
      if first_part.tag == word_part_e.LiteralPart:
        lit_id = first_part.token.id
        if lit_id == Id.Lit_Slash:
          do_all = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Pound:
          do_prefix = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Percent:
          do_suffix = True
          pat.parts.pop(0)

    #self._Peek()
    if self.token_type == Id.Right_VarSub:
      # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
      return ast.PatSub(pat, None, do_all, do_prefix, do_suffix)

    elif self.token_type == Id.Lit_Slash:
      replace = self._ReadVarOpArg(lex_mode)  # do not stop at /
      if not replace: return None

      self._Peek()
      if self.token_type == Id.Right_VarSub:
        return ast.PatSub(pat, replace, do_all, do_prefix, do_suffix)

      else:
        self._BadToken("Expected } after pat sub, got %s", self.cur_token)
        return None

    else:
      self._BadToken("Expected } after pat sub, got %s", self.cur_token)
      return None

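  # Illustrative examples (a sketch, not in the original file):
  #
  #   ${v/a/b}   ->  PatSub(pat a, replace b)      replace first match
  #   ${v//a/b}  ->  do_all=True     leading / stripped from the pattern
  #   ${v/#a/b}  ->  do_prefix=True  leading # stripped from the pattern
  #   ${v/%a/b}  ->  do_suffix=True  leading % stripped from the pattern
  #   ${v/a}     ->  PatSub(pat a, replace None)   empty replacement
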
  def _ReadSubscript(self):
    """ Subscript = '[' ('@' | '*' | ArithExpr) ']'
    """
    # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
    # expression.
    t2 = self.lexer.LookAhead(lex_mode_e.ARITH)
    if t2.id in (Id.Lit_At, Id.Arith_Star):
      op = ast.WholeArray(t2.id)

      self._Next(lex_mode_e.ARITH)  # skip past [
      self._Peek()
      self._Next(lex_mode_e.ARITH)  # skip past @
      self._Peek()
    else:
      anode = self._ReadArithExpr()
      if not anode:
        return None
      op = ast.ArrayIndex(anode)

    #self._Peek()  # Can't do this here.  Should the test go elsewhere?
    if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
      self._BadToken('Expected ] after subscript, got %s', self.cur_token)
      return None

    self._Next(lex_mode_e.VS_2)  # skip past ]
    self._Peek()  # Needed to be in the same spot as no subscript

    return op

  def _ParseVarOf(self):
    """
    VarOf = NAME Subscript?
          | NUMBER      # no subscript allowed, none of these are arrays
                        # ${@[1]} doesn't work, even though slicing does
          | VarSymbol
    """
    self._Peek()
    name_token = self.cur_token
    self._Next(lex_mode_e.VS_2)

    self._Peek()  # Check for []
    if self.token_type == Id.VOp2_LBracket:
      bracket_op = self._ReadSubscript()
      if not bracket_op: return None
    else:
      bracket_op = None

    part = ast.BracedVarSub(name_token)
    part.bracket_op = bracket_op
    return part

  def _ParseVarExpr(self, arg_lex_mode):
    """
    Start parsing at the op -- we already skipped past the name.
    """
    part = self._ParseVarOf()
    if not part: return None

    self._Peek()
    if self.token_type == Id.Right_VarSub:
      return part  # no ops

    # Or maybe this is a VarOpKind

    op_kind = self.token_kind

    if op_kind == Kind.VTest:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after test arg: %s', self.cur_token)
        return None

      part.suffix_op = ast.StringUnary(op_id, arg_word)

    elif op_kind == Kind.VOp1:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after unary op: %s', self.cur_token)
        return None

      op = ast.StringUnary(op_id, arg_word)
      part.suffix_op = op

    elif op_kind == Kind.VOp2:
      if self.token_type == Id.VOp2_Slash:
        op = self._ReadPatSubVarOp(arg_lex_mode)
        if not op: return None
        # Checked by the method above
        assert self.token_type == Id.Right_VarSub, self.cur_token

      elif self.token_type == Id.VOp2_Colon:
        op = self._ReadSliceVarOp()
        if not op: return None
        if self.token_type != Id.Arith_RBrace:
          self._BadToken('Unexpected token after slice: %s', self.cur_token)
          return None

      else:
        p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

      part.suffix_op = op

    # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
    # mode.  It's redundantly checked above.
    if self.token_type not in (Id.Right_VarSub, Id.Arith_RBrace):
      self._BadToken('Unexpected token after var sub: %s', self.cur_token)
      return None

    # Now look for ops
    return part

  def _ReadBracedBracedVarSub(self, d_quoted=False):
    """For the ${} expression language.

    NAME        = [a-zA-Z_][a-zA-Z0-9_]*
    NUMBER      = [0-9]+                    # ${10}, ${11}, ...

    Subscript   = '[' ('@' | '*' | ArithExpr) ']'
    VarSymbol   = '!' | '@' | '#' | ...
    VarOf       = NAME Subscript?
                | NUMBER      # no subscript allowed; none of these are arrays
                              # ${@[1]} doesn't work, even though slicing does
                | VarSymbol

    TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
    STRIP_OP    = '#' | '##' | '%' | '%%'
    CASE_OP     = ',' | ',,' | '^' | '^^'

    UnaryOp     = TEST_OP | STRIP_OP | CASE_OP | ...
    Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
    VarExpr     = VarOf
                | VarOf UnaryOp WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

    LengthExpr  = '#' VarOf    # can't apply operators after length

    RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                               # ${!ref[0]} vs ${!keys[@]} resolved later

    PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

    VarSub      = LengthExpr
                | RefOrKeys
                | PrefixQuery
                | VarExpr

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
      that it's also vectorized.

    Strictness over bash:
    - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
      grammar
    - ! and # prefixes can't be composed, even though named refs can be
      composed with other operators
    - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
      strip a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, and the ! can.  However:

    ${a[@]:1:2} is not allowed
    ${#a[@]:1:2} is allowed, but gives the wrong answer
    """
    left_spid = self.cur_token.span_id

    if d_quoted:
      arg_lex_mode = lex_mode_e.VS_ARG_DQ
    else:
      arg_lex_mode = lex_mode_e.VS_ARG_UNQ

    self._Next(lex_mode_e.VS_1)
    self._Peek()

    ty = self.token_type

    if ty == Id.VSub_Pound:
      # Disambiguate
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t# LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name, '#' is the prefix
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarOf()

        self._Peek()
        if self.token_type != Id.Right_VarSub:
          self._BadToken("Expected } after length expression, got %r",
                         self.cur_token)
          return None

        part.prefix_op = Id.VSub_Pound  # length

      else:  # not a prefix, '#' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    elif ty == Id.VSub_Bang:
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t! LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name, '!' is the prefix
        # ${!a} -- this is a ref
        # ${!3} -- this is a ref
        # ${!a[1]} -- this is a ref
        # ${!a[@]} -- this is a keys query
        # No lookahead -- do it in a second step, or at runtime
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

        part.prefix_op = Id.VSub_Bang

      else:  # not a prefix, '!' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    # VS_NAME, VS_NUMBER, symbol that isn't # or !
    elif self.token_kind == Kind.VSub:
      part = self._ParseVarExpr(arg_lex_mode)
      if not part: return None

    else:
      # e.g. ${^}
      p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

    part.spids.append(left_spid)

    # Does this work?
    right_spid = self.cur_token.span_id
    part.spids.append(right_spid)

    return part

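  # Illustrative examples (a sketch, not in the original file):
  #
  #   ${#s}    ->  prefix_op = Id.VSub_Pound  (length of s)
  #   ${#}     ->  no prefix; '#' itself is the variable
  #   ${!ref}  ->  prefix_op = Id.VSub_Bang   (${!a[@]} vs. ${!a[1]} is
  #                resolved in a later step, or at runtime)
  #   ${s:-x}  ->  suffix_op = StringUnary for the ':-' test op
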
  def _ReadSingleQuotedPart(self, lex_mode):
    left = self.cur_token
    tokens = []

    done = False
    while not done:
      self._Next(lex_mode)
      self._Peek()

      # Kind.Char emitted in DOLLAR_SQ state
      if self.token_kind in (Kind.Lit, Kind.Char):
        tokens.append(self.cur_token)

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext('Unexpected EOF in single-quoted string')
        return False

      elif self.token_kind == Kind.Right:
        done = True  # assume Id.Right_SingleQuote

      else:
        raise AssertionError(
            'Unhandled token in single-quoted part %s (%d)' %
            (self.cur_token, self.token_kind))

    return ast.SingleQuotedPart(left, tokens)

  def _ReadDoubleQuotedLeftParts(self):
    """Read substitution parts in a double quoted context."""
    if self.token_type in (Id.Left_CommandSub, Id.Left_Backtick):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=True)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError(self.cur_token)

  def _ReadLeftParts(self):
    """Read substitutions and quoted strings."""

    if self.token_type == Id.Left_DoubleQuote:
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_DollarDoubleQuote:
      # NOTE: $"" is treated as "" for now.  Does it make sense to add the
      # token to the part?
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_SingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.SQ)

    if self.token_type == Id.Left_DollarSingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.DOLLAR_SQ)

    if self.token_type in (
        Id.Left_CommandSub, Id.Left_Backtick, Id.Left_ProcSubIn,
        Id.Left_ProcSubOut):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=False)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError('%s not handled' % self.cur_token)

  def _ReadExtGlobPart(self):
    """
    Grammar:
      Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
      LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
      RIGHT   = ')'
      ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty

    CompoundWord includes ExtGlobPart.
    """
    left_token = self.cur_token
    arms = []
    part = ast.ExtGlobPart(left_token, arms)  # return value
    part.spids.append(left_token.span_id)

    self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
    self._Next(lex_mode_e.EXTGLOB)  # advance past LEFT

    read_word = False  # did we just read a word?  To handle @(||).

    while True:
      self._Peek()
      #log('t %r', self.cur_token)

      if self.token_type == Id.Right_ExtGlob:
        if not read_word:
          arms.append(ast.CompoundWord())
        part.spids.append(self.cur_token.span_id)
        break

      elif self.token_type == Id.Op_Pipe:
        if not read_word:
          arms.append(ast.CompoundWord())
        read_word = False
        self._Next(lex_mode_e.EXTGLOB)

      # lex mode EXTGLOB should only produce these 4 kinds of tokens
      elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
        w = self._ReadCompoundWord(lex_mode=lex_mode_e.EXTGLOB)
        arms.append(w)
        read_word = True

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext(
            'Unexpected EOF reading extended glob that began here',
            token=left_token)
        return None

      else:
        raise AssertionError('Unexpected token %r' % self.cur_token)

    return part

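  # Illustrative examples (a sketch, not in the original file):
  #
  #   @(foo|bar)  ->  ExtGlobPart with arms [foo, bar]
  #   @(foo|)     ->  arms [foo, <empty CompoundWord>]; empty arms are legal
  #   @(||)       ->  three empty arms, tracked via the read_word flag above
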
  def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
    """
    Args:
      eof_type: for stopping at }, Id.Lit_RBrace
      here_doc: Whether we are reading in a here doc context

    Also handles ${foo%%a b c} -- treat the operator argument as double
    quoted until you hit the closing }.
    """
    quoted_part = ast.DoubleQuotedPart()
    left_spid = const.NO_INTEGER
    right_spid = const.NO_INTEGER  # gets set later

    if self.cur_token is not None:  # None in here doc case
      left_spid = self.cur_token.span_id

    done = False
    while not done:
      self._Next(lex_mode_e.DQ)
      self._Peek()
      #print(self.cur_token)

      if self.token_type == eof_type:  # e.g. stop at }
        done = True
        continue

      elif self.token_kind == Kind.Lit:
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Left:
        part = self._ReadDoubleQuotedLeftParts()
        if not part:
          return None
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Right:
        assert self.token_type == Id.Right_DoubleQuote
        if here_doc:
          # Turn Id.Right_DoubleQuote into a literal part
          quoted_part.parts.append(ast.LiteralPart(self.cur_token))
        else:
          done = True  # assume Id.Right_DoubleQuote
          right_spid = self.cur_token.span_id

      elif self.token_kind == Kind.Eof:
        if here_doc:  # here docs will have an EOF in their token stream
          done = True
        else:
          self.AddErrorContext(
              'Unexpected EOF reading double-quoted string that began here',
              span_id=left_spid)
          return False

      else:
        raise AssertionError(self.cur_token)

    quoted_part.spids.extend((left_spid, right_spid))
    return quoted_part

  def _ReadCommandSubPart(self, token_type):
    """
    NOTE: This is not in the grammar, because word parts aren't in the
    grammar!

    command_sub = '$(' command_list ')'
    """
    left_token = self.cur_token
    left_spid = left_token.span_id

    #print('_ReadCommandSubPart', self.cur_token)
    self._Next(lex_mode_e.OUTER)  # advance past $( or `

    # Set the lexer in a state so ) becomes the EOF token.
    #print('_ReadCommandSubPart lexer.PushHint ) -> EOF')
    if token_type in (
        Id.Left_CommandSub, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
      self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    elif token_type == Id.Left_Backtick:
      self.lexer.PushHint(Id.Left_Backtick, Id.Eof_Backtick)
    else:
      raise AssertionError(self.token_type)

    from osh import parse_lib
    c_parser = parse_lib.MakeParserForCommandSub(self.line_reader, self.lexer)

    node = c_parser.ParseWholeFile()  # `` and $() allowed
    if not node:
      # Example of parse error:
      #   echo $(cat |)  OR
      #   echo `cat |`
      error_stack = c_parser.Error()
      self.error_stack.extend(error_stack)
      print(self.error_stack)
      self.AddErrorContext('Error parsing command list in command sub')
      return None

    # Hm this creates its own word parser, which is thrown away?
    #print('X', self.cur_token)
    right_spid = c_parser.w_parser.cur_token.span_id

    cs_part = ast.CommandSubPart(node, left_token)
    cs_part.spids.append(left_spid)
    cs_part.spids.append(right_spid)
    return cs_part

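  # Illustrative note (a sketch, not in the original file): PushHint is how
  # the parser finds the end of $( ... ).  For input like
  #
  #   echo $(ls)
  #
  # the hint retranslates the unbalanced ) from Id.Op_RParen to
  # Id.Eof_RParen, so the recursively invoked command parser sees EOF exactly
  # at the closing paren and parses `ls` as a complete command list.
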
  def _ReadArithExpr(self, do_next=True):
    """Read and parse an arithmetic expression in various contexts.

    $(( 1+2 ))
    (( a=1+2 ))
    ${a[ 1+2 ]}
    ${a : 1+2 : 1+2}

    See tests/arith-context.test.sh for ambiguous cases.

    ${a[a[0]]} is valid   # VS_RBRACKET vs Id.Arith_RBracket

    ${s : a<b?0:1 : 1}    # VS_COLON vs Id.Arith_Colon

    TODO: Instead of having an eof_type, I think we should just run the arith
    parser until it's done.  That will take care of both : and ].  Then we
    switch the state back.

    See the assertion in ArithParser.Parse() -- unexpected extra input.
    """
    if do_next:
      self._Next(lex_mode_e.ARITH)
    # calls self.ReadWord(lex_mode_e.ARITH)
    a_parser = tdop.TdopParser(arith_parse.SPEC, self)
    anode = a_parser.Parse()
    if not anode:
      error_stack = a_parser.Error()
      self.error_stack.extend(error_stack)
    return anode  # could be None

  def _ReadArithSubPart(self):
    """
    Read an arith substitution, which contains an arith expression, e.g.
    $((a + 1)).
    """
    left_span_id = self.cur_token.span_id

    # The second ) needs to be disambiguated in stuff like:
    #   $(echo $(( 1+2 )) )
    self.lexer.PushHint(Id.Op_RParen, Id.Right_ArithSub)

    # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
    # could save the lexer/reader state here, and retry if the arithmetic
    # parse fails.  But we can almost always catch this at parse time.  There
    # could be some exceptions like:
    #   $((echo * foo))  # looks like multiplication
    #   $((echo / foo))  # looks like division

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None

    self._Next(lex_mode_e.OUTER)  # TODO: This could be DQ or ARITH too

    # PROBLEM: $(echo $(( 1 + 2 )) )
    # Two right parens break the Id.Eof_RParen scheme
    self._Peek()
    if self.token_type != Id.Right_ArithSub:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def _ReadArithSub2Part(self):
    """Non-standard arith sub $[a + 1]."""
    left_span_id = self.cur_token.span_id

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RBracket:
      self.AddErrorContext("Expected ], got %s", self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def ReadDParen(self):
    """Read ((1+ 2)) -- command context.

    We're using the word parser because it's very similar to _ReadArithExpr
    above.
    """
    # The second ) of )) needs to be disambiguated from a plain Op_RParen.
    # TODO: Be consistent with ReadForExpression below and use
    # lex_mode_e.ARITH?  Then you can get rid of this.
    self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing dparen statement")
      return None

    #print('xx ((', self.cur_token)
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    # PROBLEM: $(echo $(( 1 + 2 )) )
    self._Peek()
    if self.token_type != Id.Op_DRightParen:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return anode

  def ReadForExpression(self):
    """Read ((i=0; i<5; ++i)) -- part of command context."""
    # No PushHint because we're in arith state.
    #self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    self._Next(lex_mode_e.ARITH)  # skip over ((

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty init')
      init_node = None
    else:
      init_node = self._ReadArithExpr(do_next=False)
      if not init_node:
        self.AddErrorContext("Error parsing for init")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('INIT', init_node)

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty condition')
      cond_node = None
    else:
      cond_node = self._ReadArithExpr(do_next=False)
      if not cond_node:
        self.AddErrorContext("Error parsing for cond")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('COND', cond_node)

    self._Peek()
    if self.token_type == Id.Arith_RParen:
      #print('Got empty update')
      update_node = None
    else:
      update_node = self._ReadArithExpr(do_next=False)
      if not update_node:
        self.AddErrorContext("Error parsing for update")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('UPDATE', update_node)

    #print('TT', self.cur_token)
    # Second paren
    self._Peek()
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected right paren to end for loop expression, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return ast.ForExpr(init_node, cond_node, update_node)

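  # Illustrative examples (a sketch, not in the original file):
  #
  #   for ((i=0; i<5; ++i)); do ...  ->  ForExpr(init, cond, update)
  #   for ((;;)); do ...             ->  ForExpr(None, None, None)
  #
  # Each of the three sections may independently be empty, as handled above.
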
  def _ReadArrayLiteralPart(self):
    self._Next(lex_mode_e.OUTER)  # advance past (
    self._Peek()
    if self.cur_token.id != Id.Op_LParen:
      self.AddErrorContext('Expected ( after =', token=self.cur_token)
      return None

    # MUST use a new word parser (with same lexer).
    w_parser = WordParser(self.lexer, self.line_reader)
    words = []
    while True:
      w = w_parser.ReadWord(lex_mode_e.OUTER)
      if not w:
        self.error_stack.extend(w_parser.Error())
        return None

      if w.tag == word_e.TokenWord:
        word_id = word.CommandId(w)
        if word_id == Id.Right_ArrayLiteral:
          break
        # Unlike command parsing, array parsing allows embedded \n.
        elif word_id == Id.Op_Newline:
          continue
        else:
          self.AddErrorContext(
              'Unexpected word in array literal: %s', w, word=w)
          return None

      words.append(w)

    words2 = braces.BraceDetectAll(words)
    words3 = word.TildeDetectAll(words2)

    return ast.ArrayLiteralPart(words3)

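  # Illustrative example (a sketch, not in the original file):
  #
  #   a=(1 2 3)   ->  ArrayLiteralPart with words [1, 2, 3]
  #   a=(1
  #      2 3)     ->  the same; newlines between elements are allowed
  #
  # Brace detection and tilde detection run over the collected words
  # afterward (BraceDetectAll, TildeDetectAll).
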
  def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
                        lex_mode=lex_mode_e.OUTER, empty_ok=True):
    """
    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
    could be an operator delimiting a compound word.  Can we change lexer
    modes and remove this special case?
    """
    #print('_ReadCompoundWord', lex_mode)
    word = ast.CompoundWord()

    num_parts = 0
    done = False
    while not done:
      allow_done = empty_ok or num_parts != 0
      self._Peek()
      #print('CW', self.cur_token)
      if allow_done and self.token_type == eof_type:
        done = True  # e.g. for ${foo//pat/replace}

      # Keywords like "for" are treated like literals
      elif self.token_kind in (
          Kind.Lit, Kind.KW, Kind.Assign, Kind.ControlFlow, Kind.BoolUnary,
          Kind.BoolBinary):
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
          #part.xspans.append(self.cur_token.span_id)

        word.parts.append(part)

        if self.token_type == Id.Lit_VarLike:
          #print('@', self.cursor)
          #print('@', self.cur_token)

          t = self.lexer.LookAhead(lex_mode_e.OUTER)
          if t.id == Id.Op_LParen:
            self.lexer.PushHint(Id.Op_RParen, Id.Right_ArrayLiteral)
            part2 = self._ReadArrayLiteralPart()
            if not part2:
              self.AddErrorContext('_ReadArrayLiteralPart failed')
              return False
            word.parts.append(part2)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        word.parts.append(part)

      elif self.token_kind == Kind.ExtGlob:
        part = self._ReadExtGlobPart()
        if not part:
          return None
        word.parts.append(part)

      elif self.token_kind == Kind.Left:
        #print('_ReadLeftParts')
        part = self._ReadLeftParts()
        if not part:
          return None
        word.parts.append(part)

      # NOT done yet, will advance below
      elif self.token_kind == Kind.Right:
        # Still part of the word; will be done on the next iter.
        if self.token_type == Id.Right_DoubleQuote:
          pass
        elif self.token_type == Id.Right_CommandSub:
          pass
        elif self.token_type == Id.Right_Subshell:
          # LEXER HACK for (case x in x) ;; esac )
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
            self._Next(lex_mode)
          done = True
        else:
          done = True

      elif self.token_kind == Kind.Ignored:
        done = True

      else:
        # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
        # so to test for ESAC, we can read ) before getting a chance to
        # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
        # token and do it again.

        # We get Id.Op_RParen at top level: case x in x) ;; esac
        # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
        if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            if self.token_type == Id.Eof_RParen:
              # Redo translation
              self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
            self._Next(lex_mode)

        done = True  # anything we don't recognize means we're done

      if not done:
        self._Next(lex_mode)
        num_parts += 1
    return word

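  # Illustrative example (a sketch, not in the original file): a compound
  # word is a sequence of adjacent parts.  For instance, foo"bar"$x parses
  # roughly as:
  #
  #   CompoundWord([LiteralPart(foo), DoubleQuotedPart(bar),
  #                 SimpleVarSub($x)])
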
  def _ReadArithWord(self):
    """Helper function for ReadArithWord."""
    #assert self.token_type != Id.Undefined_Tok
    self._Peek()
    #print('_ReadArithWord', self.cur_token)

    if self.token_kind == Kind.Unknown:
      self.AddErrorContext("Unknown token in arith context: %s",
                           self.cur_token, token=self.cur_token)
      return None, False

    elif self.token_kind == Kind.Eof:
      # Just return EOF token
      w = ast.TokenWord(self.cur_token)
      return w, False
      #self.AddErrorContext("Unexpected EOF in arith context: %s",
      #                     self.cur_token, token=self.cur_token)
      #return None, False

    elif self.token_kind == Kind.Ignored:
      # Space should be ignored.  TODO: change this to SPACE_SPACE and
      # SPACE_NEWLINE?  or SPACE_TOK.
      self._Next(lex_mode_e.ARITH)
      return None, True  # Tell wrapper to try again

    elif self.token_kind in (Kind.Arith, Kind.Right):
      # Id.Right_ArithSub IS just a normal token, handled by ArithParser
      self._Next(lex_mode_e.ARITH)
      w = ast.TokenWord(self.cur_token)
      return w, False

    elif self.token_kind in (Kind.Lit, Kind.Left):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ARITH)
      if not w:
        return None, True
      return w, False

    elif self.token_kind == Kind.VSub:
      part = ast.SimpleVarSub(self.cur_token)
      self._Next(lex_mode_e.ARITH)
      w = ast.CompoundWord([part])
      return w, False

    else:
      self._BadToken("Unexpected token parsing arith sub: %s", self.cur_token)
      return None, False

    raise AssertionError("Shouldn't get here")

  def _ReadWord(self, lex_mode):
    """Helper function for Read().

    Returns:
      2-tuple (word, need_more)
        word: Word, or None if there was an error, or need_more is set
        need_more: True if the caller should call us again
    """
    #print('_Read', lex_mode, self.cur_token)
    self._Peek()

    if self.token_kind == Kind.Eof:
      # No advance
      return ast.TokenWord(self.cur_token), False

    # Allow Arith for ) at end of for loop?
    elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
      self._Next(lex_mode)
      if self.token_type == Id.Op_Newline:
        if self.cursor_was_newline:
          #print('SKIP(nl)', self.cur_token)
          return None, True

      return ast.TokenWord(self.cur_token), False

    elif self.token_kind == Kind.Right:
      #print('WordParser.Read: Kind.Right', self.cur_token)
      if self.token_type not in (
          Id.Right_Subshell, Id.Right_FuncDef, Id.Right_CasePat,
          Id.Right_ArrayLiteral):
        raise AssertionError(self.cur_token)

      self._Next(lex_mode)
      return ast.TokenWord(self.cur_token), False

    elif self.token_kind in (Kind.Ignored, Kind.WS):
      self._Next(lex_mode)
      return None, True  # tell Read() to try again

    elif self.token_kind in (
        Kind.VSub, Kind.Lit, Kind.Left, Kind.KW, Kind.Assign,
        Kind.ControlFlow, Kind.BoolUnary, Kind.BoolBinary, Kind.ExtGlob):
      # We're beginning a word.  If we see Id.Lit_Pound, change to
      # lex_mode_e.COMMENT and read until end of line.  (TODO: How to add
      # comments to AST?)

      # TODO: Can we do the same thing for Tilde here?  Enter a state where
      # we look for / too.
      if self.token_type == Id.Lit_Pound:
        self._Next(lex_mode_e.COMMENT)
        self._Peek()

        # NOTE: The # could be the last character in the file.  It can't be
        # Eof_{RParen,Backtick} because #) and #` are comments.
        assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
            self.cur_token

        # The next iteration will go into Kind.Ignored and set lex state to
        # lex_mode_e.OUTER/etc.
        return None, True  # tell Read() to try again after comment

      else:
        w = self._ReadCompoundWord(lex_mode=lex_mode)
        if not w:
          self.AddErrorContext(
              'Error reading command word', token=self.cur_token)
          return None, False
        return w, False

    else:
      raise AssertionError(
          'Unhandled: %s (%s)' % (self.cur_token, self.token_kind))

    raise AssertionError("Shouldn't get here")

  def LookAhead(self):
    """Look ahead to the next token.

    For the command parser to recognize func () { } and array= (1 2 3).  And
    probably coprocesses.
    """
    assert self.token_type != Id.Undefined_Tok
    if self.cur_token.id == Id.WS_Space:
      t = self.lexer.LookAhead(lex_mode_e.OUTER)
    else:
      t = self.cur_token
    return t.id

  def ReadWord(self, lex_mode):
    """Read the next Word.

    Returns:
      Word, or None if there was an error
    """
    # Implementation note: This is a stateful/iterative function that calls
    # the stateless _ReadWord() function.
    while True:
      if lex_mode == lex_mode_e.ARITH:
        # TODO: Can this be unified?
        w, need_more = self._ReadArithWord()
      elif lex_mode in (
          lex_mode_e.OUTER, lex_mode_e.DBRACKET, lex_mode_e.BASH_REGEX):
        w, need_more = self._ReadWord(lex_mode)
      else:
        raise AssertionError('Invalid lex state %s' % lex_mode)
      if not need_more:
        break

    if not w:  # Assumes AddErrorContext was already called
      return None

    self.cursor = w

    # TODO: Do consolidation of newlines in the lexer?
    # Note that there can be an infinite (Id.Ignored_Comment Id.Op_Newline
    # Id.Ignored_Comment Id.Op_Newline) sequence, so we have to keep track of
    # the last non-ignored token.
    self.cursor_was_newline = (word.CommandId(self.cursor) == Id.Op_Newline)
    return self.cursor

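  # Illustrative usage (a sketch, not in the original file; assumes a lexer
  # and line_reader constructed elsewhere, e.g. by parse_lib):
  #
  #   w_parser = WordParser(lexer, line_reader)
  #   while True:
  #     w = w_parser.ReadWord(lex_mode_e.OUTER)
  #     if not w:
  #       break  # parse error; details are in w_parser.Error()
  #     if word.CommandId(w) == Id.Eof_Real:
  #       break
  #     # ... process the word ...
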
  def ReadHereDocBody(self):
    """
    Sort of like Read(), except we're in a double quoted context, but not
    using double quotes.

    Returns:
      CompoundWord.  NOTE: We could also just use a DoubleQuotedPart for both
      cases?
    """
    w = ast.CompoundWord()
    dq = self._ReadDoubleQuotedPart(here_doc=True)
    if not dq:
      self.AddErrorContext('Error parsing here doc body')
      return False
    w.parts.append(dq)
    return w