# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

  hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]

class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For ###
        # doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

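    # The two methods above form the lazy token-reading protocol used
    # throughout this class (a sketch of the calling convention, not new API):
    #
    #   self._SetNext(lex_mode_e.VSub_1)  # decide the lex mode; reads nothing
    #   self._GetToken()                  # now cur_token / token_type /
    #                                     # token_kind are valid for that mode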
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

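    # Examples of the bash slice quirk handled above (a sketch of the behavior
    # described in the comments; spec tests are authoritative):
    #
    #   $ a=abcdef
    #   $ echo ${a:1}    # bcdef -- no length means "the rest of the string"
    #   $ echo ${a:1:}   # (empty) -- a trailing : means length ZERO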
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

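    # A few inputs the Match rule above accepts (a sketch; '/' '#' '%' are the
    # replace_mode prefixes):
    #
    #   ${v/x/y}    replace the first match (no prefix)
    #   ${v//x/y}   replace_mode == Id.Lit_Slash: replace ALL matches
    #   ${v/#x/y}   replace_mode == Id.Lit_Pound: match at the beginning
    #   ${v/%x/y}   replace_mode == Id.Lit_Percent: match at the end
    #   ${v/x}      same as ${v/x/}, i.e. an empty replacement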
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

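    # Rough map from operator Kind to the suffix ops built above (a sketch):
    #
    #   Kind.VTest   ${x:-default}  -> suffix_op.Unary
    #   Kind.VOp0    ${x@Q}         -> nullary Token
    #   Kind.VOp1    ${x#prefix}    -> suffix_op.Unary, arg lexed as a glob
    #   Kind.VOp2    ${x/pat/r}     -> suffix_op.PatSub
    #                ${x:1:2}       -> suffix_op.Slice
    #   Kind.VOp3    ${!prefix@}    -> nullary Token (only with allow_query)
    #   Kind.VOpYsh  ${x|html}      -> suffix_op.Static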
    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                  # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '               # ${x|html} and ${x %.3f}.
                                              # SPACE is operator not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

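    # Examples of how the prefix dispatch above classifies input (a sketch):
    #
    #   ${#s}     VSub_Pound followed by a name: length prefix
    #   ${#}      '#' is the variable itself (the number of arguments)
    #   ${!ref}   VSub_Bang followed by a name: named ref / keys
    #   ${!}      '!' is the variable itself (PID of the last background job)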
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

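    # A worked example of the end-token counting above: for r'''a''b''' the
    # lexer yields ' tokens one at a time, and num_end_tokens resets to 0
    # after 'a' and 'b'.  Only three CONSECUTIVE quotes terminate the string,
    # and the three spurious end tokens are then popped.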
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

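    # Example of the triple-quote hack above: for a u''' string, we first read
    # an empty u'' part, see the third quote via ByteLookAhead(), then re-read
    # with Id.Left_UTSingleQuote so that ''' becomes the terminator.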
    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty

          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

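    # Inputs the ExtGlob grammar above accepts (a sketch):
    #
    #   @(foo|bar)   two arms
    #   @(foo|)      the second arm is an empty CompoundWord([])
    #   @(||)        three empty arms, since Item may be EPSILON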
    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add more
                # support for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also used for ${foo%%a b c} -- the argument is treated as double
        quoted until you hit the closing }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

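    # Example of the backtick re-parse above: in  echo `echo \`date\``  the
    # inner \` tokens are Id.Backtick_Quoted.  Their leading backslashes are
    # stripped, so code_str is 'echo `date`', which is then parsed again with
    # a fresh lexer (hence source.Reparsed for location info).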
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        #   case (x) {                    case (x) {
        #     (else) { = x }                (else) { = x }
        #            ^ The lexer is here          ^ Unread to here
        #   }                             }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word   { echo word }
          (3)    { echo expr }
          /e/    { echo eggex }
          }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

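    # Forms accepted above, showing the defaults (a sketch):
    #
    #   for ((i=0; i<5; ++i))   all three expressions given
    #   for ((; i<5; ))         init and update default to EmptyZero
    #   for ((;;))              empty condition defaults to EmptyOne, i.e. TRUE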
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

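    # Examples of the key/value rule above (the FIRST word decides):
    #
    #   a=(1 2 3)          -> ShArrayLiteral
    #   a=([k]=1 [j]=2)    -> BashAssocLiteral; every word must be a pair
    #   a=([k]=1 2)        -> p_die: "Expected associative array pair"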
1682 def ParseProcCallArgs(self, start_symbol):
1683 # type: (int) -> ArgList
1684 """ json write (x) """
1685 self.lexer.MaybeUnreadOne()
1686
1687 arg_list = ArgList.CreateNull(alloc_lists=True)
1688 arg_list.left = self.cur_token
1689 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1690 return arg_list
1691
1692 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1693 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1694 """Helper for _ReadCompoundWord3."""
1695 done = False
1696
1697 if self.token_type == Id.Lit_EscapedChar:
1698 tok = self.cur_token
1699 assert tok.length == 2
1700 ch = lexer.TokenSliceLeft(tok, 1)
1701 if not self.parse_opts.parse_backslash():
1702 if not pyutil.IsValidCharEscape(ch):
1703 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1704 self.cur_token)
1705
1706 part = word_part.EscapedLiteral(self.cur_token,
1707 ch) # type: word_part_t
1708 else:
1709 part = self.cur_token
1710
1711 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1712 parts.append(part)
1713 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1714 # _ReadWord.
1715 next_id = self.lexer.LookPastSpace(lex_mode)
1716 if next_id == Id.Op_LParen:
1717 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1718 part2 = self._ReadArrayLiteral()
1719 parts.append(part2)
1720
1721 # Array literal must be the last part of the word.
1722 self._SetNext(lex_mode)
1723 self._GetToken()
1724 # EOF, whitespace, newline, Right_Subshell
1725 if self.token_kind not in KINDS_THAT_END_WORDS:
1726 p_die('Unexpected token after array literal',
1727 self.cur_token)
1728 done = True
1729
1730 elif (is_first and self.parse_opts.parse_at() and
1731 self.token_type == Id.Lit_Splice):
1732
1733 splice_tok = self.cur_token
1734 part2 = word_part.Splice(splice_tok,
1735 lexer.TokenSliceLeft(splice_tok, 1))
1736
1737 parts.append(part2)
1738
1739 # @words must be the last part of the word
1740 self._SetNext(lex_mode)
1741 self._GetToken()
1742 # EOF, whitespace, newline, Right_Subshell
1743 if self.token_kind not in KINDS_THAT_END_WORDS:
1744 p_die('Unexpected token after array splice', self.cur_token)
1745 done = True
1746
1747 elif (is_first and self.parse_opts.parse_at() and
1748 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1749 part2 = self._ReadExprSub(lex_mode_e.DQ)
1750 parts.append(part2)
1751
1752 # @[split(x)]
1753 self._SetNext(lex_mode)
1754 self._GetToken()
1755 # EOF, whitespace, newline, Right_Subshell
1756 if self.token_kind not in KINDS_THAT_END_WORDS:
1757 p_die('Unexpected token after Expr splice', self.cur_token)
1758 done = True
1759
1760 elif (is_first and self.parse_opts.parse_at() and
1761 self.token_type == Id.Lit_AtLBraceDot):
1762 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1763
1764 elif (is_first and self.parse_opts.parse_at_all() and
1765 self.token_type == Id.Lit_At):
1766 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1767 # at the beginning of a word to be reserved.
1768
1769 # Although, should we relax 'echo @'? It's tempting to make it a
1770 # shortcut for @_argv.
1771 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1772 self.cur_token)
1773
1774 else:
1775 # not a literal with lookahead; append it
1776 parts.append(part)
1777
1778 return done
1779
1780 def _ReadCompoundWord(self, lex_mode):
1781 # type: (lex_mode_t) -> CompoundWord
1782 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1783
1784 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1785 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1786 """
1787 Precondition: Looking at the first token of the first word part
1788 Postcondition: Looking at the token after, e.g. space or operator
1789
1790 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1791 could be an operator delimiting a compound word. Can we change lexer modes
1792 and remove this special case?
1793 """
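# e.g. (illustrative) for ${foo//pat/replace}, 'pat' is read with
# eof_type == Id.Lit_Slash, so an unquoted / terminates the word.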
1794 w = CompoundWord([])
1795 num_parts = 0
1796 brace_count = 0
1797 done = False
1798 is_triple_quoted = None # type: Optional[BoolParamBox]
1799
1800 while not done:
1801 self._GetToken()
1802
1803 allow_done = empty_ok or num_parts != 0
1804 if allow_done and self.token_type == eof_type:
1805 done = True # e.g. for ${foo//pat/replace}
1806
1807 # Keywords like "for" are treated like literals
1808 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1809 Kind.ControlFlow, Kind.BoolUnary,
1810 Kind.BoolBinary):
1811
1812 # Count { and } here; unbalanced braces are a syntax error after the loop
1813 if self.token_type == Id.Lit_LBrace:
1814 brace_count += 1
1815 elif self.token_type == Id.Lit_RBrace:
1816 brace_count -= 1
1817 elif self.token_type == Id.Lit_Dollar:
1818 if not self.parse_opts.parse_dollar():
1819 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1820 next_byte = self.lexer.ByteLookAhead()
1821 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1822 if next_byte == '/':
1823 #log('next_byte %r', next_byte)
1824 pass
1825
1826 p_die('Literal $ should be quoted like \$',
1827 self.cur_token)
1828
1829 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1830 w.parts)
1831
1832 elif self.token_kind == Kind.VSub:
1833 vsub_token = self.cur_token
1834
1835 part = SimpleVarSub(vsub_token) # type: word_part_t
1836 w.parts.append(part)
1837
1838 elif self.token_kind == Kind.ExtGlob:
1839 # With parse_at, we can take over @( to start a command sub: @(seq 3)
1840 # Users can also use ,(*.py|*.sh)
1841 if (self.parse_opts.parse_at() and
1842 self.token_type == Id.ExtGlob_At and num_parts == 0):
1843 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1844 d_quoted=False)
1845 # RARE mutation of tok.id!
1846 cs_part.left_token.id = Id.Left_AtParen
1847 part = cs_part # for type safety
1848
1849 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1850 # a=(one two)x and @arrayfunc(3)x.
1851 self._GetToken()
1852 if self.token_kind not in KINDS_THAT_END_WORDS:
1853 p_die('Unexpected token after @()', self.cur_token)
1854 done = True
1855
1856 else:
1857 part = self._ReadExtGlob()
1858 w.parts.append(part)
1859
1860 elif self.token_kind == Kind.BashRegex:
1861 if self.token_type == Id.BashRegex_LParen: # Opening (
1862 part = self._ReadBashRegexGroup()
1863 w.parts.append(part)
1864 else:
1865 assert self.token_type == Id.BashRegex_AllowedInParens
1866 p_die('Invalid token in bash regex', self.cur_token)
1867
1868 elif self.token_kind == Kind.Left:
1869 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1870 lex_mode == lex_mode_e.ShCommand and
1871 num_parts == 0)
1872
1873 # Save allocation
1874 if try_triple_quote:
1875 is_triple_quoted = BoolParamBox(False)
1876
1877 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1878 w.parts.append(part)
1879
1880 # NOT done yet, will advance below
1881 elif self.token_kind == Kind.Right:
1882 # Still part of the word; will be done on the next iter.
1883 if self.token_type == Id.Right_DoubleQuote:
1884 pass
1885 # Never happens, no PushHint for this case.
1886 #elif self.token_type == Id.Right_DollarParen:
1887 # pass
1888 elif self.token_type == Id.Right_Subshell:
1889 # LEXER HACK for (case x in x) ;; esac )
1890 # Rewind before it's used
1891 assert self.next_lex_mode == lex_mode_e.Undefined
1892 if self.lexer.MaybeUnreadOne():
1893 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1894 self._SetNext(lex_mode)
1895 done = True
1896 else:
1897 done = True
1898
1899 elif self.token_kind == Kind.Ignored:
1900 done = True
1901
1902 else:
1903 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1904 # so to test for ESAC, we can read ) before getting a chance to
1905 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1906 # token and do it again.
1907
1908 # We get Id.Op_RParen at top level: case x in x) ;; esac
1909 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1910 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1911 # Rewind before it's used
1912 assert self.next_lex_mode == lex_mode_e.Undefined
1913 if self.lexer.MaybeUnreadOne():
1914 if self.token_type == Id.Eof_RParen:
1915 # Redo translation
1916 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1917 self._SetNext(lex_mode)
1918
1919 done = True # anything we don't recognize means we're done
1920
1921 if not done:
1922 self._SetNext(lex_mode)
1923 num_parts += 1
1924
1925 if (self.parse_opts.parse_brace() and num_parts > 1 and
1926 brace_count != 0):
1927 # accept { and }, but not foo{
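# e.g. (illustrative) 'echo {' (one part) and the balanced 'echo {a,b}'
# are accepted, but 'echo foo{' dies here.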
1928 p_die(
1929 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1930 loc.Word(w))
1931
1932 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1933 p_die('Unexpected parts after triple quoted string',
1934 loc.WordPart(w.parts[-1]))
1935
1936 if 0:
1937 from _devbuild.gen.syntax_asdl import word_part_str
1938 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1939 WORD_HIST[word_key] += 1
1940 return w
1941
1942 def _ReadArithWord(self):
1943 # type: () -> Optional[word_t]
1944 """ Helper for ReadArithWord() """
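# Returning None means "no word yet, call again", e.g. (illustrative)
# for the ignored spaces inside $(( 1 + 2 )).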
1945 self._GetToken()
1946
1947 if self.token_kind == Kind.Unknown:
1948 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1949 p_die(
1950 'Unexpected token while parsing arithmetic: %r' %
1951 lexer.TokenVal(self.cur_token), self.cur_token)
1952
1953 elif self.token_kind == Kind.Eof:
1954 return self.cur_token
1955
1956 elif self.token_kind == Kind.Ignored:
1957 # Space should be ignored.
1958 self._SetNext(lex_mode_e.Arith)
1959 return None
1960
1961 elif self.token_kind in (Kind.Arith, Kind.Right):
1962 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1963 self._SetNext(lex_mode_e.Arith)
1964 return self.cur_token
1965
1966 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1967 return self._ReadCompoundWord(lex_mode_e.Arith)
1968
1969 else:
1970 raise AssertionError(self.cur_token)
1971
1972 def _ReadWord(self, word_mode):
1973 # type: (lex_mode_t) -> Optional[word_t]
1974 """Helper function for ReadWord()."""
1975
1976 # Change the pseudo lexer mode to a real lexer mode
1977 if word_mode == lex_mode_e.ShCommandFakeBrack:
1978 lex_mode = lex_mode_e.ShCommand
1979 else:
1980 lex_mode = word_mode
1981
1982 self._GetToken()
1983
1984 if self.token_kind == Kind.Eof:
1985 # No advance
1986 return self.cur_token
1987
1988 # Allow Arith for ) at end of for loop?
1989 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1990 self._SetNext(lex_mode)
1991
1992 # Newlines are complicated. See 3x2 matrix in the comment about
1993 # self.multiline and self.newline_state above.
1994 if self.token_type == Id.Op_Newline:
1995 if self.multiline:
1996 if self.newline_state > 1:
1997 # This points at a blank line, but at least it gives the line number
1998 p_die('Invalid blank line in multiline mode',
1999 self.cur_token)
2000 return None
2001
2002 if self.returned_newline: # skip
2003 return None
2004
2005 return self.cur_token
2006
2007 elif self.token_kind == Kind.Right:
2008 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2009 Id.Right_CasePat,
2010 Id.Right_ShArrayLiteral):
2011 raise AssertionError(self.cur_token)
2012
2013 self._SetNext(lex_mode)
2014 return self.cur_token
2015
2016 elif self.token_kind in (Kind.Ignored, Kind.WS):
2017 self._SetNext(lex_mode)
2018 return None
2019
2020 else:
2021 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2022 Kind.Left, Kind.KW, Kind.ControlFlow,
2023 Kind.BoolUnary, Kind.BoolBinary,
2024 Kind.ExtGlob,
2025 Kind.BashRegex), 'Unhandled token kind'
2026
2027 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2028 self.parse_opts.parse_bracket() and
2029 self.token_type == Id.Lit_LBracket):
2030 # Change [ from Kind.Lit -> Kind.Op
2031 # So CommandParser can treat
2032 # assert [42 === x]
2033 # like
2034 # json write (x)
2035 bracket_word = self.cur_token
2036 bracket_word.id = Id.Op_LBracket
2037
2038 self._SetNext(lex_mode)
2039 return bracket_word
2040
2041 # We're beginning a word. If we see Id.Lit_Pound, change to
2042 # lex_mode_e.Comment and read until end of line.
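# e.g. (illustrative) given 'echo hi  # greet', the '# greet' span comes
# back as Id.Ignored_Comment and we return None so ReadWord() retries.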
2043 if self.token_type == Id.Lit_Pound:
2044 self._SetNext(lex_mode_e.Comment)
2045 self._GetToken()
2046
2047 # NOTE: The # could be the last character in the file. It can't be
2048 # Eof_{RParen,Backtick} because #) and #` are comments.
2049 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2050 self.cur_token
2051
2052 # The next iteration will go into Kind.Ignored and set lex state to
2053 # lex_mode_e.ShCommand/etc.
2054 return None # tell ReadWord() to try again after comment
2055
2056 elif self.token_type == Id.Lit_TPound: ### doc comment
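# e.g. (illustrative) a '### summary' line; it's surfaced as a token
# only when EmitDocToken(True) was called.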
2057 self._SetNext(lex_mode_e.Comment)
2058 self._GetToken()
2059
2060 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2061 return self.cur_token
2062
2063 return None # tell ReadWord() to try again after comment
2064
2065 else:
2066 # r'' u'' b''
2067 if (self.token_type == Id.Lit_Chars and
2068 self.lexer.LookAheadOne(
2069 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2070
2071 # When shopt -s parse_ysh_string:
2072 # echo r'hi' is like echo 'hi'
2073 #
2074 # echo u'\u{3bc}' b'\yff' works
2075
2076 tok = self.cur_token
2077 if self.parse_opts.parse_ysh_string():
2078 if lexer.TokenEquals(tok, 'r'):
2079 left_id = Id.Left_RSingleQuote
2080 elif lexer.TokenEquals(tok, 'u'):
2081 left_id = Id.Left_USingleQuote
2082 elif lexer.TokenEquals(tok, 'b'):
2083 left_id = Id.Left_BSingleQuote
2084 else:
2085 left_id = Id.Undefined_Tok
2086
2087 if left_id != Id.Undefined_Tok:
2088 # skip the r, and then 'foo' will be read as normal
2089 self._SetNext(lex_mode_e.ShCommand)
2090
2091 self._GetToken()
2092 assert self.token_type == Id.Left_SingleQuote, self.token_type
2093
2094 # Read the word in a different lexer mode
2095 return self._ReadYshSingleQuoted(left_id)
2096
2097 return self._ReadCompoundWord(lex_mode)
2098
2099 def ParseVarRef(self):
2100 # type: () -> BracedVarSub
2101 """DYNAMIC parsing of what's inside ${!ref}
2102
2103 # Same as VarOf production
2104 VarRefExpr = VarOf EOF
2105 """
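# e.g. (illustrative) if ref='a[0]', then ${!ref} re-parses the STRING
# 'a[0]' at runtime, as if it were ${a[0]}.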
2106 self._SetNext(lex_mode_e.VSub_1)
2107
2108 self._GetToken()
2109 if self.token_kind != Kind.VSub:
2110 p_die('Expected var name', self.cur_token)
2111
2112 part = self._ParseVarOf()
2113 # NOTE: no ${ } means no part.left and part.right
2114 part.left = part.token # cheat to make test pass
2115 part.right = part.token
2116
2117 self._GetToken()
2118 if self.token_type != Id.Eof_Real:
2119 p_die('Expected end of var ref expression', self.cur_token)
2120 return part
2121
2122 def LookPastSpace(self):
2123 # type: () -> Id_t
2124 """Look ahead to the next token.
2125
2126 For the CommandParser to recognize
2127 array= (1 2 3)
2128 YSH for ( versus bash for ((
2129 YSH if ( versus if test
2130 YSH while ( versus while test
2131 YSH bare assignment 'grep =' versus 'grep foo'
2132 """
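# e.g. (illustrative) having read 'grep' and buffered a space token,
# peeking at '=' distinguishes the bare assignment 'grep =' from the
# command 'grep foo'.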
2133 assert self.token_type != Id.Undefined_Tok
2134 if self.cur_token.id == Id.WS_Space:
2135 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2136 else:
2137 id_ = self.cur_token.id
2138 return id_
2139
2140 def LookAheadFuncParens(self):
2141 # type: () -> bool
2142 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2143 assert self.token_type != Id.Undefined_Tok
2144
2145 # We have to handle 2 cases because we buffer a token
2146 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2147 return self.lexer.LookAheadFuncParens(1) # go back one char
2148
2149 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2150 return self.lexer.LookAheadFuncParens(0)
2151
2152 else:
2153 return False
2154
2155 def ReadWord(self, word_mode):
2156 # type: (lex_mode_t) -> word_t
2157 """Read the next word, using the given lexer mode.
2158
2159 This is a stateful wrapper for the stateless _ReadWord function.
2160 """
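# Note: _ReadWord() returns None for tokens that don't yield a word
# (comments, skipped whitespace, some newlines), hence the retry loop;
# buffered_word holds a word pushed back by the pgen2 integration.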
2161 assert word_mode in (lex_mode_e.ShCommand,
2162 lex_mode_e.ShCommandFakeBrack,
2163 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2164
2165 if self.buffered_word: # For integration with pgen2
2166 w = self.buffered_word
2167 self.buffered_word = None
2168 else:
2169 while True:
2170 w = self._ReadWord(word_mode)
2171 if w is not None:
2172 break
2173
2174 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2175 return w
2176
2177 def ReadArithWord(self):
2178 # type: () -> word_t
2179 while True:
2180 w = self._ReadArithWord()
2181 if w is not None:
2182 break
2183 return w
2184
2185 def ReadHereDocBody(self, parts):
2186 # type: (List[word_part_t]) -> None
2187 """
2188 A here doc is like a double quoted context, except " isn't special.
2189 """
2190 self._ReadLikeDQ(None, False, parts)
2191 # Returns nothing
2192
2193 def ReadForPlugin(self):
2194 # type: () -> CompoundWord
2195 """For $PS1, $PS4, etc.
2196
2197 This is just like reading a here doc line. "\n" is allowed, as
2198 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2199 """
2200 w = CompoundWord([])
2201 self._ReadLikeDQ(None, False, w.parts)
2202 return w
2203
2204 def EmitDocToken(self, b):
2205 # type: (bool) -> None
2206 self.emit_doc_token = b
2207
2208 def Multiline(self, b):
2209 # type: (bool) -> None
2210 self.multiline = b
2211
2212
2213if 0:
2214 import collections
2215 WORD_HIST = collections.Counter()
2216
2217# vim: sw=4