OILS / osh / word_parse.py

1# Copyright 2016 Andy Chu. All rights reserved.
2# Licensed under the Apache License, Version 2.0 (the "License");
3# you may not use this file except in compliance with the License.
4# You may obtain a copy of the License at
5#
6# http://www.apache.org/licenses/LICENSE-2.0
7"""
8word_parse.py - Parse the shell word language.
9
10Hairy example:
11
12 hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
13
14Substitutions can be nested, but which inner subs are allowed depends on the
15outer sub. Notes:
16
17lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
18 All subs and quotes are allowed:
19 $v ${v} $() `` $(()) '' "" $'' $"" <() >()
20
21lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
22 Var, Command, Arith, but no quotes.
23 $v ${v} $() `` $(())
24 No process substitution.
25
26lex_mode_e.Arith
27 Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
28 allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
29 need those for associative array indexing.
30
31lex_mode_e.VSub_ArgUnquoted
32 Like ShCommand, everything is allowed (even process substitutions), but we
33 stop at }, and space is SIGNIFICANT.
34
35 Example: ${a:- b }
36
37 ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
38 ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
39
40lex_mode_e.VSub_ArgDQ
41 In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
42 "${x:-"default"}".
43
44 In contrast, VSub_ArgUnquoted respects single quotes and process
45 substitution.
46
47 It's weird that double quotes are allowed. Space is also significant here,
48 e.g. "${x:-a "b"}".
49"""
50
51from _devbuild.gen import grammar_nt
52from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
53from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
54from _devbuild.gen.syntax_asdl import (
55 BoolParamBox,
56 Token,
57 SimpleVarSub,
58 loc,
59 source,
60 DoubleQuoted,
61 SingleQuoted,
62 BracedVarSub,
63 CommandSub,
64 ShArrayLiteral,
65 AssocPair,
66 bracket_op,
67 bracket_op_t,
68 suffix_op,
69 suffix_op_t,
70 rhs_word,
71 rhs_word_e,
72 rhs_word_t,
73 word_e,
74 word_t,
75 CompoundWord,
76 word_part,
77 word_part_t,
78 y_lhs_e,
79 arith_expr_t,
80 command,
81 expr,
82 expr_e,
83 expr_t,
84 pat_t,
85 ArgList,
86 Proc,
87 Func,
88 Subscript,
89 Attribute,
90 arith_expr,
91)
92from core import alloc
93from core.error import p_die
94from mycpp.mylib import log
95from core import pyutil
96from core import ui
97from frontend import consts
98from frontend import lexer
99from frontend import reader
100from osh import tdop
101from osh import arith_parse
102from osh import braces
103from osh import word_
104from osh import word_compile
105from mycpp.mylib import tagswitch
106
107from typing import List, Optional, Tuple, cast
108from typing import TYPE_CHECKING
109if TYPE_CHECKING:
110 from frontend.lexer import Lexer
111 from frontend.parse_lib import ParseContext
112 from frontend.reader import _Reader
113 from osh.cmd_parse import VarChecker
114
115unused1 = log
116unused2 = Id_str
117
118KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
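# e.g. in 'echo foo;' the word 'foo' is ended by ';' (Kind.Op); words are
# also ended by whitespace, EOF, or tokens like ')' (Kind.Right)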
119
120
121class WordEmitter(object):
122 """Common interface for [ and [["""
123
124 def __init__(self):
125 # type: () -> None
126 """Empty constructor for mycpp."""
127 pass
128
129 def ReadWord(self, lex_mode):
130 # type: (lex_mode_t) -> word_t
131 raise NotImplementedError()
132
133
134class WordParser(WordEmitter):
135
136 def __init__(self, parse_ctx, lexer, line_reader):
137 # type: (ParseContext, Lexer, _Reader) -> None
138 self.parse_ctx = parse_ctx
139 self.lexer = lexer
140 self.line_reader = line_reader
141 self.arena = line_reader.arena
142
143 self.parse_opts = parse_ctx.parse_opts
144 self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
145 self.parse_opts)
146 self.Reset()
147
148 def Init(self, lex_mode):
149 # type: (lex_mode_t) -> None
150 """Used to parse arithmetic, see ParseContext."""
151 self.next_lex_mode = lex_mode
152
153 def Reset(self):
154 # type: () -> None
155 """Called by interactive loop."""
156 # For _GetToken()
157 self.cur_token = None # type: Token
158 self.token_kind = Kind.Undefined
159 self.token_type = Id.Undefined_Tok
160
161 self.next_lex_mode = lex_mode_e.ShCommand
162
163 # Boolean mutated by CommandParser via word_.ctx_EmitDocToken. For ### doc
164 # comments
165 self.emit_doc_token = False
166 # Boolean mutated by CommandParser via word_.ctx_Multiline. '...' starts
167 # multiline mode.
168 self.multiline = False
169
170 # For detecting invalid \n\n in multiline mode. Counts what we got
171 # directly from the lexer.
172 self.newline_state = 0
173 # For consolidating \n\n -> \n for the CALLER. This simplifies the parsers
174 # that consume words.
175 self.returned_newline = False
176
177 # For integration with pgen2
178 self.buffered_word = None # type: word_t
179
180 def _GetToken(self):
181 # type: () -> None
182 """Call this when you need to make a decision based on any of:
183
184 self.token_type
185 self.token_kind
186 self.cur_token
187 """
188 if self.next_lex_mode == lex_mode_e.Undefined:
189 return # _SetNext() not called, so do nothing
190
191 is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
192 real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)
193
194 self.cur_token = self.lexer.Read(real_mode)
195
196 # MUTATE TOKEN for fake lexer mode.
197 # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
198 if (is_fake and self.cur_token.id
199 in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
200 self.cur_token.id = Id.Lit_Chars
201
202 self.token_type = self.cur_token.id
203 self.token_kind = consts.GetKind(self.token_type)
204
205 # number of consecutive newlines, ignoring whitespace
206 if self.token_type == Id.Op_Newline:
207 self.newline_state += 1
208 elif self.token_kind != Kind.WS:
209 self.newline_state = 0
210
211 self.parse_ctx.trail.AppendToken(self.cur_token) # For completion
212 self.next_lex_mode = lex_mode_e.Undefined
213
214 def _SetNext(self, lex_mode):
215 # type: (lex_mode_t) -> None
216 """Set the next lex state, but don't actually read a token.
217
218 We need this for proper interactive parsing.
219 """
220 self.next_lex_mode = lex_mode
221
222 def _ReadVarOpArg(self, arg_lex_mode):
223 # type: (lex_mode_t) -> rhs_word_t
224
225 # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
226 # valid, even when unquoted.
227 self._SetNext(arg_lex_mode)
228 self._GetToken()
229
230 w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
231 True) # empty_ok
232
233 # If the Compound has no parts, and we're in a double-quoted VarSub
234 # arg, and empty_ok, then return Empty. This is so it can evaluate to
235 # the empty string and not get elided.
236 #
237 # Examples:
238 # - "${s:-}", "${s/%pat/}"
239 # It's similar to LooksLikeShAssignment where we turn x= into x=''. And it
240 # has the same potential problem of not having Token location info.
241 #
242 # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
243 # return a Compound with no parts, which is explicitly checked with a
244 # custom error message.
245 if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
246 return rhs_word.Empty
247
248 return w
249
250 def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
251 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
252 """Return a CompoundWord.
253
254 Helper function for _ReadVarOpArg and used directly by
255 _ReadPatSubVarOp.
256 """
257 w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
258 #log('w %s', w)
259 tilde = word_.TildeDetect(w)
260 if tilde:
261 w = tilde
262 return w
263
264 def _ReadSliceVarOp(self):
265 # type: () -> suffix_op.Slice
266 """
267 Looking at the token after the first ':'
268
269 ArithExpr? (':' ArithExpr? )? '}'
270 """
271 self._NextNonSpace()
272
273 cur_id = self.token_type
274
275 if cur_id in (Id.Arith_RBrace, Id.Arith_Colon): # ${a:} or ${a::}
276 begin = arith_expr.EmptyZero # type: arith_expr_t
277 else:
278 begin = self.a_parser.Parse()
279 cur_id = self.a_parser.CurrentId() # advance
280
281 if cur_id == Id.Arith_RBrace: # ${a:1} or ${@:1}
282 no_length = None # type: Optional[arith_expr_t] # No length specified
283 return suffix_op.Slice(begin, no_length)
284
285 elif cur_id == Id.Arith_Colon: # ${a:1:} or ${@:1:}
286 self._NextNonSpace()
287
288 if self.token_type != Id.Arith_RBrace:
289 length = self._ReadArithExpr(Id.Arith_RBrace)
290 else:
291 # quirky bash behavior:
292 # ${a:1:} or ${a::} means length ZERO
293 # but ${a:1} or ${a:} means length N
294 if self.parse_opts.strict_parse_slice():
295 p_die(
296 "Explicit slice length required - zero or N (strict_parse_slice)",
297 self.cur_token)
298
299 length = arith_expr.EmptyZero
300
301 return suffix_op.Slice(begin, length)
302
303 else:
304 p_die("Expected : or } in slice", self.cur_token)
305
306 raise AssertionError() # for MyPy
307
308 def _ReadPatSubVarOp(self):
309 # type: () -> suffix_op.PatSub
310 """Looking at the first '/' after VarOf:
311
312 VarSub = ...
313 | VarOf '/' Match ( '/' WORD? )?
314 Match = '/' WORD # can't be empty
315 | '#' WORD? # may be empty
316 | '%' WORD?
317 """
318 slash_tok = self.cur_token # location info
319 replace_mode = Id.Undefined_Tok # bizarre syntax / # %
320
321 self._SetNext(lex_mode_e.VSub_ArgUnquoted) # advance past /
322
323 self._GetToken()
324 if self.token_type == Id.Right_DollarBrace:
325 pat = CompoundWord([])
326 return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
327 slash_tok)
328
329 if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
330 replace_mode = self.token_type
331 self._SetNext(lex_mode_e.VSub_ArgUnquoted)
332
333 # Bash quirk:
334 # echo ${x/#/replace} has an empty pattern
335 # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
336 empty_ok = replace_mode != Id.Lit_Slash
337 pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
338 empty_ok)
339 #log('pat 1 %r', pat)
340
341 if self.token_type == Id.Lit_Slash:
342 # read until }
343 replace = self._ReadVarOpArg(
344 lex_mode_e.VSub_ArgUnquoted) # type: rhs_word_t
345 #log('r 1 %r', replace)
346 else:
347 # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
348 replace = rhs_word.Empty
349
350 self._GetToken()
351 if self.token_type != Id.Right_DollarBrace:
352 # This happens on invalid code
353 p_die(
354 "Expected } after replacement string, got %s" %
355 ui.PrettyId(self.token_type), self.cur_token)
356
357 return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
358
359 def _ReadSubscript(self):
360 # type: () -> bracket_op_t
361 """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
362 # Lookahead to see if we get @ or *. Otherwise read a full arithmetic
363 # expression.
364 next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
365 if next_id in (Id.Lit_At, Id.Arith_Star):
366 op = bracket_op.WholeArray(next_id) # type: bracket_op_t
367
368 self._SetNext(lex_mode_e.Arith) # skip past [
369 self._GetToken()
370 self._SetNext(lex_mode_e.Arith) # skip past @
371 self._GetToken()
372 else:
373 self._SetNext(lex_mode_e.Arith) # skip past [
374 anode = self._ReadArithExpr(Id.Arith_RBracket)
375 op = bracket_op.ArrayIndex(anode)
376
377 if self.token_type != Id.Arith_RBracket: # Should be looking at ]
378 p_die('Expected ] to close subscript', self.cur_token)
379
380 self._SetNext(lex_mode_e.VSub_2) # skip past ]
381 self._GetToken() # Needed to be in the same spot as no subscript
382
383 return op
384
385 def _ParseVarOf(self):
386 # type: () -> BracedVarSub
387 """
388 VarOf = NAME Subscript?
389 | NUMBER # no subscript allowed, none of these are arrays
390 # ${@[1]} doesn't work, even though slicing does
391 | VarSymbol
392 """
393 self._GetToken()
394 name_token = self.cur_token
395 self._SetNext(lex_mode_e.VSub_2)
396
397 self._GetToken() # Check for []
398 if self.token_type == Id.VOp2_LBracket:
399 bracket_op = self._ReadSubscript()
400 else:
401 bracket_op = None
402
403 part = BracedVarSub.CreateNull()
404 part.token = name_token
405 part.var_name = lexer.TokenVal(name_token)
406 part.bracket_op = bracket_op
407 return part
408
409 def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
410 # type: (lex_mode_t, bool) -> BracedVarSub
411 """Start parsing at the op -- we already skipped past the name."""
412 part = self._ParseVarOf()
413
414 self._GetToken()
415 if self.token_type == Id.Right_DollarBrace:
416 return part # no ops
417
418 op_kind = self.token_kind
419
420 if op_kind == Kind.VTest:
421 tok = self.cur_token
422 arg_word = self._ReadVarOpArg(arg_lex_mode)
423 if self.token_type != Id.Right_DollarBrace:
424 p_die('Expected } to close ${', self.cur_token)
425
426 part.suffix_op = suffix_op.Unary(tok, arg_word)
427
428 elif op_kind == Kind.VOpYsh:
429 tok = self.cur_token
430 arg_word = self._ReadVarOpArg(arg_lex_mode)
431 if self.token_type != Id.Right_DollarBrace:
432 p_die('Expected } to close ${', self.cur_token)
433
434 UP_arg_word = arg_word
435 with tagswitch(arg_word) as case:
436 if case(rhs_word_e.Empty):
437 pass
438 elif case(rhs_word_e.Compound):
439 arg_word = cast(CompoundWord, UP_arg_word)
440 # This handles ${x|html} and ${x %.3f} now
441 # However I think ${x %.3f} should be statically parsed? It can enter
442 # the printf lexer modes.
443 ok, arg, quoted = word_.StaticEval(arg_word)
444 if not ok or quoted:
445 p_die('Expected a constant argument',
446 loc.Word(arg_word))
447
448 part.suffix_op = suffix_op.Static(tok, arg)
449
450 elif op_kind == Kind.VOp0:
451 part.suffix_op = self.cur_token # Nullary
452 self._SetNext(lex_mode_e.VSub_2) # Expecting }
453 self._GetToken()
454
455 elif op_kind == Kind.VOp1: # % %% # ## etc.
456 tok = self.cur_token
457 # Weird exception that all shells have: these operators take a glob
458 # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
459 arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
460 if self.token_type != Id.Right_DollarBrace:
461 p_die('Expected } to close ${', self.cur_token)
462
463 part.suffix_op = suffix_op.Unary(tok, arg_word)
464
465 elif op_kind == Kind.VOp2: # / : [ ]
466 if self.token_type == Id.VOp2_Slash:
467 patsub_op = self._ReadPatSubVarOp() # type: suffix_op_t
468 part.suffix_op = patsub_op
469
470 # Checked by the method above
471 assert self.token_type == Id.Right_DollarBrace, self.cur_token
472
473 elif self.token_type == Id.VOp2_Colon:
474 part.suffix_op = self._ReadSliceVarOp()
475 # NOTE: } in arithmetic mode.
476 if self.token_type != Id.Arith_RBrace:
477 # Token seems off; it doesn't point to X in ${a:1:2 X
478 p_die('Expected } to close ${', self.cur_token)
479
480 else:
481 # TODO: Does this ever happen?
482 p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)
483
484 elif op_kind == Kind.VOp3: # ${prefix@} etc.
485 if allow_query:
486 part.suffix_op = self.cur_token # Nullary
487 self._SetNext(lex_mode_e.VSub_2) # Expecting }
488 self._GetToken()
489 else:
490 p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)
491
492 # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
493 # mode. It's redundantly checked above.
494 if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
495 # ${a.} or ${!a.}
496 p_die('Expected } to close ${', self.cur_token)
497
498 # Now look for ops
499 return part
500
501 def _ReadZshVarSub(self, left_token):
502 # type: (Token) -> word_part.ZshVarSub
503
504 self._SetNext(lex_mode_e.VSub_Zsh) # Move past ${(foo)
505
506 # Can be empty
507 w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
508 True)
509 self._GetToken()
510 return word_part.ZshVarSub(left_token, w, self.cur_token)
511
512 def ReadBracedVarSub(self, left_token):
513 # type: (Token) -> Tuple[BracedVarSub, Token]
514 """ For YSH expressions like var x = ${x:-"default"}. """
515 part = self._ReadBracedVarSub(left_token, d_quoted=False)
516 last_token = self.cur_token
517 return part, last_token
518
519 def _ReadBracedVarSub(self, left_token, d_quoted):
520 # type: (Token, bool) -> BracedVarSub
521 """For the ${} expression language.
522
523 NAME = [a-zA-Z_][a-zA-Z0-9_]*
524 NUMBER = [0-9]+ # ${10}, ${11}, ...
525
526 Subscript = '[' ('@' | '*' | ArithExpr) ']'
527 VarSymbol = '!' | '@' | '#' | ...
528 VarOf = NAME Subscript?
529 | NUMBER # no subscript allowed, none of these are arrays
530 # ${@[1]} doesn't work, even though slicing does
531 | VarSymbol
532
533 NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a' # VOp0
534
535 TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
536 STRIP_OP = '#' | '##' | '%' | '%%'
537 CASE_OP = ',' | ',,' | '^' | '^^'
538 UnaryOp = TEST_OP | STRIP_OP | CASE_OP
539
540 YSH_UNARY = '|' | ' ' # ${x|html} and ${x %.3f}.
541 # SPACE is operator not %
542 Match = ('/' | '#' | '%') WORD # match all / prefix / suffix
543 VarExpr = VarOf
544 | VarOf NULLARY_OP
545 | VarOf UnaryOp WORD
546 | VarOf YSH_UNARY STATIC_WORD
547 | VarOf ':' ArithExpr (':' ArithExpr )?
548 | VarOf '/' Match '/' WORD
549
550 LengthExpr = '#' VarOf # can't apply operators after length
551
552 RefOrKeys = '!' VarExpr # CAN apply operators after a named ref
553 # ${!ref[0]} vs ${!keys[@]} resolved later
554
555 PrefixQuery = '!' NAME ('*' | '@') # list variable names with a prefix
556
557 BuiltinSub = '.' WORD+ # ${.myproc 'builtin' $sub}
558
559 VarSub = LengthExpr
560 | RefOrKeys
561 | PrefixQuery
562 | VarExpr
563 | BuiltinSub
564
565 NOTES:
566 - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
567 slicing ${a:x+1:y+2}
568 - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
569 - @ and * are technically arithmetic expressions in this implementation
570 - We don't account for bash 4.4: ${param@operator} -- Q E P A a. Note that
571 it's also vectorized.
572
573 Strictness over bash:
574 - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
575 grammar
576 - ! and # prefixes can't be composed, even though named refs can be
577 composed with other operators
578 - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
579 a prefix, and it can also be a literal part of WORD.
580
581 From the parser's point of view, the prefix # can't be combined with
582 UnaryOp/slicing/matching, and the ! can. However:
583
584 - ${a[@]:1:2} is not allowed
585 - ${#a[@]:1:2} is allowed, but gives the wrong answer
586 """
587 if d_quoted:
588 arg_lex_mode = lex_mode_e.VSub_ArgDQ
589 else:
590 arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
591
592 self._SetNext(lex_mode_e.VSub_1)
593 self._GetToken()
594
595 ty = self.token_type
596 first_tok = self.cur_token
597
598 if ty == Id.VSub_Pound:
599 # Disambiguate
600 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
601 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
602 # e.g. a name, '#' is the prefix
603 self._SetNext(lex_mode_e.VSub_1)
604 part = self._ParseVarOf()
605
606 self._GetToken()
607 if self.token_type != Id.Right_DollarBrace:
608 p_die('Expected } after length expression', self.cur_token)
609
610 part.prefix_op = first_tok
611
612 else: # not a prefix, '#' is the variable
613 part = self._ParseVarExpr(arg_lex_mode)
614
615 elif ty == Id.VSub_Bang:
616 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
617 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
618 # e.g. a name, '!' is the prefix
619 # ${!a} -- this is a ref
620 # ${!3} -- this is ref
621 # ${!a[1]} -- this is a ref
622 # ${!a[@]} -- this is a keys
623 # No lookahead -- do it in a second step, or at runtime
624 self._SetNext(lex_mode_e.VSub_1)
625 part = self._ParseVarExpr(arg_lex_mode, allow_query=True)
626
627 part.prefix_op = first_tok
628
629 else: # not a prefix, '!' is the variable
630 part = self._ParseVarExpr(arg_lex_mode)
631
632 elif ty == Id.VSub_Dot:
633 # Note: this will become a new builtin_sub type, so this method must
634 # return word_part_t rather than BracedVarSub. I don't think that
635 # should cause problems.
636 p_die('TODO: ${.myproc builtin sub}', self.cur_token)
637
638 # VS_NAME, VS_NUMBER, symbol that isn't # or !
639 elif self.token_kind == Kind.VSub:
640 part = self._ParseVarExpr(arg_lex_mode)
641
642 else:
643 # e.g. ${^}
644 p_die('Unexpected token in ${}', self.cur_token)
645
646 part.left = left_token # attach the argument
647 part.right = self.cur_token
648 return part
649
650 def _ReadSingleQuoted(self, left_token, lex_mode):
651 # type: (Token, lex_mode_t) -> SingleQuoted
652 """Internal method to read a word_part."""
653 tokens = [] # type: List[Token]
654 # In command mode, we never disallow backslashes like '\'
655 right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
656 False)
657 sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
658 node = SingleQuoted(left_token, sval, right_quote)
659 return node
660
661 def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
662 # type: (lex_mode_t, Token, List[Token], bool) -> Token
663 """Appends to out_tokens; returns last token
664
665 Used by expr_parse.py
666 """
667 # TODO: Remove and use out_tokens
668 tokens = [] # type: List[Token]
669
670 # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
671 no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
672
673 expected_end_tokens = 3 if left_token.id in (
674 Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
675 Id.Left_BTSingleQuote) else 1
676 num_end_tokens = 0
677
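# e.g. r'''...''' ends only at three closing quotes in a row; a lone '
# inside the body resets num_end_tokens below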
678 while num_end_tokens < expected_end_tokens:
679 self._SetNext(lex_mode)
680 self._GetToken()
681
682 # Kind.Char is emitted in lex_mode_e.SQ_C
683 if self.token_kind in (Kind.Lit, Kind.Char):
684 tok = self.cur_token
685 # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
686 # r'one\two' or c'one\\two'
687 if no_backslashes and lexer.TokenContains(tok, '\\'):
688 p_die(
689 r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
690 tok)
691
692 if is_ysh_expr:
693 # Disallow var x = $'\001'. Arguably we don't need these
694 # checks because u'\u{1}' is the way to write it.
695 if self.token_type == Id.Char_Octal3:
696 p_die(
697 r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
698 tok)
699
700 if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
701 # disallow \xH
702 p_die(
703 r'Invalid hex escape in YSH string (must be \xHH)',
704 tok)
705
706 tokens.append(tok)
707
708 elif self.token_kind == Kind.Unknown:
709 tok = self.cur_token
710 assert tok.id == Id.Unknown_Backslash, tok
711
712 # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
713 if is_ysh_expr or not self.parse_opts.parse_backslash():
714 p_die(
715 "Invalid char escape in C-style string literal (OILS-ERR-11)",
716 tok)
717
718 tokens.append(tok)
719
720 elif self.token_kind == Kind.Eof:
721 p_die('Unexpected EOF in single-quoted string that began here',
722 left_token)
723
724 elif self.token_kind == Kind.Right:
725 # assume Id.Right_SingleQuote
726 num_end_tokens += 1
727 tokens.append(self.cur_token)
728
729 else:
730 raise AssertionError(self.cur_token)
731
732 if self.token_kind != Kind.Right:
733 num_end_tokens = 0 # we need three in a ROW
734
735 if expected_end_tokens == 1:
736 tokens.pop()
737 elif expected_end_tokens == 3: # Get rid of spurious end tokens
738 tokens.pop()
739 tokens.pop()
740 tokens.pop()
741
742 # Remove space from ''' r''' u''' b''' in both expression mode and command mode
743 if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
744 Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
745 word_compile.RemoveLeadingSpaceSQ(tokens)
746
747 # Validation after lexing - same 2 checks in j8.LexerDecoder
748 is_u_string = left_token.id in (Id.Left_USingleQuote,
749 Id.Left_UTSingleQuote)
750
751 for tok in tokens:
752 # u'\yff' is not valid, but b'\yff' is
753 if is_u_string and tok.id == Id.Char_YHex:
754 p_die(
755 r"%s escapes not allowed in u'' strings" %
756 lexer.TokenVal(tok), tok)
757
758 out_tokens.extend(tokens)
759 return self.cur_token
760
761 def _ReadDoubleQuotedLeftParts(self):
762 # type: () -> word_part_t
763 """Read substitution parts in a double quoted context."""
764 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
765 return self._ReadCommandSub(self.token_type, d_quoted=True)
766
767 if self.token_type == Id.Left_DollarBrace:
768 return self._ReadBracedVarSub(self.cur_token, d_quoted=True)
769
770 if self.token_type == Id.Left_DollarDParen:
771 return self._ReadArithSub()
772
773 if self.token_type == Id.Left_DollarBracket:
774 return self._ReadExprSub(lex_mode_e.DQ)
775
776 raise AssertionError(self.cur_token)
777
778 def _ReadYshSingleQuoted(self, left_id):
779 # type: (Id_t) -> CompoundWord
780 """Read YSH style strings
781
782 r'' u'' b''
783 r''' ''' u''' ''' b''' '''
784 """
785 #log('BEF self.cur_token %s', self.cur_token)
786 if left_id == Id.Left_RSingleQuote:
787 lexer_mode = lex_mode_e.SQ_Raw
788 triple_left_id = Id.Left_RTSingleQuote
789 elif left_id == Id.Left_USingleQuote:
790 lexer_mode = lex_mode_e.J8_Str
791 triple_left_id = Id.Left_UTSingleQuote
792 elif left_id == Id.Left_BSingleQuote:
793 lexer_mode = lex_mode_e.J8_Str
794 triple_left_id = Id.Left_BTSingleQuote
795 else:
796 raise AssertionError(left_id)
797
798 # Needed for syntax checks
799 left_tok = self.cur_token
800 left_tok.id = left_id
801
802 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
803
804 if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
805 self._SetNext(lex_mode_e.ShCommand)
806 self._GetToken()
807
808 assert self.token_type == Id.Left_SingleQuote
809 # HACK: magically transform the third ' in u''' to
810 # Id.Left_UTSingleQuote, so that ''' is the terminator
811 left_tok = self.cur_token
812 left_tok.id = triple_left_id
813
814 # Handles stripping leading whitespace
815 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
816
817 # Advance and validate
818 self._SetNext(lex_mode_e.ShCommand)
819
820 self._GetToken()
821 if self.token_kind not in KINDS_THAT_END_WORDS:
822 p_die('Unexpected token after YSH single-quoted string',
823 self.cur_token)
824
825 return CompoundWord([sq_part])
826
827 def _ReadUnquotedLeftParts(self, triple_out):
828 # type: (Optional[BoolParamBox]) -> word_part_t
829 """Read substitutions and quoted strings (for lex_mode_e.ShCommand).
830
831 If triple_out is set, then we try parsing triple quoted strings,
832 and set its value to True if we got one.
833 """
834 if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
835 # Note: $"" is a synonym for "". It might make sense if it added
836 # \n \0 \x00 \u{123} etc. But that's not what bash does!
837 dq_part = self._ReadDoubleQuoted(self.cur_token)
838 # Got empty word "" and there's a " after
839 if (triple_out and len(dq_part.parts) == 0 and
840 self.lexer.ByteLookAhead() == '"'):
841
842 self._SetNext(lex_mode_e.ShCommand)
843 self._GetToken()
844 # HACK: magically transform the third " in """ to
845 # Id.Left_TDoubleQuote, so that """ is the terminator
846 left_dq_token = self.cur_token
847 left_dq_token.id = Id.Left_TDoubleQuote
848 triple_out.b = True # let caller know we got it
849 return self._ReadDoubleQuoted(left_dq_token)
850
851 return dq_part
852
853 if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
854 Id.Left_DollarSingleQuote):
855 if self.token_type == Id.Left_SingleQuote:
856 lexer_mode = lex_mode_e.SQ_Raw
857 triple_left_id = Id.Left_TSingleQuote
858 elif self.token_type == Id.Left_RSingleQuote:
859 lexer_mode = lex_mode_e.SQ_Raw
860 triple_left_id = Id.Left_RTSingleQuote
861 else:
862 lexer_mode = lex_mode_e.SQ_C
863 # there is no such thing as $'''
864 triple_left_id = Id.Undefined_Tok
865
866 sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
867
868 # Got empty '' or r'' and there's a ' after
869 # u'' and b'' are handled in _ReadYshSingleQuoted
870 if (triple_left_id != Id.Undefined_Tok and
871 triple_out is not None and len(sq_part.sval) == 0 and
872 self.lexer.ByteLookAhead() == "'"):
873
874 self._SetNext(lex_mode_e.ShCommand)
875 self._GetToken()
876
877 # HACK: magically transform the third ' in ''' to
878 # Id.Left_TSingleQuote, so that ''' is the terminator
879 left_sq_token = self.cur_token
880 left_sq_token.id = triple_left_id
881
882 triple_out.b = True # let caller know we got it
883 return self._ReadSingleQuoted(left_sq_token, lexer_mode)
884
885 return sq_part
886
887 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
888 Id.Left_ProcSubIn, Id.Left_ProcSubOut):
889 return self._ReadCommandSub(self.token_type, d_quoted=False)
890
891 if self.token_type == Id.Left_DollarBrace:
892 return self._ReadBracedVarSub(self.cur_token, d_quoted=False)
893
894 if self.token_type == Id.Left_DollarDParen:
895 return self._ReadArithSub()
896
897 if self.token_type == Id.Left_DollarBracket:
898 return self._ReadExprSub(lex_mode_e.ShCommand)
899
900 if self.token_type == Id.Left_DollarBraceZsh:
901 return self._ReadZshVarSub(self.cur_token)
902
903 raise AssertionError(self.cur_token)
904
905 def _ReadExtGlob(self):
906 # type: () -> word_part.ExtGlob
907 """
908 Grammar:
909 Item = CompoundWord | EPSILON # important: @(foo|) is allowed
910 LEFT = '@(' | '*(' | '+(' | '?(' | '!('
911 RIGHT = ')'
912 ExtGlob = LEFT (Item '|')* Item RIGHT # ITEM may be empty
913 Compound includes ExtGlob
914 """
915 left_token = self.cur_token
916 right_token = None # type: Token
917 arms = [] # type: List[CompoundWord]
918
919 self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
920 self._SetNext(lex_mode_e.ExtGlob) # advance past LEFT
921
922 read_word = False # did we just read a word? To handle @(||).
923
924 while True:
925 self._GetToken()
926
927 if self.token_type == Id.Right_ExtGlob:
928 if not read_word:
929 arms.append(CompoundWord([]))
930 right_token = self.cur_token
931 break
932
933 elif self.token_type == Id.Op_Pipe:
934 if not read_word:
935 arms.append(CompoundWord([]))
936 read_word = False
937 self._SetNext(lex_mode_e.ExtGlob)
938
939 # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
940 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
941 Kind.ExtGlob):
942 w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
943 arms.append(w)
944 read_word = True
945
946 elif self.token_kind == Kind.Eof:
947 p_die('Unexpected EOF reading extended glob that began here',
948 left_token)
949
950 else:
951 raise AssertionError(self.cur_token)
952
953 return word_part.ExtGlob(left_token, arms, right_token)
954
955 def _ReadBashRegexGroup(self):
956 # type: () -> word_part.BashRegexGroup
957 """
958 Grammar:
959 BashRegexGroup = '(' WORD? ')'
960 """
961 left_token = self.cur_token
962 assert left_token.id == Id.BashRegex_LParen, left_token
963
964 right_token = None # type: Token
965 arms = [] # type: List[CompoundWord]
966
967 self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
968 self._SetNext(lex_mode_e.BashRegexFakeInner) # advance past LEFT
969
970 self._GetToken()
971 if self.token_type == Id.Right_BashRegexGroup: # empty ()
972 return word_part.BashRegexGroup(left_token, None, self.cur_token)
973
974 # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
975 if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
976 # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
977 # To allow bash style [[ s =~ (a b) ]]
978 w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
979 arms.append(w)
980
981 self._GetToken()
982 if self.token_type != Id.Right_BashRegexGroup:
983 p_die('Expected ) to close bash regex group', self.cur_token)
984
985 return word_part.BashRegexGroup(left_token, w, self.cur_token)
986
987 p_die('Expected word after ( opening bash regex group', self.cur_token)
988
989 def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
990 # type: (Optional[Token], bool, List[word_part_t]) -> None
991 """
992 Args:
993 left_token: A token if we are reading a double quoted part, or None if
994 we're reading a here doc.
995 is_ysh_expr: Whether to disallow backticks and invalid char escapes
996 out_parts: list of word_part to append to
997 """
998 if left_token:
999 if left_token.id in (Id.Left_TDoubleQuote,
1000 Id.Left_DollarTDoubleQuote):
1001 expected_end_tokens = 3
1002 else:
1003 expected_end_tokens = 1
1004 else:
1005 expected_end_tokens = 1000 # here doc will break
1006
1007 num_end_tokens = 0
1008 while num_end_tokens < expected_end_tokens:
1009 self._SetNext(lex_mode_e.DQ)
1010 self._GetToken()
1011
1012 if self.token_kind == Kind.Lit:
1013 if self.token_type == Id.Lit_EscapedChar:
1014 tok = self.cur_token
1015 ch = lexer.TokenSliceLeft(tok, 1)
1016 part = word_part.EscapedLiteral(tok,
1017 ch) # type: word_part_t
1018 else:
1019 if self.token_type == Id.Lit_BadBackslash:
1020 # echo "\z" is OK in shell, but x = "\z" is a syntax error in
1021 # YSH.
1022 # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
1023 # recursion (unless parse_backslash)
1024 if (is_ysh_expr or
1025 not self.parse_opts.parse_backslash()):
1026 p_die(
1027 "Invalid char escape in double quoted string (OILS-ERR-12)",
1028 self.cur_token)
1029 elif self.token_type == Id.Lit_Dollar:
1030 if is_ysh_expr or not self.parse_opts.parse_dollar():
1031 p_die("Literal $ should be quoted like \$",
1032 self.cur_token)
1033
1034 part = self.cur_token
1035 out_parts.append(part)
1036
1037 elif self.token_kind == Kind.Left:
1038 if self.token_type == Id.Left_Backtick and is_ysh_expr:
1039 p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
1040 self.cur_token)
1041
1042 part = self._ReadDoubleQuotedLeftParts()
1043 out_parts.append(part)
1044
1045 elif self.token_kind == Kind.VSub:
1046 tok = self.cur_token
1047 part = SimpleVarSub(tok)
1048 out_parts.append(part)
1049 # NOTE: parsing "$f(x)" would BREAK CODE. Could add a mode for it
1050 # later.
1051
1052 elif self.token_kind == Kind.Right:
1053 assert self.token_type == Id.Right_DoubleQuote, self.token_type
1054 if left_token:
1055 num_end_tokens += 1
1056
1057 # In a here doc, the right quote is literal!
1058 out_parts.append(self.cur_token)
1059
1060 elif self.token_kind == Kind.Eof:
1061 if left_token:
1062 p_die(
1063 'Unexpected EOF reading double-quoted string that began here',
1064 left_token)
1065 else: # here docs will have an EOF in their token stream
1066 break
1067
1068 else:
1069 raise AssertionError(self.cur_token)
1070
1071 if self.token_kind != Kind.Right:
1072 num_end_tokens = 0 # """ must be CONSECUTIVE
1073
1074 if expected_end_tokens == 1:
1075 out_parts.pop()
1076 elif expected_end_tokens == 3:
1077 out_parts.pop()
1078 out_parts.pop()
1079 out_parts.pop()
1080
1081 # Remove space from """ in both expression mode and command mode
1082 if (left_token and left_token.id
1083 in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
1084 word_compile.RemoveLeadingSpaceDQ(out_parts)
1085
1086 # Return nothing, since we appended to 'out_parts'
1087
1088 def _ReadDoubleQuoted(self, left_token):
1089 # type: (Token) -> DoubleQuoted
1090 """Helper function for "hello $name".
1091
1092 Args:
1093 eof_type: for stopping at }, Id.Lit_RBrace
1094 here_doc: Whether we are reading in a here doc context
1095
1096 Also ${foo%%a b c} # treat this as double quoted. until you hit
1097 """
1098 parts = [] # type: List[word_part_t]
1099 self._ReadLikeDQ(left_token, False, parts)
1100
1101 right_quote = self.cur_token
1102 return DoubleQuoted(left_token, parts, right_quote)
1103
1104 def ReadDoubleQuoted(self, left_token, parts):
1105 # type: (Token, List[word_part_t]) -> Token
1106 """For expression mode.
1107
1108 Read var x = "${dir:-}/$name"; etc.
1109 """
1110 self._ReadLikeDQ(left_token, True, parts)
1111 return self.cur_token
1112
1113 def _ReadCommandSub(self, left_id, d_quoted=False):
1114 # type: (Id_t, bool) -> CommandSub
1115 """
1116 NOTE: This is not in the grammar, because word parts aren't in the grammar!
1117
1118 command_sub = '$(' command_list ')'
1119 | '@(' command_list ')'
1120 | '<(' command_list ')'
1121 | '>(' command_list ')'
1122 | ` command_list `
1123 """
1124 left_token = self.cur_token
1125
1126 # Set the lexer in a state so ) becomes the EOF token.
1127 if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
1128 Id.Left_ProcSubOut):
1129 self._SetNext(lex_mode_e.ShCommand) # advance past $( etc.
1130
1131 right_id = Id.Eof_RParen
1132 self.lexer.PushHint(Id.Op_RParen, right_id)
1133 c_parser = self.parse_ctx.MakeParserForCommandSub(
1134 self.line_reader, self.lexer, right_id)
1135 # NOTE: This doesn't use something like main_loop because we don't want
1136 # to interleave parsing and execution! Unlike 'source' and 'eval'.
1137 node = c_parser.ParseCommandSub()
1138
1139 right_token = c_parser.w_parser.cur_token
1140
1141 elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
1142 # NOTE: This is an APPROXIMATE solution for translation ONLY. See
1143 # test/osh2oil.
1144
1145 right_id = Id.Eof_Backtick
1146 self.lexer.PushHint(Id.Left_Backtick, right_id)
1147 c_parser = self.parse_ctx.MakeParserForCommandSub(
1148 self.line_reader, self.lexer, right_id)
1149 node = c_parser.ParseCommandSub()
1150 right_token = c_parser.w_parser.cur_token
1151
1152 elif left_id == Id.Left_Backtick:
1153 if not self.parse_opts.parse_backticks():
1154 p_die('Use $(cmd) instead of backticks (parse_backticks)',
1155 left_token)
1156
1157 self._SetNext(lex_mode_e.Backtick) # advance past `
1158
1159 parts = [] # type: List[str]
1160 while True:
1161 self._GetToken()
1162 #log("TOK %s", self.cur_token)
1163
1164 if self.token_type == Id.Backtick_Quoted:
1165 # Remove leading \
1166 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1167
1168 elif self.token_type == Id.Backtick_DoubleQuote:
1169 # Compatibility: If backticks are double quoted, then double quotes
1170 # within them have to be \"
1171 # Shells aren't smart enough to match nested " and ` quotes (but OSH
1172 # is)
1173 if d_quoted:
1174 # Remove leading \
1175 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1176 else:
1177 parts.append(lexer.TokenVal(self.cur_token))
1178
1179 elif self.token_type == Id.Backtick_Other:
1180 parts.append(lexer.TokenVal(self.cur_token))
1181
1182 elif self.token_type == Id.Backtick_Right:
1183 break
1184
1185 elif self.token_type == Id.Eof_Real:
1186 # Note: this parse error is in the ORIGINAL context. No code_str yet.
1187 p_die('Unexpected EOF while looking for closing backtick',
1188 left_token)
1189
1190 else:
1191 raise AssertionError(self.cur_token)
1192
1193 self._SetNext(lex_mode_e.Backtick)
1194
1195 # Calculate right SPID on CommandSub BEFORE re-parsing.
1196 right_token = self.cur_token
1197
1198 code_str = ''.join(parts)
1199 #log('code %r', code_str)
1200
1201 # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py. It
1202 # won't have the same location info as MakeParserForCommandSub(), because
1203 # the lexer is different.
1204 arena = self.parse_ctx.arena
1205 #arena = alloc.Arena()
1206 line_reader = reader.StringLineReader(code_str, arena)
1207 c_parser = self.parse_ctx.MakeOshParser(line_reader)
1208 src = source.Reparsed('backticks', left_token, right_token)
1209 with alloc.ctx_SourceCode(arena, src):
1210 node = c_parser.ParseCommandSub()
1211
1212 else:
1213 raise AssertionError(left_id)
1214
1215 return CommandSub(left_token, node, right_token)
1216
1217 def _ReadExprSub(self, lex_mode):
1218 # type: (lex_mode_t) -> word_part.ExprSub
1219 """$[d->key] $[obj.method()] etc."""
1220 left_token = self.cur_token
1221
1222 self._SetNext(lex_mode_e.Expr)
1223 enode, right_token = self.parse_ctx.ParseYshExpr(
1224 self.lexer, grammar_nt.ysh_expr_sub)
1225
1226 self._SetNext(lex_mode) # Move past ]
1227 return word_part.ExprSub(left_token, enode, right_token)
1228
1229 def ParseVarDecl(self, kw_token):
1230 # type: (Token) -> command.VarDecl
1231 """
1232 oil_var_decl: name_type_list '=' testlist end_stmt
1233
1234 Note that assignments must end with \n ; } or EOF. Unlike shell
1235 assignments, we disallow:
1236
1237 var x = 42 | wc -l
1238 var x = 42 && echo hi
1239 """
1240 self._SetNext(lex_mode_e.Expr)
1241 enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
1242 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1243 # wants
1244 if last_token.id == Id.Op_RBrace:
1245 last_token.id = Id.Lit_RBrace
1246
1247 # Let the CommandParser see the Op_Semi or Op_Newline.
1248 self.buffered_word = last_token
1249 self._SetNext(lex_mode_e.ShCommand) # always back to this
1250 return enode
1251
1252 def ParseMutation(self, kw_token, var_checker):
1253 # type: (Token, VarChecker) -> command.Mutation
1254 """
1255 setvar i = 42
1256 setvar i += 1
1257 setvar a[i] = 42
1258 setvar a[i] += 1
1259 setvar d.key = 42
1260 setvar d.key += 1
1261 """
1262 self._SetNext(lex_mode_e.Expr)
1263 enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
1264 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1265 # wants
1266 if last_token.id == Id.Op_RBrace:
1267 last_token.id = Id.Lit_RBrace
1268
1269 for lhs in enode.lhs:
1270 UP_lhs = lhs
1271 with tagswitch(lhs) as case:
1272 if case(y_lhs_e.Var):
1273 lhs = cast(Token, UP_lhs)
1274 var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
1275
1276 # Note: this does not cover cases like
1277 # setvar (a[0])[1] = v
1278 # setvar (d.key).other = v
1279 # This leaks into catching all typos statically, which may be
1280 # possible if 'use' makes all names explicit.
1281 elif case(y_lhs_e.Subscript):
1282 lhs = cast(Subscript, UP_lhs)
1283 if lhs.obj.tag() == expr_e.Var:
1284 v = cast(expr.Var, lhs.obj)
1285 var_checker.Check(kw_token.id, v.name, v.left)
1286
1287 elif case(y_lhs_e.Attribute):
1288 lhs = cast(Attribute, UP_lhs)
1289 if lhs.obj.tag() == expr_e.Var:
1290 v = cast(expr.Var, lhs.obj)
1291 var_checker.Check(kw_token.id, v.name, v.left)
1292
1293 # Let the CommandParser see the Op_Semi or Op_Newline.
1294 self.buffered_word = last_token
1295 self._SetNext(lex_mode_e.ShCommand) # always back to this
1296 return enode
1297
1298 def ParseBareDecl(self):
1299 # type: () -> expr_t
1300 """
1301 x = {name: val}
1302 """
1303 self._SetNext(lex_mode_e.Expr)
1304 self._GetToken()
1305 enode, last_token = self.parse_ctx.ParseYshExpr(
1306 self.lexer, grammar_nt.command_expr)
1307 if last_token.id == Id.Op_RBrace:
1308 last_token.id = Id.Lit_RBrace
1309 self.buffered_word = last_token
1310 self._SetNext(lex_mode_e.ShCommand)
1311 return enode
1312
1313 def ParseYshExprForCommand(self):
1314 # type: () -> expr_t
1315
1316 # Fudge for this case
1317 # for x in(y) {
1318 # versus
1319 # for x in (y) {
1320 #
1321 # In the former case, ReadWord on 'in' puts the lexer past (.
1322 # Also see LookPastSpace in CommandParser.
1323 # A simpler solution would be nicer.
1324
1325 if self.token_type == Id.Op_LParen:
1326 self.lexer.MaybeUnreadOne()
1327
1328 enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
1329
1330 self._SetNext(lex_mode_e.ShCommand)
1331 return enode
1332
1333 def ParseCommandExpr(self):
1334 # type: () -> expr_t
1335 """
1336 = 1+2
1337 """
1338 enode, last_token = self.parse_ctx.ParseYshExpr(
1339 self.lexer, grammar_nt.command_expr)
1340
1341 # In some cases, such as the case statement, we expect *the lexer* to be
1342 # pointing at the token right after the expression. But the expression
1343 # parser must have read to the `last_token`. Unreading places the lexer
1344 # back in the expected state. Ie:
1345 #
1346 # case (x) { case (x) {
1347 # (else) { = x } (else) { = x }
1348 # ^ The lexer is here ^ Unread to here
1349 # } }
1350 assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
1351 Id.Op_RBrace), last_token
1352 if last_token.id != Id.Eof_Real:
1353 # Eof_Real is the only token we cannot unread
1354 self.lexer.MaybeUnreadOne()
1355
1356 return enode
1357
1358 def ParseProc(self, node):
1359 # type: (Proc) -> None
1360
1361 # proc name-with-hyphens() must be accepted
1362 self._SetNext(lex_mode_e.ShCommand)
1363 self._GetToken()
1364 # example: 'proc f[' gets you Lit_ArrayLhsOpen
1365 if self.token_type != Id.Lit_Chars:
1366 p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
1367 self.cur_token)
1368
1369 # TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
1370 # for shell functions. Similar to IsValidVarName().
1371 node.name = self.cur_token
1372
1373 last_token = self.parse_ctx.ParseProc(self.lexer, node)
1374
1375 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1376 assert last_token.id == Id.Op_LBrace
1377 last_token.id = Id.Lit_LBrace
1378 self.buffered_word = last_token
1379
1380 self._SetNext(lex_mode_e.ShCommand)
1381
1382 def ParseFunc(self, node):
1383 # type: (Func) -> None
1384 last_token = self.parse_ctx.ParseFunc(self.lexer, node)
1385
1386 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1387 assert last_token.id == Id.Op_LBrace
1388 last_token.id = Id.Lit_LBrace
1389 self.buffered_word = last_token
1390
1391 self._SetNext(lex_mode_e.ShCommand)
1392
1393 def ParseYshCasePattern(self):
1394 # type: () -> Tuple[pat_t, Token]
1395 pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
1396 self.lexer)
1397
1398 if last_token.id == Id.Op_LBrace:
1399 last_token.id = Id.Lit_LBrace
1400 self.buffered_word = last_token
1401
1402 return pat, left_tok
1403
1404 def NewlineOkForYshCase(self):
1405 # type: () -> Id_t
1406 """Check for optional newline and consume it.
1407
1408 This is a special case of `_NewlineOk` which fixes some "off-by-one" issues
1409 that crop up while parsing YSH case arms. For more details, see
1410 #oil-dev > Progress On YSH Case Grammar on Zulip.
1411
1412 Returns a token id indicating which of these alternatives comes next:
1413
1414 word { echo word }
1415 (3) { echo expr }
1416 /e/ { echo eggex }
1417 } # right brace
1418 """
1419 while True:
1420 next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
1421
1422 # Cannot lookahead past lines
1423 if next_id == Id.Unknown_Tok:
1424 self.lexer.MoveToNextLine()
1425 continue
1426
1427 next_kind = consts.GetKind(next_id)
1428 if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
1429 break
1430
1431 self.lexer.Read(lex_mode_e.Expr)
1432
1433 if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
1434 self._SetNext(lex_mode_e.Expr) # Continue in expression mode
1435 else:
1436 # Consume the trailing Op_Newline
1437 self._SetNext(lex_mode_e.ShCommand)
1438 self._GetToken()
1439
1440 return next_id
1441
1442 def _ReadArithExpr(self, end_id):
1443 # type: (Id_t) -> arith_expr_t
1444 """Read and parse an arithmetic expression in various contexts.
1445
1446 $(( 1+2 ))
1447 (( a=1+2 ))
1448 ${a[ 1+2 ]}
1449 ${a : 1+2 : 1+2}
1450
1451 See tests/arith-context.test.sh for ambiguous cases.
1452
1453 ${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
1454
1455 ${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
1456
1457 See the assertion in ArithParser.Parse() -- unexpected extra input.
1458 """
1459 # calls self.ReadWord(lex_mode_e.Arith)
1460 anode = self.a_parser.Parse()
1461 cur_id = self.a_parser.CurrentId()
1462 if end_id != Id.Undefined_Tok and cur_id != end_id:
1463 p_die(
1464 'Unexpected token after arithmetic expression (%s != %s)' %
1465 (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
1466 loc.Word(self.a_parser.cur_word))
1467 return anode
1468
1469 def _ReadArithSub(self):
1470 # type: () -> word_part.ArithSub
1471 """Read an arith substitution, which contains an arith expression, e.g.
1472
1473 $((a + 1)).
1474 """
1475 left_tok = self.cur_token
1476
1477 # The second one needs to be disambiguated in stuff like:
1478 # $(echo $(( 1+2 )) )
1479 self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
1480
1481 # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
1482 # could save the lexer/reader state here, and retry if the arithmetic parse
1483 # fails. But we can almost always catch this at parse time. There could
1484 # be some exceptions like:
1485 # $((echo * foo)) # looks like multiplication
1486 # $((echo / foo)) # looks like division
1487
1488 # $(( )) is valid
1489 anode = arith_expr.EmptyZero # type: arith_expr_t
1490
1491 self._NextNonSpace()
1492 if self.token_type != Id.Arith_RParen:
1493 anode = self._ReadArithExpr(Id.Arith_RParen)
1494
1495 self._SetNext(lex_mode_e.ShCommand)
1496
1497 # Ensure we get closing )
1498 self._GetToken()
1499 if self.token_type != Id.Right_DollarDParen:
1500 p_die('Expected second ) to end arith sub', self.cur_token)
1501
1502 right_tok = self.cur_token
1503 return word_part.ArithSub(left_tok, anode, right_tok)
1504
1505 def ReadDParen(self):
1506 # type: () -> Tuple[arith_expr_t, Token]
1507 """Read ((1+ 2)) -- command context.
1508
1509 We're using the word parser because it's very similar to _ReadArithExpr
1510 above.
1511
1512 This also returns the terminating Id.Op_DRightParen token for location
1513 info.
1514 """
1515 # (( )) is valid
1516 anode = arith_expr.EmptyZero # type: arith_expr_t
1517
1518 self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
1519
1520 self._NextNonSpace()
1521 if self.token_type != Id.Arith_RParen:
1522 anode = self._ReadArithExpr(Id.Arith_RParen)
1523
1524 self._SetNext(lex_mode_e.ShCommand)
1525
1526 # Ensure we get the second )
1527 self._GetToken()
1528 right = self.cur_token
1529 if right.id != Id.Op_DRightParen:
1530 p_die('Expected second ) to end arith statement', right)
1531
1532 self._SetNext(lex_mode_e.ShCommand)
1533
1534 return anode, right
1535
1536 def _NextNonSpace(self):
1537 # type: () -> None
1538 """Advance in lex_mode_e.Arith until non-space token.
1539
1540 Same logic as _ReadWord, but used in
1541 $(( ))
1542 (( ))
1543 for (( ))
1544
1545 You can read self.token_type after this, without calling _GetToken.
1546 """
1547 while True:
1548 self._SetNext(lex_mode_e.Arith)
1549 self._GetToken()
1550 if self.token_kind not in (Kind.Ignored, Kind.WS):
1551 break
1552
1553 def ReadForExpression(self):
1554 # type: () -> command.ForExpr
1555 """Read ((i=0; i<5; ++i)) -- part of command context."""
1556 self._NextNonSpace() # skip over ((
1557 cur_id = self.token_type # for end of arith expressions
1558
1559 if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
1560 init_node = arith_expr.EmptyZero # type: arith_expr_t
1561 else:
1562 init_node = self.a_parser.Parse()
1563 cur_id = self.a_parser.CurrentId()
1564 self._NextNonSpace()
1565
1566 # It's odd to keep track of both cur_id and self.token_type in this
1567 # function, but it works, and is tested in 'test/parse_error.sh
1568 # arith-integration'
1569 if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
1570 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1571
1572 self._GetToken()
1573 cur_id = self.token_type
1574
1575 if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
1576 # empty condition is TRUE
1577 cond_node = arith_expr.EmptyOne # type: arith_expr_t
1578 else:
1579 cond_node = self.a_parser.Parse()
1580 cur_id = self.a_parser.CurrentId()
1581
1582 if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
1583 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1584
1585 self._NextNonSpace()
1586 if self.token_type == Id.Arith_RParen: # for (( ; ; ))
1587 update_node = arith_expr.EmptyZero # type: arith_expr_t
1588 else:
1589 update_node = self._ReadArithExpr(Id.Arith_RParen)
1590
1591 self._NextNonSpace()
1592 if self.token_type != Id.Arith_RParen:
1593 p_die('Expected ) to end for loop expression', self.cur_token)
1594 self._SetNext(lex_mode_e.ShCommand)
1595
1596 # redirects is None, will be assigned in CommandEvaluator
1597 node = command.ForExpr.CreateNull()
1598 node.init = init_node
1599 node.cond = cond_node
1600 node.update = update_node
1601 return node
1602
1603 def _ReadArrayLiteral(self):
1604 # type: () -> word_part_t
1605 """a=(1 2 3)
1606
1607 TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1608
1609 We want:
1610
1611 A=(['x']=1 ["x"]=2 [$x$y]=3)
1612
1613 Maybe allow this as a literal string? Because I think I've seen it before?
1614 Or maybe force people to patch their scripts, to learn the rule.
1615
1616 A=([x]=4)
1617
1618 Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1619 Maybe enforce that ALL have keys or NONE have keys.
1620 """
1621 self._SetNext(lex_mode_e.ShCommand) # advance past (
1622 self._GetToken()
1623 if self.cur_token.id != Id.Op_LParen:
1624 p_die('Expected ( after =', self.cur_token)
1625 left_token = self.cur_token
1626 right_token = None # type: Token
1627
1628 # MUST use a new word parser (with same lexer).
1629 w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1630 words = [] # type: List[CompoundWord]
1631 done = False
1632 while not done:
1633 w = w_parser.ReadWord(lex_mode_e.ShCommand)
1634 with tagswitch(w) as case:
1635 if case(word_e.Operator):
1636 tok = cast(Token, w)
1637 if tok.id == Id.Right_ShArrayLiteral:
1638 right_token = tok
1639 done = True # can't use break here
1640 # Unlike command parsing, array parsing allows embedded \n.
1641 elif tok.id == Id.Op_Newline:
1642 continue
1643 else:
1644 p_die('Unexpected token in array literal', loc.Word(w))
1645
1646 elif case(word_e.Compound):
1647 words.append(cast(CompoundWord, w))
1648
1649 else:
1650 raise AssertionError()
1651
1652 if len(words) == 0: # a=() is empty indexed array
1653 # Needed for type safety, doh
1654 no_words = [] # type: List[word_t]
1655 node = ShArrayLiteral(left_token, no_words, right_token)
1656 return node
1657
1658 pairs = [] # type: List[AssocPair]
1659 # If the first one is a key/value pair, then the rest are assumed to be.
1660 pair = word_.DetectAssocPair(words[0])
1661 if pair:
1662 pairs.append(pair)
1663
1664 n = len(words)
1665 for i in xrange(1, n):
1666 w2 = words[i]
1667 pair = word_.DetectAssocPair(w2)
1668 if not pair:
1669 p_die("Expected associative array pair", loc.Word(w2))
1670
1671 pairs.append(pair)
1672
1673 # invariant List?
1674 return word_part.BashAssocLiteral(left_token, pairs, right_token)
1675
1676 # Brace detection for arrays but NOT associative arrays
1677 words2 = braces.BraceDetectAll(words)
1678 words3 = word_.TildeDetectAll(words2)
1679 return ShArrayLiteral(left_token, words3, right_token)
1680
1681 def ParseProcCallArgs(self, start_symbol):
1682 # type: (int) -> ArgList
1683 """ json write (x) """
1684 self.lexer.MaybeUnreadOne()
1685
1686 arg_list = ArgList.CreateNull(alloc_lists=True)
1687 arg_list.left = self.cur_token
1688 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1689 return arg_list
1690
1691 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1692 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1693 """Helper for _ReadCompoundWord3."""
1694 done = False
1695
1696 if self.token_type == Id.Lit_EscapedChar:
1697 tok = self.cur_token
1698 assert tok.length == 2
1699 ch = lexer.TokenSliceLeft(tok, 1)
1700 if not self.parse_opts.parse_backslash():
1701 if not pyutil.IsValidCharEscape(ch):
1702 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1703 self.cur_token)
1704
1705 part = word_part.EscapedLiteral(self.cur_token,
1706 ch) # type: word_part_t
1707 else:
1708 part = self.cur_token
1709
1710 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1711 parts.append(part)
1712 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1713 # _ReadWord.
1714 next_id = self.lexer.LookPastSpace(lex_mode)
1715 if next_id == Id.Op_LParen:
1716 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1717 part2 = self._ReadArrayLiteral()
1718 parts.append(part2)
1719
1720 # Array literal must be the last part of the word.
1721 self._SetNext(lex_mode)
1722 self._GetToken()
1723 # EOF, whitespace, newline, Right_Subshell
1724 if self.token_kind not in KINDS_THAT_END_WORDS:
1725 p_die('Unexpected token after array literal',
1726 self.cur_token)
1727 done = True
1728
1729 elif (is_first and self.parse_opts.parse_at() and
1730 self.token_type == Id.Lit_Splice):
1731
1732 splice_tok = self.cur_token
1733 part2 = word_part.Splice(splice_tok,
1734 lexer.TokenSliceLeft(splice_tok, 1))
1735
1736 parts.append(part2)
1737
1738 # @words must be the last part of the word
1739 self._SetNext(lex_mode)
1740 self._GetToken()
1741 # EOF, whitespace, newline, Right_Subshell
1742 if self.token_kind not in KINDS_THAT_END_WORDS:
1743 p_die('Unexpected token after array splice', self.cur_token)
1744 done = True
1745
1746 elif (is_first and self.parse_opts.parse_at() and
1747 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1748 part2 = self._ReadExprSub(lex_mode_e.DQ)
1749 parts.append(part2)
1750
1751 # @[split(x)]
1752 self._SetNext(lex_mode)
1753 self._GetToken()
1754 # EOF, whitespace, newline, Right_Subshell
1755 if self.token_kind not in KINDS_THAT_END_WORDS:
1756 p_die('Unexpected token after Expr splice', self.cur_token)
1757 done = True
1758
1759 elif (is_first and self.parse_opts.parse_at() and
1760 self.token_type == Id.Lit_AtLBraceDot):
1761 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1762
1763 elif (is_first and self.parse_opts.parse_at_all() and
1764 self.token_type == Id.Lit_At):
1765 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1766 # at the beginning of a word to be reserved.
1767
# Although, should we relax 'echo @'?  A shortcut for @_argv is
# tempting.
1770 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1771 self.cur_token)
1772
1773 else:
1774 # not a literal with lookahead; append it
1775 parts.append(part)
1776
1777 return done
1778
1779 def _ReadCompoundWord(self, lex_mode):
1780 # type: (lex_mode_t) -> CompoundWord
1781 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1782
1783 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1784 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1785 """
1786 Precondition: Looking at the first token of the first word part
1787 Postcondition: Looking at the token after, e.g. space or operator
1788
1789 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1790 could be an operator delimiting a compound word. Can we change lexer modes
1791 and remove this special case?
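
Example (illustrative): parsing the single shell word pre$x"mid"post
yields a CompoundWord whose parts are roughly Lit_Chars 'pre',
SimpleVarSub $x, DoubleQuoted 'mid', Lit_Chars 'post'.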
1792 """
1793 w = CompoundWord([])
1794 num_parts = 0
1795 brace_count = 0
1796 done = False
1797 is_triple_quoted = None # type: Optional[BoolParamBox]
1798
1799 while not done:
1800 self._GetToken()
1801
1802 allow_done = empty_ok or num_parts != 0
1803 if allow_done and self.token_type == eof_type:
1804 done = True # e.g. for ${foo//pat/replace}
1805
1806 # Keywords like "for" are treated like literals
1807 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1808 Kind.ControlFlow, Kind.BoolUnary,
1809 Kind.BoolBinary):
1810
# Count { and } so unbalanced braces can be reported after the word is read
1812 if self.token_type == Id.Lit_LBrace:
1813 brace_count += 1
1814 elif self.token_type == Id.Lit_RBrace:
1815 brace_count -= 1
1816 elif self.token_type == Id.Lit_Dollar:
1817 if not self.parse_opts.parse_dollar():
1818 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1819 next_byte = self.lexer.ByteLookAhead()
1820 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1821 if next_byte == '/':
1822 #log('next_byte %r', next_byte)
1823 pass
1824
1825 p_die('Literal $ should be quoted like \$',
1826 self.cur_token)
1827
1828 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1829 w.parts)
1830
1831 elif self.token_kind == Kind.VSub:
1832 vsub_token = self.cur_token
1833
1834 part = SimpleVarSub(vsub_token) # type: word_part_t
1835 w.parts.append(part)
1836
1837 elif self.token_kind == Kind.ExtGlob:
1838 # If parse_at, we can take over @( to start @(seq 3)
# Users can also write ,(*.py|*.sh)
1840 if (self.parse_opts.parse_at() and
1841 self.token_type == Id.ExtGlob_At and num_parts == 0):
1842 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1843 d_quoted=False)
1844 # RARE mutation of tok.id!
1845 cs_part.left_token.id = Id.Left_AtParen
1846 part = cs_part # for type safety
1847
1848 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1849 # a=(one two)x and @arrayfunc(3)x.
1850 self._GetToken()
1851 if self.token_kind not in KINDS_THAT_END_WORDS:
1852 p_die('Unexpected token after @()', self.cur_token)
1853 done = True
1854
1855 else:
1856 part = self._ReadExtGlob()
1857 w.parts.append(part)
1858
1859 elif self.token_kind == Kind.BashRegex:
1860 if self.token_type == Id.BashRegex_LParen: # Opening (
1861 part = self._ReadBashRegexGroup()
1862 w.parts.append(part)
1863 else:
1864 assert self.token_type == Id.BashRegex_AllowedInParens
1865 p_die('Invalid token in bash regex', self.cur_token)
1866
1867 elif self.token_kind == Kind.Left:
1868 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1869 lex_mode == lex_mode_e.ShCommand and
1870 num_parts == 0)
1871
1872 # Save allocation
1873 if try_triple_quote:
1874 is_triple_quoted = BoolParamBox(False)
1875
1876 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1877 w.parts.append(part)
1878
1879 # NOT done yet, will advance below
1880 elif self.token_kind == Kind.Right:
1881 # Still part of the word; will be done on the next iter.
1882 if self.token_type == Id.Right_DoubleQuote:
1883 pass
1884 # Never happens, no PushHint for this case.
1885 #elif self.token_type == Id.Right_DollarParen:
1886 # pass
1887 elif self.token_type == Id.Right_Subshell:
1888 # LEXER HACK for (case x in x) ;; esac )
1889 # Rewind before it's used
1890 assert self.next_lex_mode == lex_mode_e.Undefined
1891 if self.lexer.MaybeUnreadOne():
1892 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1893 self._SetNext(lex_mode)
1894 done = True
1895 else:
1896 done = True
1897
1898 elif self.token_kind == Kind.Ignored:
1899 done = True
1900
1901 else:
1902 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1903 # so to test for ESAC, we can read ) before getting a chance to
1904 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1905 # token and do it again.
1906
1907 # We get Id.Op_RParen at top level: case x in x) ;; esac
1908 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1909 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1910 # Rewind before it's used
1911 assert self.next_lex_mode == lex_mode_e.Undefined
1912 if self.lexer.MaybeUnreadOne():
1913 if self.token_type == Id.Eof_RParen:
1914 # Redo translation
1915 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1916 self._SetNext(lex_mode)
1917
1918 done = True # anything we don't recognize means we're done
1919
1920 if not done:
1921 self._SetNext(lex_mode)
1922 num_parts += 1
1923
1924 if (self.parse_opts.parse_brace() and num_parts > 1 and
1925 brace_count != 0):
1926 # accept { and }, but not foo{
1927 p_die(
1928 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1929 loc.Word(w))
1930
1931 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1932 p_die('Unexpected parts after triple quoted string',
1933 loc.WordPart(w.parts[-1]))
1934
1935 if 0:
1936 from _devbuild.gen.syntax_asdl import word_part_str
1937 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1938 WORD_HIST[word_key] += 1
1939 return w
1940
1941 def _ReadArithWord(self):
1942 # type: () -> Optional[word_t]
1943 """ Helper for ReadArithWord() """
1944 self._GetToken()
1945
1946 if self.token_kind == Kind.Unknown:
1947 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1948 p_die(
1949 'Unexpected token while parsing arithmetic: %r' %
1950 lexer.TokenVal(self.cur_token), self.cur_token)
1951
1952 elif self.token_kind == Kind.Eof:
1953 return self.cur_token
1954
1955 elif self.token_kind == Kind.Ignored:
1956 # Space should be ignored.
1957 self._SetNext(lex_mode_e.Arith)
1958 return None
1959
1960 elif self.token_kind in (Kind.Arith, Kind.Right):
1961 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1962 self._SetNext(lex_mode_e.Arith)
1963 return self.cur_token
1964
1965 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1966 return self._ReadCompoundWord(lex_mode_e.Arith)
1967
1968 else:
1969 raise AssertionError(self.cur_token)
1970
1971 def _ReadWord(self, word_mode):
1972 # type: (lex_mode_t) -> Optional[word_t]
1973 """Helper function for ReadWord()."""
1974
1975 # Change the pseudo lexer mode to a real lexer mode
1976 if word_mode == lex_mode_e.ShCommandFakeBrack:
1977 lex_mode = lex_mode_e.ShCommand
1978 else:
1979 lex_mode = word_mode
1980
1981 self._GetToken()
1982
1983 if self.token_kind == Kind.Eof:
1984 # No advance
1985 return self.cur_token
1986
1987 # Allow Arith for ) at end of for loop?
1988 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1989 self._SetNext(lex_mode)
1990
1991 # Newlines are complicated. See 3x2 matrix in the comment about
1992 # self.multiline and self.newline_state above.
1993 if self.token_type == Id.Op_Newline:
1994 if self.multiline:
1995 if self.newline_state > 1:
1996 # This points at a blank line, but at least it gives the line number
1997 p_die('Invalid blank line in multiline mode',
1998 self.cur_token)
1999 return None
2000
2001 if self.returned_newline: # skip
2002 return None
2003
2004 return self.cur_token
2005
2006 elif self.token_kind == Kind.Right:
2007 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2008 Id.Right_CasePat,
2009 Id.Right_ShArrayLiteral):
2010 raise AssertionError(self.cur_token)
2011
2012 self._SetNext(lex_mode)
2013 return self.cur_token
2014
2015 elif self.token_kind in (Kind.Ignored, Kind.WS):
2016 self._SetNext(lex_mode)
2017 return None
2018
2019 else:
2020 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2021 Kind.Left, Kind.KW, Kind.ControlFlow,
2022 Kind.BoolUnary, Kind.BoolBinary,
2023 Kind.ExtGlob,
2024 Kind.BashRegex), 'Unhandled token kind'
2025
2026 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2027 self.parse_opts.parse_bracket() and
2028 self.token_type == Id.Lit_LBracket):
2029 # Change [ from Kind.Lit -> Kind.Op
2030 # So CommandParser can treat
2031 # assert [42 === x]
2032 # like
2033 # json write (x)
2034 bracket_word = self.cur_token
2035 bracket_word.id = Id.Op_LBracket
2036
2037 self._SetNext(lex_mode)
2038 return bracket_word
2039
2040 # We're beginning a word. If we see Id.Lit_Pound, change to
2041 # lex_mode_e.Comment and read until end of line.
2042 if self.token_type == Id.Lit_Pound:
2043 self._SetNext(lex_mode_e.Comment)
2044 self._GetToken()
2045
2046 # NOTE: The # could be the last character in the file. It can't be
2047 # Eof_{RParen,Backtick} because #) and #` are comments.
2048 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2049 self.cur_token
2050
2051 # The next iteration will go into Kind.Ignored and set lex state to
2052 # lex_mode_e.ShCommand/etc.
2053 return None # tell ReadWord() to try again after comment
2054
2055 elif self.token_type == Id.Lit_TPound: ### doc comment
2056 self._SetNext(lex_mode_e.Comment)
2057 self._GetToken()
2058
2059 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2060 return self.cur_token
2061
2062 return None # tell ReadWord() to try again after comment
2063
2064 else:
2065 # r'' u'' b''
2066 if (self.token_type == Id.Lit_Chars and
2067 self.lexer.LookAheadOne(
2068 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2069
2070 # When shopt -s parse_raw_string:
2071 # echo r'hi' is like echo 'hi'
2072 #
2073 # echo u'\u{3bc}' b'\yff' works
2074
2075 tok = self.cur_token
2076 if self.parse_opts.parse_ysh_string():
2077 if lexer.TokenEquals(tok, 'r'):
2078 left_id = Id.Left_RSingleQuote
2079 elif lexer.TokenEquals(tok, 'u'):
2080 left_id = Id.Left_USingleQuote
2081 elif lexer.TokenEquals(tok, 'b'):
2082 left_id = Id.Left_BSingleQuote
2083 else:
2084 left_id = Id.Undefined_Tok
2085
2086 if left_id != Id.Undefined_Tok:
2087 # skip the r, and then 'foo' will be read as normal
2088 self._SetNext(lex_mode_e.ShCommand)
2089
2090 self._GetToken()
2091 assert self.token_type == Id.Left_SingleQuote, self.token_type
2092
2093 # Read the word in a different lexer mode
2094 return self._ReadYshSingleQuoted(left_id)
2095
2096 return self._ReadCompoundWord(lex_mode)
2097
2098 def ParseVarRef(self):
2099 # type: () -> BracedVarSub
2100 """DYNAMIC parsing of what's inside ${!ref}
2101
2102 # Same as VarOf production
2103 VarRefExpr = VarOf EOF
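
Example (illustrative): if ref='a[0]', then ${!ref} causes this method
to parse the string a[0], as if ${a[0]} had been written.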
2104 """
2105 self._SetNext(lex_mode_e.VSub_1)
2106
2107 self._GetToken()
2108 if self.token_kind != Kind.VSub:
2109 p_die('Expected var name', self.cur_token)
2110
2111 part = self._ParseVarOf()
2112 # NOTE: no ${ } means no part.left and part.right
2113 part.left = part.token # cheat to make test pass
2114 part.right = part.token
2115
2116 self._GetToken()
2117 if self.token_type != Id.Eof_Real:
2118 p_die('Expected end of var ref expression', self.cur_token)
2119 return part
2120
2121 def LookPastSpace(self):
2122 # type: () -> Id_t
2123 """Look ahead to the next token.
2124
2125 For the CommandParser to recognize
2126 array= (1 2 3)
2127 YSH for ( versus bash for ((
2128 YSH if ( versus if test
2129 YSH while ( versus while test
2130 YSH bare assignment 'grep =' versus 'grep foo'
2131 """
2132 assert self.token_type != Id.Undefined_Tok
2133 if self.cur_token.id == Id.WS_Space:
2134 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2135 else:
2136 id_ = self.cur_token.id
2137 return id_
2138
2139 def LookAheadFuncParens(self):
2140 # type: () -> bool
2141 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2142 assert self.token_type != Id.Undefined_Tok
2143
2144 # We have to handle 2 cases because we buffer a token
2145 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2146 return self.lexer.LookAheadFuncParens(1) # go back one char
2147
2148 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2149 return self.lexer.LookAheadFuncParens(0)
2150
2151 else:
2152 return False
2153
2154 def ReadWord(self, word_mode):
2155 # type: (lex_mode_t) -> word_t
2156 """Read the next word, using the given lexer mode.
2157
2158 This is a stateful wrapper for the stateless _ReadWord function.
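
Usage sketch (illustrative only, not how Oils wires it up; assumes
parse_ctx, lx, and line_reader were constructed elsewhere):

    w_parser = WordParser(parse_ctx, lx, line_reader)
    while True:
        w = w_parser.ReadWord(lex_mode_e.ShCommand)
        if word_.CommandId(w) == Id.Eof_Real:
            break
        # ... consume the word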
2159 """
2160 assert word_mode in (lex_mode_e.ShCommand,
2161 lex_mode_e.ShCommandFakeBrack,
2162 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2163
2164 if self.buffered_word: # For integration with pgen2
2165 w = self.buffered_word
2166 self.buffered_word = None
2167 else:
2168 while True:
2169 w = self._ReadWord(word_mode)
2170 if w is not None:
2171 break
2172
2173 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2174 return w
2175
2176 def ReadArithWord(self):
2177 # type: () -> word_t
2178 while True:
2179 w = self._ReadArithWord()
2180 if w is not None:
2181 break
2182 return w
2183
2184 def ReadHereDocBody(self, parts):
2185 # type: (List[word_part_t]) -> None
2186 """
2187 A here doc is like a double quoted context, except " isn't special.
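
Example (illustrative): in the body of cat <<EOF ... EOF, substitutions
like $x and $(date) still work, but a bare " stays literal.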
2188 """
2189 self._ReadLikeDQ(None, False, parts)
2190 # Returns nothing
2191
2192 def ReadForPlugin(self):
2193 # type: () -> CompoundWord
2194 """For $PS1, $PS4, etc.
2195
2196 This is just like reading a here doc line. "\n" is allowed, as
2197 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2198 """
2199 w = CompoundWord([])
2200 self._ReadLikeDQ(None, False, w.parts)
2201 return w
2202
2203 def EmitDocToken(self, b):
2204 # type: (bool) -> None
2205 self.emit_doc_token = b
2206
2207 def Multiline(self, b):
2208 # type: (bool) -> None
2209 self.multiline = b
2210
2211
2212if 0:
2213 import collections
2214 WORD_HIST = collections.Counter()
2215
2216# vim: sw=4