# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v}   $() ``   $(())   '' ""   $'' $""  <()  >()

lex_mode_e.DQ  (_ReadDoubleQuotedLeftParts)
  Var, Command, and Arith subs, but no quotes.
  $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:-  b   }

  ${X:-$v}   ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a  "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
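
# Example: in `echo foo;`, the word `foo` is terminated by the `;` (Kind.Op);
# a space (Kind.WS), a closing token like ) (Kind.Right), or EOF also ends a
# word.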


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by the interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For ### doc
        # comments.
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...' starts
        # multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the parsers
        # that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
        # valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment, where we turn x= into x=''.  And it
        # has the same potential problem of not having Token location info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
        # return a Compound with no parts, which is explicitly checked with a
        # custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg, and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  #  ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  #  ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # Quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO,
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
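
    # Slice examples handled by _ReadSliceVarOp (a sketch of the cases above):
    #   ${a:1:2}  -> Slice(begin=1, length=2)
    #   ${a:1}    -> Slice(begin=1, length=None)  # to the end
    #   ${a::}    -> begin and length both EmptyZero (the bash quirk noted above)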

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub    = ...
                  | VarOf '/' Match ( '/' WORD? )?
        Match     = '/' WORD   # can't be empty
                  | '#' WORD?  # may be empty
                  | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/}  -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
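
    # PatSub examples, per the grammar above (illustrative):
    #   ${x/pat/repl}   replace the first match
    #   ${x//pat/repl}  replace_mode Lit_Slash: replace all matches
    #   ${x/#pat/repl}  replace_mode Lit_Pound: anchor at the start
    #   ${x/%pat/repl}  replace_mode Lit_Percent: anchor at the end
    #   ${v/a}          same as ${v/a/} -- empty replacement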

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
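
    # Subscript examples (illustrative): ${a[i+1]} produces
    # bracket_op.ArrayIndex with a parsed arithmetic expression, while ${a[@]}
    # and ${a[*]} produce bracket_op.WholeArray.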

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf     = NAME Subscript?
                  | NUMBER      # no subscript allowed, none of these are arrays
                                # ${@[1]} doesn't work, even though slicing does
                  | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now.
                    # However, I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a glob
            # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } is read in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # The token seems off; it doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                    # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER      # no subscript allowed, none of these are arrays
                                  # ${@[1]} doesn't work, even though slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '                 # ${x|html} and ${x %.3f}.
                                                # SPACE is the operator, not %
        Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice: inside subscripts ${a[x+1]} and
          in slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note that
          it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
          grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
          a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, while ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression', self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix; '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys query
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix; '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method must
            # return word_part_t rather than BracedVarSub.  I don't think that
            # should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, or a symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
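
    # Prefix op examples, per the grammar above (illustrative):
    #   ${#x}     LengthExpr -- prefix_op is the '#' token, no suffix ops
    #   ${!ref}   RefOrKeys -- prefix_op is the '!' token
    #   ${!a[@]}  ref vs. keys query is resolved later (or at runtime)
    #   ${x:-d}   plain VarExpr with a Kind.VTest suffix op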

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns the last token.

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char is emitted in lex_mode_e.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous; it should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from '''  r'''  $''' in both expression mode and command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - the same 2 checks are in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
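
    # Note on the loop above: for r''' strings, expected_end_tokens is 3, so
    # parsing only stops after three CONSECUTIVE Right_SingleQuote tokens; any
    # intervening token resets num_end_tokens to 0.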

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH-style strings:

        r''        u''        b''
        r''' '''   u''' '''   b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])
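
    # Illustration of the triple-quote hack above: given u'''x''', the u''
    # first parses as an empty string; since the next byte is another quote,
    # the third ' is re-tagged as Id.Left_UTSingleQuote and the body is
    # re-read as a triple-quoted string.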

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got the empty word "" and there's a " after it
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let the caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after it
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let the caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item         = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT         = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT        = ')'
          ExtGlob      = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
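
    # ExtGlob examples, per the grammar above (illustrative):
    #   @(foo|bar)  -> two arms
    #   @(foo|)     -> two arms; the second is an empty CompoundWord
    #   @(||)       -> three empty arms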

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
        BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash-style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None if
            we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax error in
                        # YSH.
                        # Slight hole: we don't catch x = ${undef:-"\z"} because of the
                        # recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode for it
                # later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'
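
        # Sketch of the result for "hi $name": out_parts gets a Lit_Chars
        # token for 'hi ' and a SimpleVarSub for $name; the closing
        # Right_DoubleQuote token is appended and then popped above.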
 | 
| 1089 | 
 | 
| 1090 |     def _ReadDoubleQuoted(self, left_token):
 | 
| 1091 |         # type: (Token) -> DoubleQuoted
 | 
| 1092 |         """Helper function for "hello $name".
 | 
| 1093 | 
 | 
| 1094 |         Args:
 | 
| 1095 |           eof_type: for stopping at }, Id.Lit_RBrace
 | 
| 1096 |           here_doc: Whether we are reading in a here doc context
 | 
| 1097 | 
 | 
| 1098 |         Also ${foo%%a b c}  # treat this as double quoted.  until you hit
 | 
| 1099 |         """
 | 
| 1100 |         parts = []  # type: List[word_part_t]
 | 
| 1101 |         self._ReadLikeDQ(left_token, False, parts)
 | 
| 1102 | 
 | 
| 1103 |         right_quote = self.cur_token
 | 
| 1104 |         return DoubleQuoted(left_token, parts, right_quote)
 | 
| 1105 | 
 | 
    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't want
            # to interleave parsing and execution!  Unlike 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then double quotes
                    # within them have to be \"
                    # Shells aren't smart enough to match nested " and ` quotes (but OSH
                    # is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py.  It
            # won't have the same location info as MakeParserForCommandSub(), because
            # the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

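    # How the PushHint mechanism above plays out (illustrative, not normative):
    #
    #     echo $(echo hi)
    #              ^    ^-- lexed as Id.Eof_RParen, because of
    #              |        PushHint(Id.Op_RParen, Id.Eof_RParen)
    #              +------- parsed by a child parser, which stops at that
    #                       "EOF" token
    #
    # The same trick retargets the closing ` for backticks in lossless mode.
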
 | 
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

 | 
    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what the
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

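    # A sketch of the handoff above (illustrative): for
    #
    #     if (x) { var y = 42 }
    #
    # the expression parser reads through the final } and returns it as
    # Op_RBrace; retagging it to Lit_RBrace and stashing it in
    # self.buffered_word lets the CommandParser consume it as the end of
    # the { } block, exactly once.
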
 | 
    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what the
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                # setvar (a[0])[1] = v
                # setvar (d.key).other = v
                # This bleeds into the problem of catching all typos
                # statically, which may be possible if 'use' makes all names
                # explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

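    # Examples of the static check above (hedged; exact diagnostics may vary):
    #
    #     setvar i = 42      # var_checker.Check() can flag 'i' if undeclared
    #     setvar a[i] = 42   # checks 'a', the base variable of the subscript
    #     setvar d.key = 42  # checks 'd', the base variable of the attribute
    #
    # Only plain Var / Subscript / Attribute bases are checked, per the
    # tagswitch cases above.
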
 | 
    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

 | 
    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to be
        # pointing at the token right after the expression.  But the expression
        # parser must have read to the `last_token`.  Unreading places the lexer
        # back in the expected state.  I.e.:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #                 ^ The lexer is here                 ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't disallowed
        # for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for an optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id indicating which of these alternatives comes next:

             word { echo word }
             (3)  { echo expr }
             /e/  { echo eggex }
           }        # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot look ahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

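    # Mapping from the lookahead above to case arm shapes (illustrative):
    #
    #     Id.Op_LParen    ->  (3)  { echo expr }    stay in expression mode
    #     Id.Arith_Slash  ->  /e/  { echo eggex }   stay in expression mode
    #     Id.Op_RBrace    ->  }                     stay in expression mode
    #     anything else   ->  word { echo word }    switch to ShCommand mode,
    #                                               consuming the newline
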
 | 
    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression, e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated, in cases like:
        # $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
        # could save the lexer/reader state here, and retry if the arithmetic parse
        # fails.  But we can almost always catch this at parse time.  There could
        # be some exceptions like:
        # $((echo * foo))  # looks like multiplication
        # $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

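    # Sketch of the token states above (illustrative):
    #
    #     $(( 1 + 2 ))
    #     ^^          ^-- Right_DollarDParen, via the PushHint
    #     +-------------- left_tok
    #
    #     $(( ))          # empty: anode stays arith_expr.EmptyZero
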
 | 
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to _ReadArithExpr
        above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

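    # Defaults chosen above for omitted sections (illustrative):
    #
    #     for (( ; ; ))      # init=EmptyZero  cond=EmptyOne  update=EmptyZero
    #     for (( i=0; ; ))   # empty condition is TRUE -> an infinite loop
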
 | 
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it before?
        Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL of them have keys, or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal', loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

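    # How the first word decides the literal kind above (illustrative):
    #
    #     a=(1 2 3)        # no [k]= pair detected -> ShArrayLiteral
    #     A=([k]=v [j]=w)  # first word is a pair  -> BashAssocLiteral
    #     A=([k]=v 2)      # first is a pair, second isn't -> parse error
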
 | 
    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

    def _MaybeReadWordPart(self, is_first, lex_mode, parts):
        # type: (bool, lex_mode_t, List[word_part_t]) -> bool
        """Helper for _ReadCompoundWord3."""
        done = False

        if self.token_type == Id.Lit_EscapedChar:
            tok = self.cur_token
            assert tok.length == 2
            ch = lexer.TokenSliceLeft(tok, 1)
            if not self.parse_opts.parse_backslash():
                if not pyutil.IsValidCharEscape(ch):
                    p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                          self.cur_token)

            part = word_part.EscapedLiteral(self.cur_token,
                                            ch)  # type: word_part_t
        else:
            part = self.cur_token

        if is_first and self.token_type == Id.Lit_VarLike:  # foo=
            parts.append(part)
            # Unfortunately it's awkward to pull the check for a=(1 2) up to
            # _ReadWord.
            next_id = self.lexer.LookPastSpace(lex_mode)
            if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # Array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)]
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
            # at the beginning of a word to be reserved.

            # Although should we relax 'echo @' ?  I'm tempted to have a shortcut for
            # @_argv and the like.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done

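    # Lookahead consequences of the cases above (illustrative):
    #
    #     a=(1 2)      # Lit_VarLike, then LookPastSpace sees ( -> array literal
    #     a=(1 2)x     # parse error: array literal must end the word
    #     @words"x"    # parse error: a splice must end the word too
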
 | 
    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
        could be an operator delimiting a compound word.  Can we change lexer modes
        and remove this special case?
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Syntax error for { and }
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also use ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal, just like
                    # a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #  pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
                # so to test for ESAC, we can read ) before getting a chance to
                # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
                # token and do it again.

                # We get Id.Op_RParen at top level:      case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }.  Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

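    # The brace_count check above in action (illustrative):
    #
    #     echo {a,b}   # balanced within the word, brace_count == 0 -> OK
    #     echo foo{    # parse error (parse_brace): unbalanced { } in a word
    #     echo \{      # OK: quoting makes the literal brace explicit
    #
    # Note that a bare { or } is its own single-part word, so the num_parts > 1
    # condition lets it through.
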
 | 
    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """Helper for ReadArithWord()."""
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_ShArrayLiteral):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It can't be
                # Eof_{RParen,Backtick} because #) and #` are comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex state to
                # lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #     echo r'hi' is like echo 'hi'
                    #
                    #     echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.token  # cheat to make test pass
        part.right = part.token

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
           array= (1 2 3)
           YSH for (  versus  bash for ((
           YSH if (  versus  if test
           YSH while (  versus  while test
           YSH bare assignment 'grep ='  versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

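    # The buffered_word path above (illustrative): after ParseVarDecl() etc.
    # hand a trailing token like ; } or newline back via self.buffered_word,
    # the next ReadWord() call returns it directly instead of touching the
    # lexer, so the CommandParser sees that terminator exactly once.
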
 | 
    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4
 |