OILS / osh / word_parse.py
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """VarOf ':' ArithExpr (':' ArithExpr )?"""
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        cur_id = self.token_type  # e.g. Id.Arith_Colon

        if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
            # no beginning specified
            begin = None  # type: Optional[arith_expr_t]
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id == Id.Arith_RBrace:
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        # Id.Arith_Colon is a pun for Id.VOp2_Colon
        if cur_id == Id.Arith_Colon:
            self._SetNext(lex_mode_e.Arith)
            length = self._ReadArithExpr(Id.Arith_RBrace)
            return suffix_op.Slice(begin, length)

        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy
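
    # A few inputs the slice grammar above accepts (added for illustration);
    # begin and/or length may be omitted, and each is a full arith expression:
    #
    #   ${s:1}        # begin only
    #   ${s:1:2}      # begin and length
    #   ${s::2}       # begin omitted -- the leading : puns as Id.Arith_Colon
    #   ${s:x+1:y+2}  # arbitrary expressions, parsed by _ReadArithExpr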

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
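
    # Illustrative inputs for the PatSub grammar above (added); replace_mode
    # distinguishes the anchored and global forms:
    #
    #   ${x/pat/r}   # no replace_mode token: replace the first match
    #   ${x//pat/r}  # replace_mode '/': replace all matches
    #   ${x/#pat/r}  # replace_mode '#': anchor the pattern at the start
    #   ${x/%pat/r}  # replace_mode '%': anchor the pattern at the end
    #   ${x/pat}     # missing replacement becomes rhs_word.Empty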

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
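
    # For illustration (added): the two bracket_op variants produced above
    # correspond to
    #
    #   ${a[@]}  ${a[*]}    -> bracket_op.WholeArray
    #   ${a[0]}  ${a[i+1]}  -> bracket_op.ArrayIndex, via _ReadArithExpr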

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                 # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '  # ${x|html} and ${x %.3f};
                                 # SPACE is the operator, not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub  = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
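
    # Sketch of the end-token counting above (added; not from the original
    # source).  For u''' strings, expected_end_tokens is 3, so the loop only
    # terminates after three CONSECUTIVE Id.Right_SingleQuote tokens:
    #
    #   u'''
    #   isn't        # the lone ' counts 1; the next token resets it to 0
    #   '''          # three in a row terminate; all three are then popped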

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
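
    # How the read_word flag above makes empty arms explicit (added for
    # illustration):
    #
    #   @(foo|bar)  -> arms [foo, bar]
    #   @(foo|)     -> arms [foo, <empty CompoundWord>]
    #   @(||)       -> arms [<empty>, <empty>, <empty>]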

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also handles ${foo%%a b c} -- the argument is treated as double
        quoted until you hit the closing brace.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
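
    # Why backticks are collected as strings and re-parsed (added note): the
    # same character opens and closes ``, so nested command subs need escaped
    # inner backticks, while $() nests without escaping:
    #
    #   echo `echo \`hostname\``    # Backtick_Quoted tokens strip the \
    #   echo $(echo $(hostname))    # handled by MakeParserForCommandSub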

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #          ^ The lexer is here                  ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word  { echo word }
          (3)   { echo expr }
          /e/   { echo eggex }
        }       # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        # TODO: This could be DQ or Arith too
        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        # Two right parens break the Id.Eof_RParen scheme
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating `Op_DRightParen` token for use as
        location tracking.
        """
        # The second one needs to be disambiguated in stuff like:
        # TODO: Be consistent with ReadForExpression below and use
        # lex_mode_e.Arith?  Then you can get rid of this.
        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        self._GetToken()
        right = self.cur_token
        if self.token_type != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', self.cur_token)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _SetNextNonSpace(self):
        # type: () -> None
        """Same logic as _ReadWord, but for ReadForExpression."""
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._SetNextNonSpace()  # skip over ((

        self._GetToken()
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = None  # type: Optional[arith_expr_t]
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            cond_node = None  # type: Optional[arith_expr_t]
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_RParen:  # for (( ; ; ))
            update_node = None  # type: Optional[arith_expr_t]
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)
        self._SetNextNonSpace()

        self._GetToken()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
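
    # Examples for the branches above (added for illustration):
    #
    #   a=(1 2 3)            -> ShArrayLiteral
    #   a=()                 -> ShArrayLiteral with no words
    #   A=([k1]=v1 [k2]=v2)  -> BashAssocLiteral; the first word decides, and
    #                           then EVERY word must be a [key]=value pair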
1656
1657 def ParseProcCallArgs(self, start_symbol):
1658 # type: (int) -> ArgList
1659 """ json write (x) """
1660 self.lexer.MaybeUnreadOne()
1661
1662 arg_list = ArgList.CreateNull(alloc_lists=True)
1663 arg_list.left = self.cur_token
1664 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1665 return arg_list
1666
1667 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1668 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1669 """Helper for _ReadCompoundWord3."""
1670 done = False
1671
1672 if self.token_type == Id.Lit_EscapedChar:
1673 tok = self.cur_token
1674 assert tok.length == 2
1675 ch = lexer.TokenSliceLeft(tok, 1)
1676 if not self.parse_opts.parse_backslash():
1677 if not pyutil.IsValidCharEscape(ch):
1678 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1679 self.cur_token)
1680
1681 part = word_part.EscapedLiteral(self.cur_token,
1682 ch) # type: word_part_t
1683 else:
1684 part = self.cur_token
1685
1686 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1687 parts.append(part)
1688 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1689 # _ReadWord.
1690 next_id = self.lexer.LookPastSpace(lex_mode)
1691 if next_id == Id.Op_LParen:
1692 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1693 part2 = self._ReadArrayLiteral()
1694 parts.append(part2)
1695
1696 # Array literal must be the last part of the word.
1697 self._SetNext(lex_mode)
1698 self._GetToken()
1699 # EOF, whitespace, newline, Right_Subshell
1700 if self.token_kind not in KINDS_THAT_END_WORDS:
1701 p_die('Unexpected token after array literal',
1702 self.cur_token)
1703 done = True
1704
1705 elif (is_first and self.parse_opts.parse_at() and
1706 self.token_type == Id.Lit_Splice):
1707
1708 splice_tok = self.cur_token
1709 part2 = word_part.Splice(splice_tok,
1710 lexer.TokenSliceLeft(splice_tok, 1))
1711
1712 parts.append(part2)
1713
1714 # @words must be the last part of the word
1715 self._SetNext(lex_mode)
1716 self._GetToken()
1717 # EOF, whitespace, newline, Right_Subshell
1718 if self.token_kind not in KINDS_THAT_END_WORDS:
1719 p_die('Unexpected token after array splice', self.cur_token)
1720 done = True
1721
1722 elif (is_first and self.parse_opts.parse_at() and
1723 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1724 part2 = self._ReadExprSub(lex_mode_e.DQ)
1725 parts.append(part2)
1726
1727 # @[split(x)]
1728 self._SetNext(lex_mode)
1729 self._GetToken()
1730 # EOF, whitespace, newline, Right_Subshell
1731 if self.token_kind not in KINDS_THAT_END_WORDS:
1732 p_die('Unexpected token after Expr splice', self.cur_token)
1733 done = True
1734
1735 elif (is_first and self.parse_opts.parse_at() and
1736 self.token_type == Id.Lit_AtLBraceDot):
1737 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1738
1739 elif (is_first and self.parse_opts.parse_at_all() and
1740 self.token_type == Id.Lit_At):
1741 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1742 # at the beginning of a word to be reserved.
1743
1744            # Although, should we relax 'echo @'? It's tempting to have a
1745            # shortcut for @_argv.
1746 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1747 self.cur_token)
1748
1749 else:
1750            # A plain literal part that needs no lookahead; just append it
1751 parts.append(part)
1752
1753 return done
1754
1755 def _ReadCompoundWord(self, lex_mode):
1756 # type: (lex_mode_t) -> CompoundWord
1757 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1758
1759 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1760 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1761 """
1762 Precondition: Looking at the first token of the first word part
1763 Postcondition: Looking at the token after, e.g. space or operator
1764
1765 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1766 could be an operator delimiting a compound word. Can we change lexer modes
1767 and remove this special case?
1768 """
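        # Example of eof_type: when reading 'pat' in ${foo//pat/replace},
        # eof_type is the slash token, so / ends the compound word there
        # even though Lit_Slash is normally a literal.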
1769 w = CompoundWord([])
1770 num_parts = 0
1771 brace_count = 0
1772 done = False
1773 is_triple_quoted = None # type: Optional[BoolParamBox]
1774
1775 while not done:
1776 self._GetToken()
1777
1778 allow_done = empty_ok or num_parts != 0
1779 if allow_done and self.token_type == eof_type:
1780 done = True # e.g. for ${foo//pat/replace}
1781
1782 # Keywords like "for" are treated like literals
1783 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1784 Kind.ControlFlow, Kind.BoolUnary,
1785 Kind.BoolBinary):
1786
1787                # Count { and } here; unbalanced braces are an error below
1788 if self.token_type == Id.Lit_LBrace:
1789 brace_count += 1
1790 elif self.token_type == Id.Lit_RBrace:
1791 brace_count -= 1
1792 elif self.token_type == Id.Lit_Dollar:
1793 if not self.parse_opts.parse_dollar():
1794 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1795 next_byte = self.lexer.ByteLookAhead()
1796 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1797 if next_byte == '/':
1798 #log('next_byte %r', next_byte)
1799 pass
1800
1801 p_die('Literal $ should be quoted like \$',
1802 self.cur_token)
1803
1804 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1805 w.parts)
1806
1807 elif self.token_kind == Kind.VSub:
1808 vsub_token = self.cur_token
1809
1810 part = SimpleVarSub(vsub_token) # type: word_part_t
1811 w.parts.append(part)
1812
1813 elif self.token_kind == Kind.ExtGlob:
1814 # If parse_at, we can take over @( to start @(seq 3)
1815                # Users can also use ,(*.py|*.sh)
1816 if (self.parse_opts.parse_at() and
1817 self.token_type == Id.ExtGlob_At and num_parts == 0):
1818 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1819 d_quoted=False)
1820 # RARE mutation of tok.id!
1821 cs_part.left_token.id = Id.Left_AtParen
1822 part = cs_part # for type safety
1823
1824 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1825 # a=(one two)x and @arrayfunc(3)x.
1826 self._GetToken()
1827 if self.token_kind not in KINDS_THAT_END_WORDS:
1828 p_die('Unexpected token after @()', self.cur_token)
1829 done = True
1830
1831 else:
1832 part = self._ReadExtGlob()
1833 w.parts.append(part)
1834
1835 elif self.token_kind == Kind.BashRegex:
1836 if self.token_type == Id.BashRegex_LParen: # Opening (
1837 part = self._ReadBashRegexGroup()
1838 w.parts.append(part)
1839 else:
1840 assert self.token_type == Id.BashRegex_AllowedInParens
1841 p_die('Invalid token in bash regex', self.cur_token)
1842
1843 elif self.token_kind == Kind.Left:
1844 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1845 lex_mode == lex_mode_e.ShCommand and
1846 num_parts == 0)
1847
1848 # Save allocation
1849 if try_triple_quote:
1850 is_triple_quoted = BoolParamBox(False)
1851
1852 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1853 w.parts.append(part)
1854
1855 # NOT done yet, will advance below
1856 elif self.token_kind == Kind.Right:
1857 # Still part of the word; will be done on the next iter.
1858 if self.token_type == Id.Right_DoubleQuote:
1859 pass
1860 # Never happens, no PushHint for this case.
1861 #elif self.token_type == Id.Right_DollarParen:
1862 # pass
1863 elif self.token_type == Id.Right_Subshell:
1864 # LEXER HACK for (case x in x) ;; esac )
1865 # Rewind before it's used
1866 assert self.next_lex_mode == lex_mode_e.Undefined
1867 if self.lexer.MaybeUnreadOne():
1868 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1869 self._SetNext(lex_mode)
1870 done = True
1871 else:
1872 done = True
1873
1874 elif self.token_kind == Kind.Ignored:
1875 done = True
1876
1877 else:
1878                # LEXER HACK for an unbalanced case clause. 'case foo in esac' is
1879                # valid, so while testing for esac we may read ) before getting a
1880                # chance to PushHint(Id.Op_RParen, Id.Right_CasePat). So here we
1881                # unread one token and do it again.
1882
1883 # We get Id.Op_RParen at top level: case x in x) ;; esac
1884 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1885 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1886 # Rewind before it's used
1887 assert self.next_lex_mode == lex_mode_e.Undefined
1888 if self.lexer.MaybeUnreadOne():
1889 if self.token_type == Id.Eof_RParen:
1890 # Redo translation
1891 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1892 self._SetNext(lex_mode)
1893
1894 done = True # anything we don't recognize means we're done
1895
1896 if not done:
1897 self._SetNext(lex_mode)
1898 num_parts += 1
1899
1900 if (self.parse_opts.parse_brace() and num_parts > 1 and
1901 brace_count != 0):
1902 # accept { and }, but not foo{
1903 p_die(
1904 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1905 loc.Word(w))
1906
1907 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1908 p_die('Unexpected parts after triple quoted string',
1909 loc.WordPart(w.parts[-1]))
1910
1911 if 0:
1912 from _devbuild.gen.syntax_asdl import word_part_str
1913 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1914 WORD_HIST[word_key] += 1
1915 return w
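    # Brace-balance examples for _ReadCompoundWord3 above (sketch):
    #   echo {a,b}    OK: balanced braces
    #   echo \{       OK: escaped, doesn't count toward brace_count
    #   echo foo{     error: multi-part word with unbalanced { }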
1916
1917 def _ReadArithWord(self):
1918 # type: () -> Optional[word_t]
1919 """ Helper for ReadArithWord() """
1920 self._GetToken()
1921
1922 if self.token_kind == Kind.Unknown:
1923 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1924 p_die(
1925 'Unexpected token while parsing arithmetic: %r' %
1926 lexer.TokenVal(self.cur_token), self.cur_token)
1927
1928 elif self.token_kind == Kind.Eof:
1929 return self.cur_token
1930
1931 elif self.token_kind == Kind.Ignored:
1932 # Space should be ignored.
1933 self._SetNext(lex_mode_e.Arith)
1934 return None
1935
1936 elif self.token_kind in (Kind.Arith, Kind.Right):
1937 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1938 self._SetNext(lex_mode_e.Arith)
1939 return self.cur_token
1940
1941 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1942 return self._ReadCompoundWord(lex_mode_e.Arith)
1943
1944 else:
1945 raise AssertionError(self.cur_token)
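    # Example for _ReadArithWord above: in $(( x + ${y} )), 'x' and '${y}'
    # are read as compound words in lex_mode_e.Arith, '+' comes back as a
    # Kind.Arith token, and spaces hit Kind.Ignored, returning None so the
    # caller retries.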
1946
1947 def _ReadWord(self, word_mode):
1948 # type: (lex_mode_t) -> Optional[word_t]
1949 """Helper function for ReadWord()."""
1950
1951 # Change the pseudo lexer mode to a real lexer mode
1952 if word_mode == lex_mode_e.ShCommandFakeBrack:
1953 lex_mode = lex_mode_e.ShCommand
1954 else:
1955 lex_mode = word_mode
1956
1957 self._GetToken()
1958
1959 if self.token_kind == Kind.Eof:
1960 # No advance
1961 return self.cur_token
1962
1963 # Allow Arith for ) at end of for loop?
1964 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1965 self._SetNext(lex_mode)
1966
1967 # Newlines are complicated. See 3x2 matrix in the comment about
1968 # self.multiline and self.newline_state above.
1969 if self.token_type == Id.Op_Newline:
1970 if self.multiline:
1971 if self.newline_state > 1:
1972 # This points at a blank line, but at least it gives the line number
1973 p_die('Invalid blank line in multiline mode',
1974 self.cur_token)
1975 return None
1976
1977 if self.returned_newline: # skip
1978 return None
1979
1980 return self.cur_token
1981
1982 elif self.token_kind == Kind.Right:
1983 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
1984 Id.Right_CasePat,
1985 Id.Right_ShArrayLiteral):
1986 raise AssertionError(self.cur_token)
1987
1988 self._SetNext(lex_mode)
1989 return self.cur_token
1990
1991 elif self.token_kind in (Kind.Ignored, Kind.WS):
1992 self._SetNext(lex_mode)
1993 return None
1994
1995 else:
1996 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
1997 Kind.Left, Kind.KW, Kind.ControlFlow,
1998 Kind.BoolUnary, Kind.BoolBinary,
1999 Kind.ExtGlob,
2000 Kind.BashRegex), 'Unhandled token kind'
2001
2002 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2003 self.parse_opts.parse_bracket() and
2004 self.token_type == Id.Lit_LBracket):
2005 # Change [ from Kind.Lit -> Kind.Op
2006 # So CommandParser can treat
2007 # assert [42 === x]
2008 # like
2009 # json write (x)
2010 bracket_word = self.cur_token
2011 bracket_word.id = Id.Op_LBracket
2012
2013 self._SetNext(lex_mode)
2014 return bracket_word
2015
2016 # We're beginning a word. If we see Id.Lit_Pound, change to
2017 # lex_mode_e.Comment and read until end of line.
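            # For example, 'echo hi # tail' starts a comment, but in
            # 'echo hi#there' the # stays literal, because we only reach
            # this point at the START of a word.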
2018 if self.token_type == Id.Lit_Pound:
2019 self._SetNext(lex_mode_e.Comment)
2020 self._GetToken()
2021
2022 # NOTE: The # could be the last character in the file. It can't be
2023 # Eof_{RParen,Backtick} because #) and #` are comments.
2024 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2025 self.cur_token
2026
2027 # The next iteration will go into Kind.Ignored and set lex state to
2028 # lex_mode_e.ShCommand/etc.
2029 return None # tell ReadWord() to try again after comment
2030
2031 elif self.token_type == Id.Lit_TPound: ### doc comment
2032 self._SetNext(lex_mode_e.Comment)
2033 self._GetToken()
2034
2035 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2036 return self.cur_token
2037
2038 return None # tell ReadWord() to try again after comment
2039
2040 else:
2041 # r'' u'' b''
2042 if (self.token_type == Id.Lit_Chars and
2043 self.lexer.LookAheadOne(
2044 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2045
2046 # When shopt -s parse_raw_string:
2047 # echo r'hi' is like echo 'hi'
2048 #
2049 # echo u'\u{3bc}' b'\yff' works
2050
2051 tok = self.cur_token
2052 if self.parse_opts.parse_ysh_string():
2053 if lexer.TokenEquals(tok, 'r'):
2054 left_id = Id.Left_RSingleQuote
2055 elif lexer.TokenEquals(tok, 'u'):
2056 left_id = Id.Left_USingleQuote
2057 elif lexer.TokenEquals(tok, 'b'):
2058 left_id = Id.Left_BSingleQuote
2059 else:
2060 left_id = Id.Undefined_Tok
2061
2062 if left_id != Id.Undefined_Tok:
2063 # skip the r, and then 'foo' will be read as normal
2064 self._SetNext(lex_mode_e.ShCommand)
2065
2066 self._GetToken()
2067 assert self.token_type == Id.Left_SingleQuote, self.token_type
2068
2069 # Read the word in a different lexer mode
2070 return self._ReadYshSingleQuoted(left_id)
2071
2072 return self._ReadCompoundWord(lex_mode)
2073
2074 def ParseVarRef(self):
2075 # type: () -> BracedVarSub
2076 """DYNAMIC parsing of what's inside ${!ref}
2077
2078 # Same as VarOf production
2079 VarRefExpr = VarOf EOF
2080 """
2081 self._SetNext(lex_mode_e.VSub_1)
2082
2083 self._GetToken()
2084 if self.token_kind != Kind.VSub:
2085 p_die('Expected var name', self.cur_token)
2086
2087 part = self._ParseVarOf()
2088 # NOTE: no ${ } means no part.left and part.right
2089 part.left = part.token # cheat to make test pass
2090 part.right = part.token
2091
2092 self._GetToken()
2093 if self.token_type != Id.Eof_Real:
2094 p_die('Expected end of var ref expression', self.cur_token)
2095 return part
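    # Sketch of the dynamic case ParseVarRef above serves (bash indirection):
    #
    #   ref=PATH
    #   echo ${!ref}   # the VALUE of ref, 'PATH', is re-parsed at runtime
    #                  # as the VarOf production, then expanded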
2096
2097 def LookPastSpace(self):
2098 # type: () -> Id_t
2099        """Return the Id of the current token, looking past any buffered space.
2100
2101 For the CommandParser to recognize
2102 array= (1 2 3)
2103 YSH for ( versus bash for ((
2104 YSH if ( versus if test
2105 YSH while ( versus while test
2106 YSH bare assignment 'grep =' versus 'grep foo'
2107 """
2108 assert self.token_type != Id.Undefined_Tok
2109 if self.cur_token.id == Id.WS_Space:
2110 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2111 else:
2112 id_ = self.cur_token.id
2113 return id_
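    # Example for LookPastSpace above: after reading 'array=', the buffered
    # token may be WS_Space; looking past it yields Id.Op_LParen for
    # 'array= (1 2 3)', so the CommandParser can recognize that form.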
2114
2115 def LookAheadFuncParens(self):
2116 # type: () -> bool
2117 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2118 assert self.token_type != Id.Undefined_Tok
2119
2120 # We have to handle 2 cases because we buffer a token
2121 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2122 return self.lexer.LookAheadFuncParens(1) # go back one char
2123
2124 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2125 return self.lexer.LookAheadFuncParens(0)
2126
2127 else:
2128 return False
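    # Example for LookAheadFuncParens above: for 'f( ) {' the buffered token
    # is the '(' itself, so we look ahead from one char back; for 'f ( ) {'
    # the buffered token is the space, so we look ahead from here.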
2129
2130 def ReadWord(self, word_mode):
2131 # type: (lex_mode_t) -> word_t
2132 """Read the next word, using the given lexer mode.
2133
2134 This is a stateful wrapper for the stateless _ReadWord function.
2135 """
2136 assert word_mode in (lex_mode_e.ShCommand,
2137 lex_mode_e.ShCommandFakeBrack,
2138 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2139
2140 if self.buffered_word: # For integration with pgen2
2141 w = self.buffered_word
2142 self.buffered_word = None
2143 else:
2144 while True:
2145 w = self._ReadWord(word_mode)
2146 if w is not None:
2147 break
2148
2149 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2150 return w
2151
2152 def ReadArithWord(self):
2153 # type: () -> word_t
2154 while True:
2155 w = self._ReadArithWord()
2156 if w is not None:
2157 break
2158 return w
2159
2160 def ReadHereDocBody(self, parts):
2161 # type: (List[word_part_t]) -> None
2162 """
2163 A here doc is like a double quoted context, except " isn't special.
2164 """
2165 self._ReadLikeDQ(None, False, parts)
2166 # Returns nothing
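
        # Example body (sketch): double quotes are literal here, but the
        # usual substitutions still apply:
        #
        #   cat <<EOF
        #   "quoted" stays as-is, while $x and $(echo hi) expand
        #   EOF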
2167
2168 def ReadForPlugin(self):
2169 # type: () -> CompoundWord
2170 """For $PS1, $PS4, etc.
2171
2172 This is just like reading a here doc line. "\n" is allowed, as
2173 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2174 """
2175 w = CompoundWord([])
2176 self._ReadLikeDQ(None, False, w.parts)
2177 return w
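    # Example for ReadForPlugin above (sketch): a prompt like
    #
    #   PS1='${PWD} $(date +%H:%M) > '
    #
    # is parsed with the same substitutions as a here doc line.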
2178
2179 def EmitDocToken(self, b):
2180 # type: (bool) -> None
2181 self.emit_doc_token = b
2182
2183 def Multiline(self, b):
2184 # type: (bool) -> None
2185 self.multiline = b
2186
2187
2188if 0:
2189 import collections
2190 WORD_HIST = collections.Counter()
2191
2192# vim: sw=4