# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

  hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]

class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For ###
        # doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

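    # The two methods above form the lazy token-reading protocol used
    # throughout this class (a sketch of the calling convention, not new API):
    #
    #   self._SetNext(lex_mode_e.VSub_1)  # decide the lex mode; reads nothing
    #   self._GetToken()                  # now cur_token / token_type /
    #                                     # token_kind are valid for that mode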
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

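    # Examples of the bash slice quirk handled above (a sketch of the behavior
    # described in the comments; spec tests are authoritative):
    #
    #   $ a=abcdef
    #   $ echo ${a:1}    # bcdef -- no length means "the rest of the string"
    #   $ echo ${a:1:}   # (empty) -- a trailing : means length ZERO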
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

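    # A few inputs the Match rule above accepts (a sketch; '/' '#' '%' are the
    # replace_mode prefixes):
    #
    #   ${v/x/y}    replace the first match (no prefix)
    #   ${v//x/y}   replace_mode == Id.Lit_Slash: replace ALL matches
    #   ${v/#x/y}   replace_mode == Id.Lit_Pound: match at the beginning
    #   ${v/%x/y}   replace_mode == Id.Lit_Percent: match at the end
    #   ${v/x}      same as ${v/x/}, i.e. an empty replacement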
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

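    # Rough map from operator Kind to the suffix ops built above (a sketch):
    #
    #   Kind.VTest   ${x:-default}  -> suffix_op.Unary
    #   Kind.VOp0    ${x@Q}         -> nullary Token
    #   Kind.VOp1    ${x#prefix}    -> suffix_op.Unary, arg lexed as a glob
    #   Kind.VOp2    ${x/pat/r}     -> suffix_op.PatSub
    #                ${x:1:2}       -> suffix_op.Slice
    #   Kind.VOp3    ${!prefix@}    -> nullary Token (only with allow_query)
    #   Kind.VOpYsh  ${x|html}      -> suffix_op.Static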
    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                  # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '               # ${x|html} and ${x %.3f}.
                                              # SPACE is operator not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

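    # Examples of how the prefix dispatch above classifies input (a sketch):
    #
    #   ${#s}     VSub_Pound followed by a name: length prefix
    #   ${#}      '#' is the variable itself (the number of arguments)
    #   ${!ref}   VSub_Bang followed by a name: named ref / keys
    #   ${!}      '!' is the variable itself (PID of the last background job)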
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

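    # A worked example of the end-token counting above: for r'''a''b''' the
    # lexer yields ' tokens one at a time, and num_end_tokens resets to 0
    # after 'a' and 'b'.  Only three CONSECUTIVE quotes terminate the string,
    # and the three spurious end tokens are then popped.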
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

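    # Example of the triple-quote hack above: for a u''' string, we first read
    # an empty u'' part, see the third quote via ByteLookAhead(), then re-read
    # with Id.Left_UTSingleQuote so that ''' becomes the terminator.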
    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty

          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

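    # Inputs the ExtGlob grammar above accepts (a sketch):
    #
    #   @(foo|bar)   two arms
    #   @(foo|)      the second arm is an empty CompoundWord([])
    #   @(||)        three empty arms, since Item may be EPSILON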
    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add more
                # support for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also used for ${foo%%a b c} -- the argument is treated as double
        quoted until you hit the closing }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

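    # Example of the backtick re-parse above: in  echo `echo \`date\``  the
    # inner \` tokens are Id.Backtick_Quoted.  Their leading backslashes are
    # stripped, so code_str is 'echo `date`', which is then parsed again with
    # a fresh lexer (hence source.Reparsed for location info).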
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        #   case (x) {                    case (x) {
        #     (else) { = x }                (else) { = x }
        #            ^ The lexer is here          ^ Unread to here
        #   }                             }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word   { echo word }
          (3)    { echo expr }
          /e/    { echo eggex }
          }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

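    # Forms accepted above, showing the defaults (a sketch):
    #
    #   for ((i=0; i<5; ++i))   all three expressions given
    #   for ((; i<5; ))         init and update default to EmptyZero
    #   for ((;;))              empty condition defaults to EmptyOne, i.e. TRUE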
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

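    # Examples of the key/value rule above (the FIRST word decides):
    #
    #   a=(1 2 3)          -> ShArrayLiteral
    #   a=([k]=1 [j]=2)    -> BashAssocLiteral; every word must be a pair
    #   a=([k]=1 2)        -> p_die: "Expected associative array pair"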
1682 def ParseProcCallArgs(self, start_symbol):
1683 # type: (int) -> ArgList
1684 """ json write (x) """
1685 self.lexer.MaybeUnreadOne()
1686
1687 arg_list = ArgList.CreateNull(alloc_lists=True)
1688 arg_list.left = self.cur_token
1689 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1690 return arg_list
1691
1692 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1693 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1694 """Helper for _ReadCompoundWord3."""
1695 done = False
1696
1697 if self.token_type == Id.Lit_EscapedChar:
1698 tok = self.cur_token
1699 assert tok.length == 2
1700 ch = lexer.TokenSliceLeft(tok, 1)
1701 if not self.parse_opts.parse_backslash():
1702 if not pyutil.IsValidCharEscape(ch):
1703 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1704 self.cur_token)
1705
1706 part = word_part.EscapedLiteral(self.cur_token,
1707 ch) # type: word_part_t
1708 else:
1709 part = self.cur_token
1710
1711 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1712 parts.append(part)
1713 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1714 # _ReadWord.
1715 next_id = self.lexer.LookPastSpace(lex_mode)
1716 if next_id == Id.Op_LParen:
1717 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1718 part2 = self._ReadArrayLiteral()
1719 parts.append(part2)
1720
1721 # Array literal must be the last part of the word.
1722 self._SetNext(lex_mode)
1723 self._GetToken()
1724 # EOF, whitespace, newline, Right_Subshell
1725 if self.token_kind not in KINDS_THAT_END_WORDS:
1726 p_die('Unexpected token after array literal',
1727 self.cur_token)
1728 done = True
1729
1730 elif (is_first and self.parse_opts.parse_at() and
1731 self.token_type == Id.Lit_Splice):
1732
1733 splice_tok = self.cur_token
1734 part2 = word_part.Splice(splice_tok,
1735 lexer.TokenSliceLeft(splice_tok, 1))
1736
1737 parts.append(part2)
1738
1739 # @words must be the last part of the word
1740 self._SetNext(lex_mode)
1741 self._GetToken()
1742 # EOF, whitespace, newline, Right_Subshell
1743 if self.token_kind not in KINDS_THAT_END_WORDS:
1744 p_die('Unexpected token after array splice', self.cur_token)
1745 done = True
1746
1747 elif (is_first and self.parse_opts.parse_at() and
1748 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1749 part2 = self._ReadExprSub(lex_mode_e.DQ)
1750 parts.append(part2)
1751
1752 # @[split(x)]
1753 self._SetNext(lex_mode)
1754 self._GetToken()
1755 # EOF, whitespace, newline, Right_Subshell
1756 if self.token_kind not in KINDS_THAT_END_WORDS:
1757 p_die('Unexpected token after Expr splice', self.cur_token)
1758 done = True
1759
1760 elif (is_first and self.parse_opts.parse_at() and
1761 self.token_type == Id.Lit_AtLBraceDot):
1762 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1763
1764 elif (is_first and self.parse_opts.parse_at_all() and
1765 self.token_type == Id.Lit_At):
1766 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1767 # at the beginning of a word to be reserved.
1768
1769 # Although, should we relax 'echo @'? It's tempting to make it a
1770 # shortcut for @_argv.
1771 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1772 self.cur_token)
1773
1774 else:
1775 # not a literal with lookahead; append it
1776 parts.append(part)
1777
1778 return done
1779
1780 def _ReadCompoundWord(self, lex_mode):
1781 # type: (lex_mode_t) -> CompoundWord
1782 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1783
1784 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1785 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1786 """
1787 Precondition: Looking at the first token of the first word part
1788 Postcondition: Looking at the token after, e.g. space or operator
1789
1790 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1791 could be an operator delimiting a compound word. Can we change lexer modes
1792 and remove this special case?
1793 """
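# e.g. (illustrative) for ${foo//pat/replace}, 'pat' is read with
# eof_type == Id.Lit_Slash, so an unquoted / terminates the word.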
1794 w = CompoundWord([])
1795 num_parts = 0
1796 brace_count = 0
1797 done = False
1798 is_triple_quoted = None # type: Optional[BoolParamBox]
1799
1800 while not done:
1801 self._GetToken()
1802
1803 allow_done = empty_ok or num_parts != 0
1804 if allow_done and self.token_type == eof_type:
1805 done = True # e.g. for ${foo//pat/replace}
1806
1807 # Keywords like "for" are treated like literals
1808 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1809 Kind.ControlFlow, Kind.BoolUnary,
1810 Kind.BoolBinary):
1811
1812 # Count { and } here; unbalanced braces are a syntax error after the loop
1813 if self.token_type == Id.Lit_LBrace:
1814 brace_count += 1
1815 elif self.token_type == Id.Lit_RBrace:
1816 brace_count -= 1
1817 elif self.token_type == Id.Lit_Dollar:
1818 if not self.parse_opts.parse_dollar():
1819 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1820 next_byte = self.lexer.ByteLookAhead()
1821 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1822 if next_byte == '/':
1823 #log('next_byte %r', next_byte)
1824 pass
1825
1826 p_die('Literal $ should be quoted like \$',
1827 self.cur_token)
1828
1829 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1830 w.parts)
1831
1832 elif self.token_kind == Kind.VSub:
1833 vsub_token = self.cur_token
1834
1835 part = SimpleVarSub(vsub_token) # type: word_part_t
1836 w.parts.append(part)
1837
1838 elif self.token_kind == Kind.ExtGlob:
1839 # With parse_at, we can take over @( to start a command sub: @(seq 3)
1840 # Users can also use ,(*.py|*.sh)
1841 if (self.parse_opts.parse_at() and
1842 self.token_type == Id.ExtGlob_At and num_parts == 0):
1843 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1844 d_quoted=False)
1845 # RARE mutation of tok.id!
1846 cs_part.left_token.id = Id.Left_AtParen
1847 part = cs_part # for type safety
1848
1849 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1850 # a=(one two)x and @arrayfunc(3)x.
1851 self._GetToken()
1852 if self.token_kind not in KINDS_THAT_END_WORDS:
1853 p_die('Unexpected token after @()', self.cur_token)
1854 done = True
1855
1856 else:
1857 part = self._ReadExtGlob()
1858 w.parts.append(part)
1859
1860 elif self.token_kind == Kind.BashRegex:
1861 if self.token_type == Id.BashRegex_LParen: # Opening (
1862 part = self._ReadBashRegexGroup()
1863 w.parts.append(part)
1864 else:
1865 assert self.token_type == Id.BashRegex_AllowedInParens
1866 p_die('Invalid token in bash regex', self.cur_token)
1867
1868 elif self.token_kind == Kind.Left:
1869 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1870 lex_mode == lex_mode_e.ShCommand and
1871 num_parts == 0)
1872
1873 # Save allocation
1874 if try_triple_quote:
1875 is_triple_quoted = BoolParamBox(False)
1876
1877 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1878 w.parts.append(part)
1879
1880 # NOT done yet, will advance below
1881 elif self.token_kind == Kind.Right:
1882 # Still part of the word; will be done on the next iter.
1883 if self.token_type == Id.Right_DoubleQuote:
1884 pass
1885 # Never happens, no PushHint for this case.
1886 #elif self.token_type == Id.Right_DollarParen:
1887 # pass
1888 elif self.token_type == Id.Right_Subshell:
1889 # LEXER HACK for (case x in x) ;; esac )
1890 # Rewind before it's used
1891 assert self.next_lex_mode == lex_mode_e.Undefined
1892 if self.lexer.MaybeUnreadOne():
1893 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1894 self._SetNext(lex_mode)
1895 done = True
1896 else:
1897 done = True
1898
1899 elif self.token_kind == Kind.Ignored:
1900 done = True
1901
1902 else:
1903 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1904 # so to test for ESAC, we can read ) before getting a chance to
1905 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1906 # token and do it again.
1907
1908 # We get Id.Op_RParen at top level: case x in x) ;; esac
1909 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1910 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1911 # Rewind before it's used
1912 assert self.next_lex_mode == lex_mode_e.Undefined
1913 if self.lexer.MaybeUnreadOne():
1914 if self.token_type == Id.Eof_RParen:
1915 # Redo translation
1916 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1917 self._SetNext(lex_mode)
1918
1919 done = True # anything we don't recognize means we're done
1920
1921 if not done:
1922 self._SetNext(lex_mode)
1923 num_parts += 1
1924
1925 if (self.parse_opts.parse_brace() and num_parts > 1 and
1926 brace_count != 0):
1927 # accept { and }, but not foo{
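# e.g. (illustrative) 'echo {' (one part) and the balanced 'echo {a,b}'
# are accepted, but 'echo foo{' dies here.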
1928 p_die(
1929 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1930 loc.Word(w))
1931
1932 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1933 p_die('Unexpected parts after triple quoted string',
1934 loc.WordPart(w.parts[-1]))
1935
1936 if 0:
1937 from _devbuild.gen.syntax_asdl import word_part_str
1938 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1939 WORD_HIST[word_key] += 1
1940 return w
1941
1942 def _ReadArithWord(self):
1943 # type: () -> Optional[word_t]
1944 """ Helper for ReadArithWord() """
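# Returning None means "no word yet, call again", e.g. (illustrative)
# for the ignored spaces inside $(( 1 + 2 )).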
1945 self._GetToken()
1946
1947 if self.token_kind == Kind.Unknown:
1948 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1949 p_die(
1950 'Unexpected token while parsing arithmetic: %r' %
1951 lexer.TokenVal(self.cur_token), self.cur_token)
1952
1953 elif self.token_kind == Kind.Eof:
1954 return self.cur_token
1955
1956 elif self.token_kind == Kind.Ignored:
1957 # Space should be ignored.
1958 self._SetNext(lex_mode_e.Arith)
1959 return None
1960
1961 elif self.token_kind in (Kind.Arith, Kind.Right):
1962 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1963 self._SetNext(lex_mode_e.Arith)
1964 return self.cur_token
1965
1966 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1967 return self._ReadCompoundWord(lex_mode_e.Arith)
1968
1969 else:
1970 raise AssertionError(self.cur_token)
1971
1972 def _ReadWord(self, word_mode):
1973 # type: (lex_mode_t) -> Optional[word_t]
1974 """Helper function for ReadWord()."""
1975
1976 # Change the pseudo lexer mode to a real lexer mode
1977 if word_mode == lex_mode_e.ShCommandFakeBrack:
1978 lex_mode = lex_mode_e.ShCommand
1979 else:
1980 lex_mode = word_mode
1981
1982 self._GetToken()
1983
1984 if self.token_kind == Kind.Eof:
1985 # No advance
1986 return self.cur_token
1987
1988 # Allow Arith for ) at end of for loop?
1989 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1990 self._SetNext(lex_mode)
1991
1992 # Newlines are complicated. See 3x2 matrix in the comment about
1993 # self.multiline and self.newline_state above.
1994 if self.token_type == Id.Op_Newline:
1995 if self.multiline:
1996 if self.newline_state > 1:
1997 # This points at a blank line, but at least it gives the line number
1998 p_die('Invalid blank line in multiline mode',
1999 self.cur_token)
2000 return None
2001
2002 if self.returned_newline: # skip
2003 return None
2004
2005 return self.cur_token
2006
2007 elif self.token_kind == Kind.Right:
2008 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2009 Id.Right_CasePat,
2010 Id.Right_ShArrayLiteral):
2011 raise AssertionError(self.cur_token)
2012
2013 self._SetNext(lex_mode)
2014 return self.cur_token
2015
2016 elif self.token_kind in (Kind.Ignored, Kind.WS):
2017 self._SetNext(lex_mode)
2018 return None
2019
2020 else:
2021 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2022 Kind.Left, Kind.KW, Kind.ControlFlow,
2023 Kind.BoolUnary, Kind.BoolBinary,
2024 Kind.ExtGlob,
2025 Kind.BashRegex), 'Unhandled token kind'
2026
2027 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2028 self.parse_opts.parse_bracket() and
2029 self.token_type == Id.Lit_LBracket):
2030 # Change [ from Kind.Lit -> Kind.Op
2031 # So CommandParser can treat
2032 # assert [42 === x]
2033 # like
2034 # json write (x)
2035 bracket_word = self.cur_token
2036 bracket_word.id = Id.Op_LBracket
2037
2038 self._SetNext(lex_mode)
2039 return bracket_word
2040
2041 # We're beginning a word. If we see Id.Lit_Pound, change to
2042 # lex_mode_e.Comment and read until end of line.
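# e.g. (illustrative) given 'echo hi  # greet', the '# greet' span comes
# back as Id.Ignored_Comment and we return None so ReadWord() retries.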
2043 if self.token_type == Id.Lit_Pound:
2044 self._SetNext(lex_mode_e.Comment)
2045 self._GetToken()
2046
2047 # NOTE: The # could be the last character in the file. It can't be
2048 # Eof_{RParen,Backtick} because #) and #` are comments.
2049 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2050 self.cur_token
2051
2052 # The next iteration will go into Kind.Ignored and set lex state to
2053 # lex_mode_e.ShCommand/etc.
2054 return None # tell ReadWord() to try again after comment
2055
2056 elif self.token_type == Id.Lit_TPound: ### doc comment
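# e.g. (illustrative) a '### summary' line; it's surfaced as a token
# only when EmitDocToken(True) was called.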
2057 self._SetNext(lex_mode_e.Comment)
2058 self._GetToken()
2059
2060 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2061 return self.cur_token
2062
2063 return None # tell ReadWord() to try again after comment
2064
2065 else:
2066 # r'' u'' b''
2067 if (self.token_type == Id.Lit_Chars and
2068 self.lexer.LookAheadOne(
2069 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2070
2071 # When shopt -s parse_ysh_string:
2072 # echo r'hi' is like echo 'hi'
2073 #
2074 # echo u'\u{3bc}' b'\yff' works
2075
2076 tok = self.cur_token
2077 if self.parse_opts.parse_ysh_string():
2078 if lexer.TokenEquals(tok, 'r'):
2079 left_id = Id.Left_RSingleQuote
2080 elif lexer.TokenEquals(tok, 'u'):
2081 left_id = Id.Left_USingleQuote
2082 elif lexer.TokenEquals(tok, 'b'):
2083 left_id = Id.Left_BSingleQuote
2084 else:
2085 left_id = Id.Undefined_Tok
2086
2087 if left_id != Id.Undefined_Tok:
2088 # skip the r, and then 'foo' will be read as normal
2089 self._SetNext(lex_mode_e.ShCommand)
2090
2091 self._GetToken()
2092 assert self.token_type == Id.Left_SingleQuote, self.token_type
2093
2094 # Read the word in a different lexer mode
2095 return self._ReadYshSingleQuoted(left_id)
2096
2097 return self._ReadCompoundWord(lex_mode)
2098
2099 def ParseVarRef(self):
2100 # type: () -> BracedVarSub
2101 """DYNAMIC parsing of what's inside ${!ref}
2102
2103 # Same as VarOf production
2104 VarRefExpr = VarOf EOF
2105 """
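# e.g. (illustrative) if ref='a[0]', then ${!ref} re-parses the STRING
# 'a[0]' at runtime, as if it were ${a[0]}.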
2106 self._SetNext(lex_mode_e.VSub_1)
2107
2108 self._GetToken()
2109 if self.token_kind != Kind.VSub:
2110 p_die('Expected var name', self.cur_token)
2111
2112 part = self._ParseVarOf()
2113 # NOTE: no ${ } means no part.left and part.right
2114 part.left = part.token # cheat to make test pass
2115 part.right = part.token
2116
2117 self._GetToken()
2118 if self.token_type != Id.Eof_Real:
2119 p_die('Expected end of var ref expression', self.cur_token)
2120 return part
2121
2122 def LookPastSpace(self):
2123 # type: () -> Id_t
2124 """Look ahead to the next token.
2125
2126 For the CommandParser to recognize
2127 array= (1 2 3)
2128 YSH for ( versus bash for ((
2129 YSH if ( versus if test
2130 YSH while ( versus while test
2131 YSH bare assignment 'grep =' versus 'grep foo'
2132 """
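# e.g. (illustrative) having read 'grep' and buffered a space token,
# peeking at '=' distinguishes the bare assignment 'grep =' from the
# command 'grep foo'.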
2133 assert self.token_type != Id.Undefined_Tok
2134 if self.cur_token.id == Id.WS_Space:
2135 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2136 else:
2137 id_ = self.cur_token.id
2138 return id_
2139
2140 def LookAheadFuncParens(self):
2141 # type: () -> bool
2142 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2143 assert self.token_type != Id.Undefined_Tok
2144
2145 # We have to handle 2 cases because we buffer a token
2146 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2147 return self.lexer.LookAheadFuncParens(1) # go back one char
2148
2149 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2150 return self.lexer.LookAheadFuncParens(0)
2151
2152 else:
2153 return False
2154
2155 def ReadWord(self, word_mode):
2156 # type: (lex_mode_t) -> word_t
2157 """Read the next word, using the given lexer mode.
2158
2159 This is a stateful wrapper for the stateless _ReadWord function.
2160 """
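# Note: _ReadWord() returns None for tokens that don't yield a word
# (comments, skipped whitespace, some newlines), hence the retry loop;
# buffered_word holds a word pushed back by the pgen2 integration.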
2161 assert word_mode in (lex_mode_e.ShCommand,
2162 lex_mode_e.ShCommandFakeBrack,
2163 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2164
2165 if self.buffered_word: # For integration with pgen2
2166 w = self.buffered_word
2167 self.buffered_word = None
2168 else:
2169 while True:
2170 w = self._ReadWord(word_mode)
2171 if w is not None:
2172 break
2173
2174 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2175 return w
2176
2177 def ReadArithWord(self):
2178 # type: () -> word_t
2179 while True:
2180 w = self._ReadArithWord()
2181 if w is not None:
2182 break
2183 return w
2184
2185 def ReadHereDocBody(self, parts):
2186 # type: (List[word_part_t]) -> None
2187 """
2188 A here doc is like a double quoted context, except " isn't special.
2189 """
2190 self._ReadLikeDQ(None, False, parts)
2191 # Returns nothing
2192
2193 def ReadForPlugin(self):
2194 # type: () -> CompoundWord
2195 """For $PS1, $PS4, etc.
2196
2197 This is just like reading a here doc line. "\n" is allowed, as
2198 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2199 """
2200 w = CompoundWord([])
2201 self._ReadLikeDQ(None, False, w.parts)
2202 return w
2203
2204 def EmitDocToken(self, b):
2205 # type: (bool) -> None
2206 self.emit_doc_token = b
2207
2208 def Multiline(self, b):
2209 # type: (bool) -> None
2210 self.multiline = b
2211
2212
2213if 0:
2214 import collections
2215 WORD_HIST = collections.Counter()
2216
2217# vim: sw=4