OILS / osh / word_parse.py

1# Copyright 2016 Andy Chu. All rights reserved.
2# Licensed under the Apache License, Version 2.0 (the "License");
3# you may not use this file except in compliance with the License.
4# You may obtain a copy of the License at
5#
6# http://www.apache.org/licenses/LICENSE-2.0
7"""
8word_parse.py - Parse the shell word language.
9
10Hairy example:
11
12 hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
13
14Substitutions can be nested, but which inner subs are allowed depends on the
15outer sub. Notes:
16
17lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
18 All subs and quotes are allowed:
19 $v ${v} $() `` $(()) '' "" $'' $"" <() >()
20
21lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
22 Var, Command, Arith, but no quotes.
23 $v ${v} $() `` $(())
24 No process substitution.
25
26lex_mode_e.Arith
27 Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
28 allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
29 need those for associative array indexing.
30
31lex_mode_e.VSub_ArgUnquoted
32 Like ShCommand, everything is allowed (even process substitutions), but we
33 stop at }, and space is SIGNIFICANT.
34
35 Example: ${a:- b }
36
37 ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
38 ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
39
40lex_mode_e.VSub_ArgDQ
41 In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
42 "${x:-"default"}".
43
44 In contrast, VSub_ArgUnquoted respects single quotes and process
45 substitution.
46
47 It's weird that double quotes are allowed. Space is also significant here,
48 e.g. "${x:-a "b"}".
49"""
50
51from _devbuild.gen import grammar_nt
52from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
53from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
54from _devbuild.gen.syntax_asdl import (
55 BoolParamBox,
56 Token,
57 SimpleVarSub,
58 loc,
59 source,
60 DoubleQuoted,
61 SingleQuoted,
62 BracedVarSub,
63 CommandSub,
64 ShArrayLiteral,
65 AssocPair,
66 bracket_op,
67 bracket_op_t,
68 suffix_op,
69 suffix_op_t,
70 rhs_word,
71 rhs_word_e,
72 rhs_word_t,
73 word_e,
74 word_t,
75 CompoundWord,
76 word_part,
77 word_part_t,
78 y_lhs_e,
79 arith_expr_t,
80 command,
81 expr,
82 expr_e,
83 expr_t,
84 pat_t,
85 ArgList,
86 Proc,
87 Func,
88 Subscript,
89 Attribute,
90 arith_expr,
91)
92from core import alloc
93from core.error import p_die
94from mycpp.mylib import log
95from core import pyutil
96from core import ui
97from frontend import consts
98from frontend import lexer
99from frontend import reader
100from osh import tdop
101from osh import arith_parse
102from osh import braces
103from osh import word_
104from osh import word_compile
105from mycpp.mylib import tagswitch
106
107from typing import List, Optional, Tuple, cast
108from typing import TYPE_CHECKING
109if TYPE_CHECKING:
110 from frontend.lexer import Lexer
111 from frontend.parse_lib import ParseContext
112 from frontend.reader import _Reader
113 from osh.cmd_parse import VarChecker
114
115unused1 = log
116unused2 = Id_str
117
118KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
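# e.g. in 'echo foo;' the word 'foo' is ended by ';' (Kind.Op); words are
# also ended by whitespace, EOF, or tokens like ')' (Kind.Right)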
119
120
121class WordEmitter(object):
122 """Common interface for [ and [["""
123
124 def __init__(self):
125 # type: () -> None
126 """Empty constructor for mycpp."""
127 pass
128
129 def ReadWord(self, lex_mode):
130 # type: (lex_mode_t) -> word_t
131 raise NotImplementedError()
132
133
134class WordParser(WordEmitter):
135
136 def __init__(self, parse_ctx, lexer, line_reader):
137 # type: (ParseContext, Lexer, _Reader) -> None
138 self.parse_ctx = parse_ctx
139 self.lexer = lexer
140 self.line_reader = line_reader
141 self.arena = line_reader.arena
142
143 self.parse_opts = parse_ctx.parse_opts
144 self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
145 self.parse_opts)
146 self.Reset()
147
148 def Init(self, lex_mode):
149 # type: (lex_mode_t) -> None
150 """Used to parse arithmetic, see ParseContext."""
151 self.next_lex_mode = lex_mode
152
153 def Reset(self):
154 # type: () -> None
155 """Called by interactive loop."""
156 # For _GetToken()
157 self.cur_token = None # type: Token
158 self.token_kind = Kind.Undefined
159 self.token_type = Id.Undefined_Tok
160
161 self.next_lex_mode = lex_mode_e.ShCommand
162
163 # Boolean mutated by CommandParser via word_.ctx_EmitDocToken. For ### doc
164 # comments
165 self.emit_doc_token = False
166 # Boolean mutated by CommandParser via word_.ctx_Multiline. '...' starts
167 # multiline mode.
168 self.multiline = False
169
170 # For detecting invalid \n\n in multiline mode. Counts what we got
171 # directly from the lexer.
172 self.newline_state = 0
173 # For consolidating \n\n -> \n for the CALLER. This simplifies the parsers
174 # that consume words.
175 self.returned_newline = False
176
177 # For integration with pgen2
178 self.buffered_word = None # type: word_t
179
180 def _GetToken(self):
181 # type: () -> None
182 """Call this when you need to make a decision based on any of:
183
184 self.token_type
185 self.token_kind
186 self.cur_token
187 """
188 if self.next_lex_mode == lex_mode_e.Undefined:
189 return # _SetNext() not called, so do nothing
190
191 is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
192 real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)
193
194 self.cur_token = self.lexer.Read(real_mode)
195
196 # MUTATE TOKEN for fake lexer mode.
197 # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
198 if (is_fake and self.cur_token.id
199 in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
200 self.cur_token.id = Id.Lit_Chars
201
202 self.token_type = self.cur_token.id
203 self.token_kind = consts.GetKind(self.token_type)
204
205 # number of consecutive newlines, ignoring whitespace
206 if self.token_type == Id.Op_Newline:
207 self.newline_state += 1
208 elif self.token_kind != Kind.WS:
209 self.newline_state = 0
210
211 self.parse_ctx.trail.AppendToken(self.cur_token) # For completion
212 self.next_lex_mode = lex_mode_e.Undefined
213
214 def _SetNext(self, lex_mode):
215 # type: (lex_mode_t) -> None
216 """Set the next lex state, but don't actually read a token.
217
218 We need this for proper interactive parsing.
219 """
220 self.next_lex_mode = lex_mode
221
222 def _ReadVarOpArg(self, arg_lex_mode):
223 # type: (lex_mode_t) -> rhs_word_t
224
225 # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
226 # valid, even when unquoted.
227 self._SetNext(arg_lex_mode)
228 self._GetToken()
229
230 w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
231 True) # empty_ok
232
233 # If the Compound has no parts, and we're in a double-quoted VarSub
234 # arg, and empty_ok, then return Empty. This is so it can evaluate to
235 # the empty string and not get elided.
236 #
237 # Examples:
238 # - "${s:-}", "${s/%pat/}"
239 # It's similar to LooksLikeShAssignment where we turn x= into x=''. And it
240 # has the same potential problem of not having Token location info.
241 #
242 # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
243 # return a Compound with no parts, which is explicitly checked with a
244 # custom error message.
245 if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
246 return rhs_word.Empty
247
248 return w
249
250 def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
251 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
252 """Return a CompoundWord.
253
254 Helper function for _ReadVarOpArg and used directly by
255 _ReadPatSubVarOp.
256 """
257 w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
258 #log('w %s', w)
259 tilde = word_.TildeDetect(w)
260 if tilde:
261 w = tilde
262 return w
263
264 def _ReadSliceVarOp(self):
265 # type: () -> suffix_op.Slice
266 """
267 Looking at the token after the first ':'
268
269 ArithExpr? (':' ArithExpr? )? '}'
270 """
271 self._NextNonSpace()
272
273 cur_id = self.token_type
274
275 if cur_id in (Id.Arith_RBrace, Id.Arith_Colon): # ${a:} or ${a::}
276 begin = arith_expr.EmptyZero # type: arith_expr_t
277 else:
278 begin = self.a_parser.Parse()
279 cur_id = self.a_parser.CurrentId() # advance
280
281 if cur_id == Id.Arith_RBrace: # ${a:1} or ${@:1}
282 no_length = None # type: Optional[arith_expr_t] # No length specified
283 return suffix_op.Slice(begin, no_length)
284
285 elif cur_id == Id.Arith_Colon: # ${a:1:} or ${@:1:}
286 self._NextNonSpace()
287
288 if self.token_type != Id.Arith_RBrace:
289 length = self._ReadArithExpr(Id.Arith_RBrace)
290 else:
291 # quirky bash behavior:
292 # ${a:1:} or ${a::} means length ZERO
293 # but ${a:1} or ${a:} means length N
294 if self.parse_opts.strict_parse_slice():
295 p_die(
296 "Explicit slice length required - zero or N (strict_parse_slice)",
297 self.cur_token)
298
299 length = arith_expr.EmptyZero
300
301 return suffix_op.Slice(begin, length)
302
303 else:
304 p_die("Expected : or } in slice", self.cur_token)
305
306 raise AssertionError() # for MyPy
307
308 def _ReadPatSubVarOp(self):
309 # type: () -> suffix_op.PatSub
310 """Looking at the first '/' after VarOf:
311
312 VarSub = ...
313 | VarOf '/' Match ( '/' WORD? )?
314 Match = '/' WORD # can't be empty
315 | '#' WORD? # may be empty
316 | '%' WORD?
317 """
318 slash_tok = self.cur_token # location info
319 replace_mode = Id.Undefined_Tok # bizarre syntax / # %
320
321 self._SetNext(lex_mode_e.VSub_ArgUnquoted) # advance past /
322
323 self._GetToken()
324 if self.token_type == Id.Right_DollarBrace:
325 pat = CompoundWord([])
326 return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
327 slash_tok)
328
329 if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
330 replace_mode = self.token_type
331 self._SetNext(lex_mode_e.VSub_ArgUnquoted)
332
333 # Bash quirk:
334 # echo ${x/#/replace} has an empty pattern
335 # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
336 empty_ok = replace_mode != Id.Lit_Slash
337 pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
338 empty_ok)
339 #log('pat 1 %r', pat)
340
341 if self.token_type == Id.Lit_Slash:
342 # read until }
343 replace = self._ReadVarOpArg(
344 lex_mode_e.VSub_ArgUnquoted) # type: rhs_word_t
345 #log('r 1 %r', replace)
346 else:
347 # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
348 replace = rhs_word.Empty
349
350 self._GetToken()
351 if self.token_type != Id.Right_DollarBrace:
352 # This happens on invalid code
353 p_die(
354 "Expected } after replacement string, got %s" %
355 ui.PrettyId(self.token_type), self.cur_token)
356
357 return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
358
359 def _ReadSubscript(self):
360 # type: () -> bracket_op_t
361 """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
362 # Lookahead to see if we get @ or *. Otherwise read a full arithmetic
363 # expression.
364 next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
365 if next_id in (Id.Lit_At, Id.Arith_Star):
366 op = bracket_op.WholeArray(next_id) # type: bracket_op_t
367
368 self._SetNext(lex_mode_e.Arith) # skip past [
369 self._GetToken()
370 self._SetNext(lex_mode_e.Arith) # skip past @
371 self._GetToken()
372 else:
373 self._SetNext(lex_mode_e.Arith) # skip past [
374 anode = self._ReadArithExpr(Id.Arith_RBracket)
375 op = bracket_op.ArrayIndex(anode)
376
377 if self.token_type != Id.Arith_RBracket: # Should be looking at ]
378 p_die('Expected ] to close subscript', self.cur_token)
379
380 self._SetNext(lex_mode_e.VSub_2) # skip past ]
381 self._GetToken() # Needed to be in the same spot as no subscript
382
383 return op
384
385 def _ParseVarOf(self):
386 # type: () -> BracedVarSub
387 """
388 VarOf = NAME Subscript?
389 | NUMBER # no subscript allowed, none of these are arrays
390 # ${@[1]} doesn't work, even though slicing does
391 | VarSymbol
392 """
393 self._GetToken()
394 name_token = self.cur_token
395 self._SetNext(lex_mode_e.VSub_2)
396
397 self._GetToken() # Check for []
398 if self.token_type == Id.VOp2_LBracket:
399 bracket_op = self._ReadSubscript()
400 else:
401 bracket_op = None
402
403 part = BracedVarSub.CreateNull()
404 part.token = name_token
405 part.var_name = lexer.TokenVal(name_token)
406 part.bracket_op = bracket_op
407 return part
408
409 def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
410 # type: (lex_mode_t, bool) -> BracedVarSub
411 """Start parsing at the op -- we already skipped past the name."""
412 part = self._ParseVarOf()
413
414 self._GetToken()
415 if self.token_type == Id.Right_DollarBrace:
416 return part # no ops
417
418 op_kind = self.token_kind
419
420 if op_kind == Kind.VTest:
421 tok = self.cur_token
422 arg_word = self._ReadVarOpArg(arg_lex_mode)
423 if self.token_type != Id.Right_DollarBrace:
424 p_die('Expected } to close ${', self.cur_token)
425
426 part.suffix_op = suffix_op.Unary(tok, arg_word)
427
428 elif op_kind == Kind.VOpYsh:
429 tok = self.cur_token
430 arg_word = self._ReadVarOpArg(arg_lex_mode)
431 if self.token_type != Id.Right_DollarBrace:
432 p_die('Expected } to close ${', self.cur_token)
433
434 UP_arg_word = arg_word
435 with tagswitch(arg_word) as case:
436 if case(rhs_word_e.Empty):
437 pass
438 elif case(rhs_word_e.Compound):
439 arg_word = cast(CompoundWord, UP_arg_word)
440 # This handles ${x|html} and ${x %.3f} now
441 # However I think ${x %.3f} should be statically parsed? It can enter
442 # the printf lexer modes.
443 ok, arg, quoted = word_.StaticEval(arg_word)
444 if not ok or quoted:
445 p_die('Expected a constant argument',
446 loc.Word(arg_word))
447
448 part.suffix_op = suffix_op.Static(tok, arg)
449
450 elif op_kind == Kind.VOp0:
451 part.suffix_op = self.cur_token # Nullary
452 self._SetNext(lex_mode_e.VSub_2) # Expecting }
453 self._GetToken()
454
455 elif op_kind == Kind.VOp1: # % %% # ## etc.
456 tok = self.cur_token
457 # Weird exception that all shells have: these operators take a glob
458 # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
459 arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
460 if self.token_type != Id.Right_DollarBrace:
461 p_die('Expected } to close ${', self.cur_token)
462
463 part.suffix_op = suffix_op.Unary(tok, arg_word)
464
465 elif op_kind == Kind.VOp2: # / : [ ]
466 if self.token_type == Id.VOp2_Slash:
467 patsub_op = self._ReadPatSubVarOp() # type: suffix_op_t
468 part.suffix_op = patsub_op
469
470 # Checked by the method above
471 assert self.token_type == Id.Right_DollarBrace, self.cur_token
472
473 elif self.token_type == Id.VOp2_Colon:
474 part.suffix_op = self._ReadSliceVarOp()
475 # NOTE: } in arithmetic mode.
476 if self.token_type != Id.Arith_RBrace:
477 # Token seems off; it doesn't point to X in ${a:1:2 X
478 p_die('Expected } to close ${', self.cur_token)
479
480 else:
481 # TODO: Does this ever happen?
482 p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)
483
484 elif op_kind == Kind.VOp3: # ${prefix@} etc.
485 if allow_query:
486 part.suffix_op = self.cur_token # Nullary
487 self._SetNext(lex_mode_e.VSub_2) # Expecting }
488 self._GetToken()
489 else:
490 p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)
491
492 # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
493 # mode. It's redundantly checked above.
494 if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
495 # ${a.} or ${!a.}
496 p_die('Expected } to close ${', self.cur_token)
497
498 # Now look for ops
499 return part
500
501 def _ReadZshVarSub(self, left_token):
502 # type: (Token) -> word_part.ZshVarSub
503
504 self._SetNext(lex_mode_e.VSub_Zsh) # Move past ${(foo)
505
506 # Can be empty
507 w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
508 True)
509 self._GetToken()
510 return word_part.ZshVarSub(left_token, w, self.cur_token)
511
512 def ReadBracedVarSub(self, left_token):
513 # type: (Token) -> Tuple[BracedVarSub, Token]
514 """ For YSH expressions like var x = ${x:-"default"}. """
515 part = self._ReadBracedVarSub(left_token, d_quoted=False)
516 last_token = self.cur_token
517 return part, last_token
518
519 def _ReadBracedVarSub(self, left_token, d_quoted):
520 # type: (Token, bool) -> BracedVarSub
521 """For the ${} expression language.
522
523 NAME = [a-zA-Z_][a-zA-Z0-9_]*
524 NUMBER = [0-9]+ # ${10}, ${11}, ...
525
526 Subscript = '[' ('@' | '*' | ArithExpr) ']'
527 VarSymbol = '!' | '@' | '#' | ...
528 VarOf = NAME Subscript?
529 | NUMBER # no subscript allowed, none of these are arrays
530 # ${@[1]} doesn't work, even though slicing does
531 | VarSymbol
532
533 NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a' # VOp0
534
535 TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
536 STRIP_OP = '#' | '##' | '%' | '%%'
537 CASE_OP = ',' | ',,' | '^' | '^^'
538 UnaryOp = TEST_OP | STRIP_OP | CASE_OP
539
540 YSH_UNARY = '|' | ' ' # ${x|html} and ${x %.3f}.
541 # SPACE is operator not %
542 Match = ('/' | '#' | '%') WORD # match all / prefix / suffix
543 VarExpr = VarOf
544 | VarOf NULLARY_OP
545 | VarOf UnaryOp WORD
546 | VarOf YSH_UNARY STATIC_WORD
547 | VarOf ':' ArithExpr (':' ArithExpr )?
548 | VarOf '/' Match '/' WORD
549
550 LengthExpr = '#' VarOf # can't apply operators after length
551
552 RefOrKeys = '!' VarExpr # CAN apply operators after a named ref
553 # ${!ref[0]} vs ${!keys[@]} resolved later
554
555 PrefixQuery = '!' NAME ('*' | '@') # list variable names with a prefix
556
557 BuiltinSub = '.' WORD+ # ${.myproc 'builtin' $sub}
558
559 VarSub = LengthExpr
560 | RefOrKeys
561 | PrefixQuery
562 | VarExpr
563 | BuiltinSub
564
565 NOTES:
566 - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
567 slicing ${a:x+1:y+2}
568 - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
569 - @ and * are technically arithmetic expressions in this implementation
570 - We don't account for bash 4.4: ${param@operator} -- Q E P A a. Note that
571 it's also vectorized.
572
573 Strictness over bash:
574 - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
575 grammar
576 - ! and # prefixes can't be composed, even though named refs can be
577 composed with other operators
578 - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
579 a prefix, and it can also be a literal part of WORD.
580
581 From the parser's point of view, the prefix # can't be combined with
582 UnaryOp/slicing/matching, and the ! can. However:
583
584 - ${a[@]:1:2} is not allowed
585 - ${#a[@]:1:2} is allowed, but gives the wrong answer
586 """
587 if d_quoted:
588 arg_lex_mode = lex_mode_e.VSub_ArgDQ
589 else:
590 arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
591
592 self._SetNext(lex_mode_e.VSub_1)
593 self._GetToken()
594
595 ty = self.token_type
596 first_tok = self.cur_token
597
598 if ty == Id.VSub_Pound:
599 # Disambiguate
600 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
601 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
602 # e.g. a name, '#' is the prefix
603 self._SetNext(lex_mode_e.VSub_1)
604 part = self._ParseVarOf()
605
606 self._GetToken()
607 if self.token_type != Id.Right_DollarBrace:
608 p_die('Expected } after length expression', self.cur_token)
609
610 part.prefix_op = first_tok
611
612 else: # not a prefix, '#' is the variable
613 part = self._ParseVarExpr(arg_lex_mode)
614
615 elif ty == Id.VSub_Bang:
616 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
617 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
618 # e.g. a name, '!' is the prefix
619 # ${!a} -- this is a ref
620 # ${!3} -- this is ref
621 # ${!a[1]} -- this is a ref
622 # ${!a[@]} -- this is a keys
623 # No lookahead -- do it in a second step, or at runtime
624 self._SetNext(lex_mode_e.VSub_1)
625 part = self._ParseVarExpr(arg_lex_mode, allow_query=True)
626
627 part.prefix_op = first_tok
628
629 else: # not a prefix, '!' is the variable
630 part = self._ParseVarExpr(arg_lex_mode)
631
632 elif ty == Id.VSub_Dot:
633 # Note: this will become a new builtin_sub type, so this method must
634 # return word_part_t rather than BracedVarSub. I don't think that
635 # should cause problems.
636 p_die('TODO: ${.myproc builtin sub}', self.cur_token)
637
638 # VS_NAME, VS_NUMBER, symbol that isn't # or !
639 elif self.token_kind == Kind.VSub:
640 part = self._ParseVarExpr(arg_lex_mode)
641
642 else:
643 # e.g. ${^}
644 p_die('Unexpected token in ${}', self.cur_token)
645
646 part.left = left_token # attach the argument
647 part.right = self.cur_token
648 return part
649
650 def _ReadSingleQuoted(self, left_token, lex_mode):
651 # type: (Token, lex_mode_t) -> SingleQuoted
652 """Internal method to read a word_part."""
653 tokens = [] # type: List[Token]
654 # In command mode, we never disallow backslashes like '\'
655 right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
656 False)
657 sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
658 node = SingleQuoted(left_token, sval, right_quote)
659 return node
660
661 def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
662 # type: (lex_mode_t, Token, List[Token], bool) -> Token
663 """Appends to out_tokens; returns last token
664
665 Used by expr_parse.py
666 """
667 # TODO: Remove and use out_tokens
668 tokens = [] # type: List[Token]
669
670 # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
671 no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
672
673 expected_end_tokens = 3 if left_token.id in (
674 Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
675 Id.Left_BTSingleQuote) else 1
676 num_end_tokens = 0
677
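# e.g. r'''...''' ends only at three closing quotes in a row; a lone '
# inside the body resets num_end_tokens below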
678 while num_end_tokens < expected_end_tokens:
679 self._SetNext(lex_mode)
680 self._GetToken()
681
682 # Kind.Char is emitted in lex_mode_e.SQ_C
683 if self.token_kind in (Kind.Lit, Kind.Char):
684 tok = self.cur_token
685 # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
686 # r'one\two' or c'one\\two'
687 if no_backslashes and lexer.TokenContains(tok, '\\'):
688 p_die(
689 r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
690 tok)
691
692 if is_ysh_expr:
693 # Disallow var x = $'\001'. Arguably we don't need these
694 # checks because u'\u{1}' is the way to write it.
695 if self.token_type == Id.Char_Octal3:
696 p_die(
697 r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
698 tok)
699
700 if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
701 # disallow \xH
702 p_die(
703 r'Invalid hex escape in YSH string (must be \xHH)',
704 tok)
705
706 tokens.append(tok)
707
708 elif self.token_kind == Kind.Unknown:
709 tok = self.cur_token
710 assert tok.id == Id.Unknown_Backslash, tok
711
712 # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
713 if is_ysh_expr or not self.parse_opts.parse_backslash():
714 p_die(
715 "Invalid char escape in C-style string literal (OILS-ERR-11)",
716 tok)
717
718 tokens.append(tok)
719
720 elif self.token_kind == Kind.Eof:
721 p_die('Unexpected EOF in single-quoted string that began here',
722 left_token)
723
724 elif self.token_kind == Kind.Right:
725 # assume Id.Right_SingleQuote
726 num_end_tokens += 1
727 tokens.append(self.cur_token)
728
729 else:
730 raise AssertionError(self.cur_token)
731
732 if self.token_kind != Kind.Right:
733 num_end_tokens = 0 # we need three in a ROW
734
735 if expected_end_tokens == 1:
736 tokens.pop()
737 elif expected_end_tokens == 3: # Get rid of spurious end tokens
738 tokens.pop()
739 tokens.pop()
740 tokens.pop()
741
742 # Remove space from ''' r''' u''' b''' in both expression mode and command mode
743 if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
744 Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
745 word_compile.RemoveLeadingSpaceSQ(tokens)
746
747 # Validation after lexing - same 2 checks in j8.LexerDecoder
748 is_u_string = left_token.id in (Id.Left_USingleQuote,
749 Id.Left_UTSingleQuote)
750
751 for tok in tokens:
752 # u'\yff' is not valid, but b'\yff' is
753 if is_u_string and tok.id == Id.Char_YHex:
754 p_die(
755 r"%s escapes not allowed in u'' strings" %
756 lexer.TokenVal(tok), tok)
757
758 out_tokens.extend(tokens)
759 return self.cur_token
760
761 def _ReadDoubleQuotedLeftParts(self):
762 # type: () -> word_part_t
763 """Read substitution parts in a double quoted context."""
764 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
765 return self._ReadCommandSub(self.token_type, d_quoted=True)
766
767 if self.token_type == Id.Left_DollarBrace:
768 return self._ReadBracedVarSub(self.cur_token, d_quoted=True)
769
770 if self.token_type == Id.Left_DollarDParen:
771 return self._ReadArithSub()
772
773 if self.token_type == Id.Left_DollarBracket:
774 return self._ReadExprSub(lex_mode_e.DQ)
775
776 raise AssertionError(self.cur_token)
777
778 def _ReadYshSingleQuoted(self, left_id):
779 # type: (Id_t) -> CompoundWord
780 """Read YSH style strings
781
782 r'' u'' b''
783 r''' ''' u''' ''' b''' '''
784 """
785 #log('BEF self.cur_token %s', self.cur_token)
786 if left_id == Id.Left_RSingleQuote:
787 lexer_mode = lex_mode_e.SQ_Raw
788 triple_left_id = Id.Left_RTSingleQuote
789 elif left_id == Id.Left_USingleQuote:
790 lexer_mode = lex_mode_e.J8_Str
791 triple_left_id = Id.Left_UTSingleQuote
792 elif left_id == Id.Left_BSingleQuote:
793 lexer_mode = lex_mode_e.J8_Str
794 triple_left_id = Id.Left_BTSingleQuote
795 else:
796 raise AssertionError(left_id)
797
798 # Needed for syntax checks
799 left_tok = self.cur_token
800 left_tok.id = left_id
801
802 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
803
804 if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
805 self._SetNext(lex_mode_e.ShCommand)
806 self._GetToken()
807
808 assert self.token_type == Id.Left_SingleQuote
809 # HACK: magically transform the third ' in u''' to
810 # Id.Left_UTSingleQuote, so that ''' is the terminator
811 left_tok = self.cur_token
812 left_tok.id = triple_left_id
813
814 # Handles stripping leading whitespace
815 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
816
817 # Advance and validate
818 self._SetNext(lex_mode_e.ShCommand)
819
820 self._GetToken()
821 if self.token_kind not in KINDS_THAT_END_WORDS:
822 p_die('Unexpected token after YSH single-quoted string',
823 self.cur_token)
824
825 return CompoundWord([sq_part])
826
827 def _ReadUnquotedLeftParts(self, triple_out):
828 # type: (Optional[BoolParamBox]) -> word_part_t
829 """Read substitutions and quoted strings (for lex_mode_e.ShCommand).
830
831 If triple_out is set, then we try parsing triple quoted strings,
832 and set its value to True if we got one.
833 """
834 if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
835 # Note: $"" is a synonym for "". It might make sense if it added
836 # \n \0 \x00 \u{123} etc. But that's not what bash does!
837 dq_part = self._ReadDoubleQuoted(self.cur_token)
838 # Got empty word "" and there's a " after
839 if (triple_out and len(dq_part.parts) == 0 and
840 self.lexer.ByteLookAhead() == '"'):
841
842 self._SetNext(lex_mode_e.ShCommand)
843 self._GetToken()
844 # HACK: magically transform the third " in """ to
845 # Id.Left_TDoubleQuote, so that """ is the terminator
846 left_dq_token = self.cur_token
847 left_dq_token.id = Id.Left_TDoubleQuote
848 triple_out.b = True # let caller know we got it
849 return self._ReadDoubleQuoted(left_dq_token)
850
851 return dq_part
852
853 if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
854 Id.Left_DollarSingleQuote):
855 if self.token_type == Id.Left_SingleQuote:
856 lexer_mode = lex_mode_e.SQ_Raw
857 triple_left_id = Id.Left_TSingleQuote
858 elif self.token_type == Id.Left_RSingleQuote:
859 lexer_mode = lex_mode_e.SQ_Raw
860 triple_left_id = Id.Left_RTSingleQuote
861 else:
862 lexer_mode = lex_mode_e.SQ_C
863 # there is no such thing as $'''
864 triple_left_id = Id.Undefined_Tok
865
866 sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
867
868 # Got empty '' or r'' and there's a ' after
869 # u'' and b'' are handled in _ReadYshSingleQuoted
870 if (triple_left_id != Id.Undefined_Tok and
871 triple_out is not None and len(sq_part.sval) == 0 and
872 self.lexer.ByteLookAhead() == "'"):
873
874 self._SetNext(lex_mode_e.ShCommand)
875 self._GetToken()
876
877 # HACK: magically transform the third ' in ''' to
878 # Id.Left_TSingleQuote, so that ''' is the terminator
879 left_sq_token = self.cur_token
880 left_sq_token.id = triple_left_id
881
882 triple_out.b = True # let caller know we got it
883 return self._ReadSingleQuoted(left_sq_token, lexer_mode)
884
885 return sq_part
886
887 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
888 Id.Left_ProcSubIn, Id.Left_ProcSubOut):
889 return self._ReadCommandSub(self.token_type, d_quoted=False)
890
891 if self.token_type == Id.Left_DollarBrace:
892 return self._ReadBracedVarSub(self.cur_token, d_quoted=False)
893
894 if self.token_type == Id.Left_DollarDParen:
895 return self._ReadArithSub()
896
897 if self.token_type == Id.Left_DollarBracket:
898 return self._ReadExprSub(lex_mode_e.ShCommand)
899
900 if self.token_type == Id.Left_DollarBraceZsh:
901 return self._ReadZshVarSub(self.cur_token)
902
903 raise AssertionError(self.cur_token)
904
905 def _ReadExtGlob(self):
906 # type: () -> word_part.ExtGlob
907 """
908 Grammar:
909 Item = CompoundWord | EPSILON # important: @(foo|) is allowed
910 LEFT = '@(' | '*(' | '+(' | '?(' | '!('
911 RIGHT = ')'
912 ExtGlob = LEFT (Item '|')* Item RIGHT # ITEM may be empty
913 Compound includes ExtGlob
914 """
915 left_token = self.cur_token
916 right_token = None # type: Token
917 arms = [] # type: List[CompoundWord]
918
919 self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
920 self._SetNext(lex_mode_e.ExtGlob) # advance past LEFT
921
922 read_word = False # did we just read a word? To handle @(||).
923
924 while True:
925 self._GetToken()
926
927 if self.token_type == Id.Right_ExtGlob:
928 if not read_word:
929 arms.append(CompoundWord([]))
930 right_token = self.cur_token
931 break
932
933 elif self.token_type == Id.Op_Pipe:
934 if not read_word:
935 arms.append(CompoundWord([]))
936 read_word = False
937 self._SetNext(lex_mode_e.ExtGlob)
938
939 # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
940 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
941 Kind.ExtGlob):
942 w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
943 arms.append(w)
944 read_word = True
945
946 elif self.token_kind == Kind.Eof:
947 p_die('Unexpected EOF reading extended glob that began here',
948 left_token)
949
950 else:
951 raise AssertionError(self.cur_token)
952
953 return word_part.ExtGlob(left_token, arms, right_token)
954
955 def _ReadBashRegexGroup(self):
956 # type: () -> word_part.BashRegexGroup
957 """
958 Grammar:
959 BashRegexGroup = '(' WORD? ')'
960 """
961 left_token = self.cur_token
962 assert left_token.id == Id.BashRegex_LParen, left_token
963
964 right_token = None # type: Token
965 arms = [] # type: List[CompoundWord]
966
967 self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
968 self._SetNext(lex_mode_e.BashRegexFakeInner) # advance past LEFT
969
970 self._GetToken()
971 if self.token_type == Id.Right_BashRegexGroup: # empty ()
972 return word_part.BashRegexGroup(left_token, None, self.cur_token)
973
974 # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
975 if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
976 # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
977 # To allow bash style [[ s =~ (a b) ]]
978 w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
979 arms.append(w)
980
981 self._GetToken()
982 if self.token_type != Id.Right_BashRegexGroup:
983 p_die('Expected ) to close bash regex group', self.cur_token)
984
985 return word_part.BashRegexGroup(left_token, w, self.cur_token)
986
987 p_die('Expected word after ( opening bash regex group', self.cur_token)
988
989 def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
990 # type: (Optional[Token], bool, List[word_part_t]) -> None
991 """
992 Args:
993 left_token: A token if we are reading a double quoted part, or None if
994 we're reading a here doc.
995 is_ysh_expr: Whether to disallow backticks and invalid char escapes
996 out_parts: list of word_part to append to
997 """
998 if left_token:
999 if left_token.id in (Id.Left_TDoubleQuote,
1000 Id.Left_DollarTDoubleQuote):
1001 expected_end_tokens = 3
1002 else:
1003 expected_end_tokens = 1
1004 else:
1005 expected_end_tokens = 1000 # here doc will break
1006
1007 num_end_tokens = 0
1008 while num_end_tokens < expected_end_tokens:
1009 self._SetNext(lex_mode_e.DQ)
1010 self._GetToken()
1011
1012 if self.token_kind == Kind.Lit:
1013 if self.token_type == Id.Lit_EscapedChar:
1014 tok = self.cur_token
1015 ch = lexer.TokenSliceLeft(tok, 1)
1016 part = word_part.EscapedLiteral(tok,
1017 ch) # type: word_part_t
1018 else:
1019 if self.token_type == Id.Lit_BadBackslash:
1020 # echo "\z" is OK in shell, but x = "\z" is a syntax error in
1021 # YSH.
1022 # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
1023 # recursion (unless parse_backslash)
1024 if (is_ysh_expr or
1025 not self.parse_opts.parse_backslash()):
1026 p_die(
1027 "Invalid char escape in double quoted string (OILS-ERR-12)",
1028 self.cur_token)
1029 elif self.token_type == Id.Lit_Dollar:
1030 if is_ysh_expr or not self.parse_opts.parse_dollar():
1031 p_die("Literal $ should be quoted like \$",
1032 self.cur_token)
1033
1034 part = self.cur_token
1035 out_parts.append(part)
1036
1037 elif self.token_kind == Kind.Left:
1038 if self.token_type == Id.Left_Backtick and is_ysh_expr:
1039 p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
1040 self.cur_token)
1041
1042 part = self._ReadDoubleQuotedLeftParts()
1043 out_parts.append(part)
1044
1045 elif self.token_kind == Kind.VSub:
1046 tok = self.cur_token
1047 part = SimpleVarSub(tok)
1048 out_parts.append(part)
1049 # NOTE: parsing "$f(x)" would BREAK CODE. Could add a mode for it
1050 # later.
1051
1052 elif self.token_kind == Kind.Right:
1053 assert self.token_type == Id.Right_DoubleQuote, self.token_type
1054 if left_token:
1055 num_end_tokens += 1
1056
1057 # In a here doc, the right quote is literal!
1058 out_parts.append(self.cur_token)
1059
1060 elif self.token_kind == Kind.Eof:
1061 if left_token:
1062 p_die(
1063 'Unexpected EOF reading double-quoted string that began here',
1064 left_token)
1065 else: # here docs will have an EOF in their token stream
1066 break
1067
1068 else:
1069 raise AssertionError(self.cur_token)
1070
1071 if self.token_kind != Kind.Right:
1072 num_end_tokens = 0 # """ must be CONSECUTIVE
1073
1074 if expected_end_tokens == 1:
1075 out_parts.pop()
1076 elif expected_end_tokens == 3:
1077 out_parts.pop()
1078 out_parts.pop()
1079 out_parts.pop()
1080
1081 # Remove space from """ in both expression mode and command mode
1082 if (left_token and left_token.id
1083 in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
1084 word_compile.RemoveLeadingSpaceDQ(out_parts)
1085
1086 # Return nothing, since we appended to 'out_parts'
1087
1088 def _ReadDoubleQuoted(self, left_token):
1089 # type: (Token) -> DoubleQuoted
1090 """Helper function for "hello $name".
1091
1092 Args:
1093 eof_type: for stopping at }, Id.Lit_RBrace
1094 here_doc: Whether we are reading in a here doc context
1095
1096 Also ${foo%%a b c} # treat this as double quoted. until you hit
1097 """
1098 parts = [] # type: List[word_part_t]
1099 self._ReadLikeDQ(left_token, False, parts)
1100
1101 right_quote = self.cur_token
1102 return DoubleQuoted(left_token, parts, right_quote)
1103
1104 def ReadDoubleQuoted(self, left_token, parts):
1105 # type: (Token, List[word_part_t]) -> Token
1106 """For expression mode.
1107
1108 Read var x = "${dir:-}/$name"; etc.
1109 """
1110 self._ReadLikeDQ(left_token, True, parts)
1111 return self.cur_token
1112
1113 def _ReadCommandSub(self, left_id, d_quoted=False):
1114 # type: (Id_t, bool) -> CommandSub
1115 """
1116 NOTE: This is not in the grammar, because word parts aren't in the grammar!
1117
1118 command_sub = '$(' command_list ')'
1119 | '@(' command_list ')'
1120 | '<(' command_list ')'
1121 | '>(' command_list ')'
1122 | ` command_list `
1123 """
1124 left_token = self.cur_token
1125
1126 # Set the lexer in a state so ) becomes the EOF token.
1127 if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
1128 Id.Left_ProcSubOut):
1129 self._SetNext(lex_mode_e.ShCommand) # advance past $( etc.
1130
1131 right_id = Id.Eof_RParen
1132 self.lexer.PushHint(Id.Op_RParen, right_id)
1133 c_parser = self.parse_ctx.MakeParserForCommandSub(
1134 self.line_reader, self.lexer, right_id)
1135 # NOTE: This doesn't use something like main_loop because we don't want
1136 # to interleave parsing and execution! Unlike 'source' and 'eval'.
1137 node = c_parser.ParseCommandSub()
1138
1139 right_token = c_parser.w_parser.cur_token
1140
1141 elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
1142 # NOTE: This is an APPROXIMATE solution for translation ONLY. See
1143 # test/osh2oil.
1144
1145 right_id = Id.Eof_Backtick
1146 self.lexer.PushHint(Id.Left_Backtick, right_id)
1147 c_parser = self.parse_ctx.MakeParserForCommandSub(
1148 self.line_reader, self.lexer, right_id)
1149 node = c_parser.ParseCommandSub()
1150 right_token = c_parser.w_parser.cur_token
1151
1152 elif left_id == Id.Left_Backtick:
1153 if not self.parse_opts.parse_backticks():
1154 p_die('Use $(cmd) instead of backticks (parse_backticks)',
1155 left_token)
1156
1157 self._SetNext(lex_mode_e.Backtick) # advance past `
1158
1159 parts = [] # type: List[str]
1160 while True:
1161 self._GetToken()
1162 #log("TOK %s", self.cur_token)
1163
1164 if self.token_type == Id.Backtick_Quoted:
1165 # Remove leading \
1166 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1167
1168 elif self.token_type == Id.Backtick_DoubleQuote:
1169 # Compatibility: If backticks are double quoted, then double quotes
1170 # within them have to be \"
1171 # Shells aren't smart enough to match nested " and ` quotes (but OSH
1172 # is)
1173 if d_quoted:
1174 # Remove leading \
1175 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1176 else:
1177 parts.append(lexer.TokenVal(self.cur_token))
1178
1179 elif self.token_type == Id.Backtick_Other:
1180 parts.append(lexer.TokenVal(self.cur_token))
1181
1182 elif self.token_type == Id.Backtick_Right:
1183 break
1184
1185 elif self.token_type == Id.Eof_Real:
1186 # Note: this parse error is in the ORIGINAL context. No code_str yet.
1187 p_die('Unexpected EOF while looking for closing backtick',
1188 left_token)
1189
1190 else:
1191 raise AssertionError(self.cur_token)
1192
1193 self._SetNext(lex_mode_e.Backtick)
1194
1195 # Calculate right SPID on CommandSub BEFORE re-parsing.
1196 right_token = self.cur_token
1197
1198 code_str = ''.join(parts)
1199 #log('code %r', code_str)
1200
1201 # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py. It
1202 # won't have the same location info as MakeParserForCommandSub(), because
1203 # the lexer is different.
1204 arena = self.parse_ctx.arena
1205 #arena = alloc.Arena()
1206 line_reader = reader.StringLineReader(code_str, arena)
1207 c_parser = self.parse_ctx.MakeOshParser(line_reader)
1208 src = source.Reparsed('backticks', left_token, right_token)
1209 with alloc.ctx_SourceCode(arena, src):
1210 node = c_parser.ParseCommandSub()
1211
1212 else:
1213 raise AssertionError(left_id)
1214
1215 return CommandSub(left_token, node, right_token)
1216
1217 def _ReadExprSub(self, lex_mode):
1218 # type: (lex_mode_t) -> word_part.ExprSub
1219 """$[d->key] $[obj.method()] etc."""
1220 left_token = self.cur_token
1221
1222 self._SetNext(lex_mode_e.Expr)
1223 enode, right_token = self.parse_ctx.ParseYshExpr(
1224 self.lexer, grammar_nt.ysh_expr_sub)
1225
1226 self._SetNext(lex_mode) # Move past ]
1227 return word_part.ExprSub(left_token, enode, right_token)
1228
1229 def ParseVarDecl(self, kw_token):
1230 # type: (Token) -> command.VarDecl
1231 """
1232 oil_var_decl: name_type_list '=' testlist end_stmt
1233
1234 Note that assignments must end with \n ; } or EOF. Unlike shell
1235 assignments, we disallow:
1236
1237 var x = 42 | wc -l
1238 var x = 42 && echo hi
1239 """
1240 self._SetNext(lex_mode_e.Expr)
1241 enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
1242 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1243 # wants
1244 if last_token.id == Id.Op_RBrace:
1245 last_token.id = Id.Lit_RBrace
1246
1247 # Let the CommandParser see the Op_Semi or Op_Newline.
1248 self.buffered_word = last_token
1249 self._SetNext(lex_mode_e.ShCommand) # always back to this
1250 return enode
1251
1252 def ParseMutation(self, kw_token, var_checker):
1253 # type: (Token, VarChecker) -> command.Mutation
1254 """
1255 setvar i = 42
1256 setvar i += 1
1257 setvar a[i] = 42
1258 setvar a[i] += 1
1259 setvar d.key = 42
1260 setvar d.key += 1
1261 """
1262 self._SetNext(lex_mode_e.Expr)
1263 enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
1264 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1265 # wants
1266 if last_token.id == Id.Op_RBrace:
1267 last_token.id = Id.Lit_RBrace
1268
1269 for lhs in enode.lhs:
1270 UP_lhs = lhs
1271 with tagswitch(lhs) as case:
1272 if case(y_lhs_e.Var):
1273 lhs = cast(Token, UP_lhs)
1274 var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
1275
1276 # Note: this does not cover cases like
1277 # setvar (a[0])[1] = v
1278 # setvar (d.key).other = v
1279 # This leaks into catching all typos statically, which may be
1280 # possible if 'use' makes all names explicit.
1281 elif case(y_lhs_e.Subscript):
1282 lhs = cast(Subscript, UP_lhs)
1283 if lhs.obj.tag() == expr_e.Var:
1284 v = cast(expr.Var, lhs.obj)
1285 var_checker.Check(kw_token.id, v.name, v.left)
1286
1287 elif case(y_lhs_e.Attribute):
1288 lhs = cast(Attribute, UP_lhs)
1289 if lhs.obj.tag() == expr_e.Var:
1290 v = cast(expr.Var, lhs.obj)
1291 var_checker.Check(kw_token.id, v.name, v.left)
1292
1293 # Let the CommandParser see the Op_Semi or Op_Newline.
1294 self.buffered_word = last_token
1295 self._SetNext(lex_mode_e.ShCommand) # always back to this
1296 return enode
1297
1298 def ParseBareDecl(self):
1299 # type: () -> expr_t
1300 """
1301 x = {name: val}
1302 """
1303 self._SetNext(lex_mode_e.Expr)
1304 self._GetToken()
1305 enode, last_token = self.parse_ctx.ParseYshExpr(
1306 self.lexer, grammar_nt.command_expr)
1307 if last_token.id == Id.Op_RBrace:
1308 last_token.id = Id.Lit_RBrace
1309 self.buffered_word = last_token
1310 self._SetNext(lex_mode_e.ShCommand)
1311 return enode
1312
1313 def ParseYshExprForCommand(self):
1314 # type: () -> expr_t
1315
1316 # Fudge for this case
1317 # for x in(y) {
1318 # versus
1319 # for x in (y) {
1320 #
1321 # In the former case, ReadWord on 'in' puts the lexer past (.
1322 # Also see LookPastSpace in CommandParser.
1323 # A simpler solution would be nicer.
1324
1325 if self.token_type == Id.Op_LParen:
1326 self.lexer.MaybeUnreadOne()
1327
1328 enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
1329
1330 self._SetNext(lex_mode_e.ShCommand)
1331 return enode
1332
1333 def ParseCommandExpr(self):
1334 # type: () -> expr_t
1335 """
1336 = 1+2
1337 """
1338 enode, last_token = self.parse_ctx.ParseYshExpr(
1339 self.lexer, grammar_nt.command_expr)
1340
1341 # In some cases, such as the case statement, we expect *the lexer* to be
1342 # pointing at the token right after the expression. But the expression
1343 # parser must have read to the `last_token`. Unreading places the lexer
1344 # back in the expected state. Ie:
1345 #
1346 # case (x) { case (x) {
1347 # (else) { = x } (else) { = x }
1348 # ^ The lexer is here ^ Unread to here
1349 # } }
1350 assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
1351 Id.Op_RBrace), last_token
1352 if last_token.id != Id.Eof_Real:
1353 # Eof_Real is the only token we cannot unread
1354 self.lexer.MaybeUnreadOne()
1355
1356 return enode
1357
1358 def ParseProc(self, node):
1359 # type: (Proc) -> None
1360
1361 # proc name-with-hyphens() must be accepted
1362 self._SetNext(lex_mode_e.ShCommand)
1363 self._GetToken()
1364 # example: 'proc f[' gets you Lit_ArrayLhsOpen
1365 if self.token_type != Id.Lit_Chars:
1366 p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
1367 self.cur_token)
1368
1369 # TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
1370 # for shell functions. Similar to IsValidVarName().
1371 node.name = self.cur_token
1372
1373 last_token = self.parse_ctx.ParseProc(self.lexer, node)
1374
1375 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1376 assert last_token.id == Id.Op_LBrace
1377 last_token.id = Id.Lit_LBrace
1378 self.buffered_word = last_token
1379
1380 self._SetNext(lex_mode_e.ShCommand)
1381
1382 def ParseFunc(self, node):
1383 # type: (Func) -> None
1384 last_token = self.parse_ctx.ParseFunc(self.lexer, node)
1385
1386 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1387 assert last_token.id == Id.Op_LBrace
1388 last_token.id = Id.Lit_LBrace
1389 self.buffered_word = last_token
1390
1391 self._SetNext(lex_mode_e.ShCommand)
1392
1393 def ParseYshCasePattern(self):
1394 # type: () -> Tuple[pat_t, Token]
1395 pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
1396 self.lexer)
1397
1398 if last_token.id == Id.Op_LBrace:
1399 last_token.id = Id.Lit_LBrace
1400 self.buffered_word = last_token
1401
1402 return pat, left_tok
1403
1404 def NewlineOkForYshCase(self):
1405 # type: () -> Id_t
1406 """Check for optional newline and consume it.
1407
1408 This is a special case of `_NewlineOk` which fixes some "off-by-one" issues
1409 that crop up while parsing YSH case arms. For more details, see
1410 #oil-dev > Progress On YSH Case Grammar on Zulip.
1411
1412 Returns a token id indicating which of these alternatives comes next:
1413
1414 word { echo word }
1415 (3) { echo expr }
1416 /e/ { echo eggex }
1417 } # right brace
1418 """
1419 while True:
1420 next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
1421
1422 # Cannot lookahead past lines
1423 if next_id == Id.Unknown_Tok:
1424 self.lexer.MoveToNextLine()
1425 continue
1426
1427 next_kind = consts.GetKind(next_id)
1428 if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
1429 break
1430
1431 self.lexer.Read(lex_mode_e.Expr)
1432
1433 if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
1434 self._SetNext(lex_mode_e.Expr) # Continue in expression mode
1435 else:
1436 # Consume the trailing Op_Newline
1437 self._SetNext(lex_mode_e.ShCommand)
1438 self._GetToken()
1439
1440 return next_id
1441
1442 def _ReadArithExpr(self, end_id):
1443 # type: (Id_t) -> arith_expr_t
1444 """Read and parse an arithmetic expression in various contexts.
1445
1446 $(( 1+2 ))
1447 (( a=1+2 ))
1448 ${a[ 1+2 ]}
1449 ${a : 1+2 : 1+2}
1450
1451 See tests/arith-context.test.sh for ambiguous cases.
1452
1453 ${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
1454
1455 ${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
1456
1457 See the assertion in ArithParser.Parse() -- unexpected extra input.
1458 """
1459 # calls self.ReadWord(lex_mode_e.Arith)
1460 anode = self.a_parser.Parse()
1461 cur_id = self.a_parser.CurrentId()
1462 if end_id != Id.Undefined_Tok and cur_id != end_id:
1463 p_die(
1464 'Unexpected token after arithmetic expression (%s != %s)' %
1465 (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
1466 loc.Word(self.a_parser.cur_word))
1467 return anode
1468
1469 def _ReadArithSub(self):
1470 # type: () -> word_part.ArithSub
1471 """Read an arith substitution, which contains an arith expression, e.g.
1472
1473 $((a + 1)).
1474 """
1475 left_tok = self.cur_token
1476
1477 # The second one needs to be disambiguated in stuff like:
1478 # $(echo $(( 1+2 )) )
1479 self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
1480
1481 # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
1482 # could save the lexer/reader state here, and retry if the arithmetic parse
1483 # fails. But we can almost always catch this at parse time. There could
1484 # be some exceptions like:
1485 # $((echo * foo)) # looks like multiplication
1486 # $((echo / foo)) # looks like division
1487
1488 # $(( )) is valid
1489 anode = arith_expr.EmptyZero # type: arith_expr_t
1490
1491 self._NextNonSpace()
1492 if self.token_type != Id.Arith_RParen:
1493 anode = self._ReadArithExpr(Id.Arith_RParen)
1494
1495 self._SetNext(lex_mode_e.ShCommand)
1496
1497 # Ensure we get closing )
1498 self._GetToken()
1499 if self.token_type != Id.Right_DollarDParen:
1500 p_die('Expected second ) to end arith sub', self.cur_token)
1501
1502 right_tok = self.cur_token
1503 return word_part.ArithSub(left_tok, anode, right_tok)
1504
1505 def ReadDParen(self):
1506 # type: () -> Tuple[arith_expr_t, Token]
1507 """Read ((1+ 2)) -- command context.
1508
1509 We're using the word parser because it's very similar to _ReadArithExpr
1510 above.
1511
1512 This also returns the terminating Id.Op_DRightParen token for location
1513 info.
1514 """
1515 # (( )) is valid
1516 anode = arith_expr.EmptyZero # type: arith_expr_t
1517
1518 self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
1519
1520 self._NextNonSpace()
1521 if self.token_type != Id.Arith_RParen:
1522 anode = self._ReadArithExpr(Id.Arith_RParen)
1523
1524 self._SetNext(lex_mode_e.ShCommand)
1525
1526 # Ensure we get the second )
1527 self._GetToken()
1528 right = self.cur_token
1529 if right.id != Id.Op_DRightParen:
1530 p_die('Expected second ) to end arith statement', right)
1531
1532 self._SetNext(lex_mode_e.ShCommand)
1533
1534 return anode, right
1535
1536 def _NextNonSpace(self):
1537 # type: () -> None
1538 """Advance in lex_mode_e.Arith until non-space token.
1539
1540 Same logic as _ReadWord, but used in
1541 $(( ))
1542 (( ))
1543 for (( ))
1544
1545 You can read self.token_type after this, without calling _GetToken.
1546 """
1547 while True:
1548 self._SetNext(lex_mode_e.Arith)
1549 self._GetToken()
1550 if self.token_kind not in (Kind.Ignored, Kind.WS):
1551 break
1552
1553 def ReadForExpression(self):
1554 # type: () -> command.ForExpr
1555 """Read ((i=0; i<5; ++i)) -- part of command context."""
1556 self._NextNonSpace() # skip over ((
1557 cur_id = self.token_type # for end of arith expressions
1558
1559 if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
1560 init_node = arith_expr.EmptyZero # type: arith_expr_t
1561 else:
1562 init_node = self.a_parser.Parse()
1563 cur_id = self.a_parser.CurrentId()
1564 self._NextNonSpace()
1565
1566 # It's odd to keep track of both cur_id and self.token_type in this
1567 # function, but it works, and is tested in 'test/parse_error.sh
1568 # arith-integration'
1569 if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
1570 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1571
1572 self._GetToken()
1573 cur_id = self.token_type
1574
1575 if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
1576 # empty condition is TRUE
1577 cond_node = arith_expr.EmptyOne # type: arith_expr_t
1578 else:
1579 cond_node = self.a_parser.Parse()
1580 cur_id = self.a_parser.CurrentId()
1581
1582 if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
1583 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1584
1585 self._NextNonSpace()
1586 if self.token_type == Id.Arith_RParen: # for (( ; ; ))
1587 update_node = arith_expr.EmptyZero # type: arith_expr_t
1588 else:
1589 update_node = self._ReadArithExpr(Id.Arith_RParen)
1590
1591 self._NextNonSpace()
1592 if self.token_type != Id.Arith_RParen:
1593 p_die('Expected ) to end for loop expression', self.cur_token)
1594 self._SetNext(lex_mode_e.ShCommand)
1595
1596 # redirects is None, will be assigned in CommandEvaluator
1597 node = command.ForExpr.CreateNull()
1598 node.init = init_node
1599 node.cond = cond_node
1600 node.update = update_node
1601 return node
1602
1603 def _ReadArrayLiteral(self):
1604 # type: () -> word_part_t
1605 """a=(1 2 3)
1606
1607 TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1608
1609 We want:
1610
1611 A=(['x']=1 ["x"]=2 [$x$y]=3)
1612
1613 Maybe allow this as a literal string? Because I think I've seen it before?
1614 Or maybe force people to patch their scripts, to learn the rule.
1615
1616 A=([x]=4)
1617
1618 Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1619 Maybe enforce that ALL have keys or NONE have keys.
1620 """
1621 self._SetNext(lex_mode_e.ShCommand) # advance past (
1622 self._GetToken()
1623 if self.cur_token.id != Id.Op_LParen:
1624 p_die('Expected ( after =', self.cur_token)
1625 left_token = self.cur_token
1626 right_token = None # type: Token
1627
1628 # MUST use a new word parser (with same lexer).
1629 w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1630 words = [] # type: List[CompoundWord]
1631 done = False
1632 while not done:
1633 w = w_parser.ReadWord(lex_mode_e.ShCommand)
1634 with tagswitch(w) as case:
1635 if case(word_e.Operator):
1636 tok = cast(Token, w)
1637 if tok.id == Id.Right_ShArrayLiteral:
1638 right_token = tok
1639 done = True # can't use break here
1640 # Unlike command parsing, array parsing allows embedded \n.
1641 elif tok.id == Id.Op_Newline:
1642 continue
1643 else:
1644 p_die('Unexpected token in array literal', loc.Word(w))
1645
1646 elif case(word_e.Compound):
1647 words.append(cast(CompoundWord, w))
1648
1649 else:
1650 raise AssertionError()
1651
1652 if len(words) == 0: # a=() is empty indexed array
1653 # Needed for type safety, doh
1654 no_words = [] # type: List[word_t]
1655 node = ShArrayLiteral(left_token, no_words, right_token)
1656 return node
1657
1658 pairs = [] # type: List[AssocPair]
1659 # If the first one is a key/value pair, then the rest are assumed to be.
1660 pair = word_.DetectAssocPair(words[0])
1661 if pair:
1662 pairs.append(pair)
1663
1664 n = len(words)
1665 for i in xrange(1, n):
1666 w2 = words[i]
1667 pair = word_.DetectAssocPair(w2)
1668 if not pair:
1669 p_die("Expected associative array pair", loc.Word(w2))
1670
1671 pairs.append(pair)
1672
1673 # invariant List?
1674 return word_part.BashAssocLiteral(left_token, pairs, right_token)
1675
1676 # Brace detection for arrays but NOT associative arrays
1677 words2 = braces.BraceDetectAll(words)
1678 words3 = word_.TildeDetectAll(words2)
1679 return ShArrayLiteral(left_token, words3, right_token)
1680
1681 def ParseProcCallArgs(self, start_symbol):
1682 # type: (int) -> ArgList
1683 """ json write (x) """
1684 self.lexer.MaybeUnreadOne()
1685
1686 arg_list = ArgList.CreateNull(alloc_lists=True)
1687 arg_list.left = self.cur_token
1688 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1689 return arg_list
1690
1691 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1692 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1693 """Helper for _ReadCompoundWord3."""
1694 done = False
1695
1696 if self.token_type == Id.Lit_EscapedChar:
1697 tok = self.cur_token
1698 assert tok.length == 2
1699 ch = lexer.TokenSliceLeft(tok, 1)
1700 if not self.parse_opts.parse_backslash():
1701 if not pyutil.IsValidCharEscape(ch):
1702 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1703 self.cur_token)
1704
1705 part = word_part.EscapedLiteral(self.cur_token,
1706 ch) # type: word_part_t
1707 else:
1708 part = self.cur_token
1709
1710 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1711 parts.append(part)
1712 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1713 # _ReadWord.
1714 next_id = self.lexer.LookPastSpace(lex_mode)
1715 if next_id == Id.Op_LParen:
1716 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1717 part2 = self._ReadArrayLiteral()
1718 parts.append(part2)
1719
1720 # Array literal must be the last part of the word.
1721 self._SetNext(lex_mode)
1722 self._GetToken()
1723 # EOF, whitespace, newline, Right_Subshell
1724 if self.token_kind not in KINDS_THAT_END_WORDS:
1725 p_die('Unexpected token after array literal',
1726 self.cur_token)
1727 done = True
1728
1729 elif (is_first and self.parse_opts.parse_at() and
1730 self.token_type == Id.Lit_Splice):
1731
1732 splice_tok = self.cur_token
1733 part2 = word_part.Splice(splice_tok,
1734 lexer.TokenSliceLeft(splice_tok, 1))
1735
1736 parts.append(part2)
1737
1738 # @words must be the last part of the word
1739 self._SetNext(lex_mode)
1740 self._GetToken()
1741 # EOF, whitespace, newline, Right_Subshell
1742 if self.token_kind not in KINDS_THAT_END_WORDS:
1743 p_die('Unexpected token after array splice', self.cur_token)
1744 done = True
1745
1746 elif (is_first and self.parse_opts.parse_at() and
1747 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1748 part2 = self._ReadExprSub(lex_mode_e.DQ)
1749 parts.append(part2)
1750
1751 # @[split(x)]
1752 self._SetNext(lex_mode)
1753 self._GetToken()
1754 # EOF, whitespace, newline, Right_Subshell
1755 if self.token_kind not in KINDS_THAT_END_WORDS:
1756 p_die('Unexpected token after Expr splice', self.cur_token)
1757 done = True
1758
1759 elif (is_first and self.parse_opts.parse_at() and
1760 self.token_type == Id.Lit_AtLBraceDot):
1761 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1762
1763 elif (is_first and self.parse_opts.parse_at_all() and
1764 self.token_type == Id.Lit_At):
1765 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1766 # at the beginning of a word to be reserved.
1767
# Although, should we relax 'echo @'?  A shortcut for @_argv is
# tempting.
1770 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1771 self.cur_token)
1772
1773 else:
1774 # not a literal with lookahead; append it
1775 parts.append(part)
1776
1777 return done
1778
1779 def _ReadCompoundWord(self, lex_mode):
1780 # type: (lex_mode_t) -> CompoundWord
1781 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1782
1783 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1784 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1785 """
1786 Precondition: Looking at the first token of the first word part
1787 Postcondition: Looking at the token after, e.g. space or operator
1788
1789 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1790 could be an operator delimiting a compound word. Can we change lexer modes
1791 and remove this special case?
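
Example (illustrative): parsing the single shell word pre$x"mid"post
yields a CompoundWord whose parts are roughly Lit_Chars 'pre',
SimpleVarSub $x, DoubleQuoted 'mid', Lit_Chars 'post'.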
1792 """
1793 w = CompoundWord([])
1794 num_parts = 0
1795 brace_count = 0
1796 done = False
1797 is_triple_quoted = None # type: Optional[BoolParamBox]
1798
1799 while not done:
1800 self._GetToken()
1801
1802 allow_done = empty_ok or num_parts != 0
1803 if allow_done and self.token_type == eof_type:
1804 done = True # e.g. for ${foo//pat/replace}
1805
1806 # Keywords like "for" are treated like literals
1807 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1808 Kind.ControlFlow, Kind.BoolUnary,
1809 Kind.BoolBinary):
1810
# Count { and } so unbalanced braces can be reported after the word is read
1812 if self.token_type == Id.Lit_LBrace:
1813 brace_count += 1
1814 elif self.token_type == Id.Lit_RBrace:
1815 brace_count -= 1
1816 elif self.token_type == Id.Lit_Dollar:
1817 if not self.parse_opts.parse_dollar():
1818 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1819 next_byte = self.lexer.ByteLookAhead()
1820 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1821 if next_byte == '/':
1822 #log('next_byte %r', next_byte)
1823 pass
1824
1825 p_die('Literal $ should be quoted like \$',
1826 self.cur_token)
1827
1828 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1829 w.parts)
1830
1831 elif self.token_kind == Kind.VSub:
1832 vsub_token = self.cur_token
1833
1834 part = SimpleVarSub(vsub_token) # type: word_part_t
1835 w.parts.append(part)
1836
1837 elif self.token_kind == Kind.ExtGlob:
1838 # If parse_at, we can take over @( to start @(seq 3)
# Users can also write ,(*.py|*.sh)
1840 if (self.parse_opts.parse_at() and
1841 self.token_type == Id.ExtGlob_At and num_parts == 0):
1842 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1843 d_quoted=False)
1844 # RARE mutation of tok.id!
1845 cs_part.left_token.id = Id.Left_AtParen
1846 part = cs_part # for type safety
1847
1848 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1849 # a=(one two)x and @arrayfunc(3)x.
1850 self._GetToken()
1851 if self.token_kind not in KINDS_THAT_END_WORDS:
1852 p_die('Unexpected token after @()', self.cur_token)
1853 done = True
1854
1855 else:
1856 part = self._ReadExtGlob()
1857 w.parts.append(part)
1858
1859 elif self.token_kind == Kind.BashRegex:
1860 if self.token_type == Id.BashRegex_LParen: # Opening (
1861 part = self._ReadBashRegexGroup()
1862 w.parts.append(part)
1863 else:
1864 assert self.token_type == Id.BashRegex_AllowedInParens
1865 p_die('Invalid token in bash regex', self.cur_token)
1866
1867 elif self.token_kind == Kind.Left:
1868 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1869 lex_mode == lex_mode_e.ShCommand and
1870 num_parts == 0)
1871
1872 # Save allocation
1873 if try_triple_quote:
1874 is_triple_quoted = BoolParamBox(False)
1875
1876 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1877 w.parts.append(part)
1878
1879 # NOT done yet, will advance below
1880 elif self.token_kind == Kind.Right:
1881 # Still part of the word; will be done on the next iter.
1882 if self.token_type == Id.Right_DoubleQuote:
1883 pass
1884 # Never happens, no PushHint for this case.
1885 #elif self.token_type == Id.Right_DollarParen:
1886 # pass
1887 elif self.token_type == Id.Right_Subshell:
1888 # LEXER HACK for (case x in x) ;; esac )
1889 # Rewind before it's used
1890 assert self.next_lex_mode == lex_mode_e.Undefined
1891 if self.lexer.MaybeUnreadOne():
1892 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1893 self._SetNext(lex_mode)
1894 done = True
1895 else:
1896 done = True
1897
1898 elif self.token_kind == Kind.Ignored:
1899 done = True
1900
1901 else:
1902 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1903 # so to test for ESAC, we can read ) before getting a chance to
1904 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1905 # token and do it again.
1906
1907 # We get Id.Op_RParen at top level: case x in x) ;; esac
1908 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1909 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1910 # Rewind before it's used
1911 assert self.next_lex_mode == lex_mode_e.Undefined
1912 if self.lexer.MaybeUnreadOne():
1913 if self.token_type == Id.Eof_RParen:
1914 # Redo translation
1915 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1916 self._SetNext(lex_mode)
1917
1918 done = True # anything we don't recognize means we're done
1919
1920 if not done:
1921 self._SetNext(lex_mode)
1922 num_parts += 1
1923
1924 if (self.parse_opts.parse_brace() and num_parts > 1 and
1925 brace_count != 0):
1926 # accept { and }, but not foo{
1927 p_die(
1928 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1929 loc.Word(w))
1930
1931 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1932 p_die('Unexpected parts after triple quoted string',
1933 loc.WordPart(w.parts[-1]))
1934
1935 if 0:
1936 from _devbuild.gen.syntax_asdl import word_part_str
1937 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1938 WORD_HIST[word_key] += 1
1939 return w
1940
1941 def _ReadArithWord(self):
1942 # type: () -> Optional[word_t]
1943 """ Helper for ReadArithWord() """
1944 self._GetToken()
1945
1946 if self.token_kind == Kind.Unknown:
1947 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1948 p_die(
1949 'Unexpected token while parsing arithmetic: %r' %
1950 lexer.TokenVal(self.cur_token), self.cur_token)
1951
1952 elif self.token_kind == Kind.Eof:
1953 return self.cur_token
1954
1955 elif self.token_kind == Kind.Ignored:
1956 # Space should be ignored.
1957 self._SetNext(lex_mode_e.Arith)
1958 return None
1959
1960 elif self.token_kind in (Kind.Arith, Kind.Right):
1961 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1962 self._SetNext(lex_mode_e.Arith)
1963 return self.cur_token
1964
1965 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1966 return self._ReadCompoundWord(lex_mode_e.Arith)
1967
1968 else:
1969 raise AssertionError(self.cur_token)
1970
1971 def _ReadWord(self, word_mode):
1972 # type: (lex_mode_t) -> Optional[word_t]
1973 """Helper function for ReadWord()."""
1974
1975 # Change the pseudo lexer mode to a real lexer mode
1976 if word_mode == lex_mode_e.ShCommandFakeBrack:
1977 lex_mode = lex_mode_e.ShCommand
1978 else:
1979 lex_mode = word_mode
1980
1981 self._GetToken()
1982
1983 if self.token_kind == Kind.Eof:
1984 # No advance
1985 return self.cur_token
1986
1987 # Allow Arith for ) at end of for loop?
1988 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1989 self._SetNext(lex_mode)
1990
1991 # Newlines are complicated. See 3x2 matrix in the comment about
1992 # self.multiline and self.newline_state above.
1993 if self.token_type == Id.Op_Newline:
1994 if self.multiline:
1995 if self.newline_state > 1:
1996 # This points at a blank line, but at least it gives the line number
1997 p_die('Invalid blank line in multiline mode',
1998 self.cur_token)
1999 return None
2000
2001 if self.returned_newline: # skip
2002 return None
2003
2004 return self.cur_token
2005
2006 elif self.token_kind == Kind.Right:
2007 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2008 Id.Right_CasePat,
2009 Id.Right_ShArrayLiteral):
2010 raise AssertionError(self.cur_token)
2011
2012 self._SetNext(lex_mode)
2013 return self.cur_token
2014
2015 elif self.token_kind in (Kind.Ignored, Kind.WS):
2016 self._SetNext(lex_mode)
2017 return None
2018
2019 else:
2020 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2021 Kind.Left, Kind.KW, Kind.ControlFlow,
2022 Kind.BoolUnary, Kind.BoolBinary,
2023 Kind.ExtGlob,
2024 Kind.BashRegex), 'Unhandled token kind'
2025
2026 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2027 self.parse_opts.parse_bracket() and
2028 self.token_type == Id.Lit_LBracket):
2029 # Change [ from Kind.Lit -> Kind.Op
2030 # So CommandParser can treat
2031 # assert [42 === x]
2032 # like
2033 # json write (x)
2034 bracket_word = self.cur_token
2035 bracket_word.id = Id.Op_LBracket
2036
2037 self._SetNext(lex_mode)
2038 return bracket_word
2039
2040 # We're beginning a word. If we see Id.Lit_Pound, change to
2041 # lex_mode_e.Comment and read until end of line.
2042 if self.token_type == Id.Lit_Pound:
2043 self._SetNext(lex_mode_e.Comment)
2044 self._GetToken()
2045
2046 # NOTE: The # could be the last character in the file. It can't be
2047 # Eof_{RParen,Backtick} because #) and #` are comments.
2048 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2049 self.cur_token
2050
2051 # The next iteration will go into Kind.Ignored and set lex state to
2052 # lex_mode_e.ShCommand/etc.
2053 return None # tell ReadWord() to try again after comment
2054
2055 elif self.token_type == Id.Lit_TPound: ### doc comment
2056 self._SetNext(lex_mode_e.Comment)
2057 self._GetToken()
2058
2059 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2060 return self.cur_token
2061
2062 return None # tell ReadWord() to try again after comment
2063
2064 else:
2065 # r'' u'' b''
2066 if (self.token_type == Id.Lit_Chars and
2067 self.lexer.LookAheadOne(
2068 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2069
2070 # When shopt -s parse_raw_string:
2071 # echo r'hi' is like echo 'hi'
2072 #
2073 # echo u'\u{3bc}' b'\yff' works
2074
2075 tok = self.cur_token
2076 if self.parse_opts.parse_ysh_string():
2077 if lexer.TokenEquals(tok, 'r'):
2078 left_id = Id.Left_RSingleQuote
2079 elif lexer.TokenEquals(tok, 'u'):
2080 left_id = Id.Left_USingleQuote
2081 elif lexer.TokenEquals(tok, 'b'):
2082 left_id = Id.Left_BSingleQuote
2083 else:
2084 left_id = Id.Undefined_Tok
2085
2086 if left_id != Id.Undefined_Tok:
2087 # skip the r, and then 'foo' will be read as normal
2088 self._SetNext(lex_mode_e.ShCommand)
2089
2090 self._GetToken()
2091 assert self.token_type == Id.Left_SingleQuote, self.token_type
2092
2093 # Read the word in a different lexer mode
2094 return self._ReadYshSingleQuoted(left_id)
2095
2096 return self._ReadCompoundWord(lex_mode)
2097
2098 def ParseVarRef(self):
2099 # type: () -> BracedVarSub
2100 """DYNAMIC parsing of what's inside ${!ref}
2101
2102 # Same as VarOf production
2103 VarRefExpr = VarOf EOF
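
Example (illustrative): if ref='a[0]', then ${!ref} causes this method
to parse the string a[0], as if ${a[0]} had been written.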
2104 """
2105 self._SetNext(lex_mode_e.VSub_1)
2106
2107 self._GetToken()
2108 if self.token_kind != Kind.VSub:
2109 p_die('Expected var name', self.cur_token)
2110
2111 part = self._ParseVarOf()
2112 # NOTE: no ${ } means no part.left and part.right
2113 part.left = part.token # cheat to make test pass
2114 part.right = part.token
2115
2116 self._GetToken()
2117 if self.token_type != Id.Eof_Real:
2118 p_die('Expected end of var ref expression', self.cur_token)
2119 return part
2120
2121 def LookPastSpace(self):
2122 # type: () -> Id_t
2123 """Look ahead to the next token.
2124
2125 For the CommandParser to recognize
2126 array= (1 2 3)
2127 YSH for ( versus bash for ((
2128 YSH if ( versus if test
2129 YSH while ( versus while test
2130 YSH bare assignment 'grep =' versus 'grep foo'
2131 """
2132 assert self.token_type != Id.Undefined_Tok
2133 if self.cur_token.id == Id.WS_Space:
2134 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2135 else:
2136 id_ = self.cur_token.id
2137 return id_
2138
2139 def LookAheadFuncParens(self):
2140 # type: () -> bool
2141 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2142 assert self.token_type != Id.Undefined_Tok
2143
2144 # We have to handle 2 cases because we buffer a token
2145 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2146 return self.lexer.LookAheadFuncParens(1) # go back one char
2147
2148 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2149 return self.lexer.LookAheadFuncParens(0)
2150
2151 else:
2152 return False
2153
2154 def ReadWord(self, word_mode):
2155 # type: (lex_mode_t) -> word_t
2156 """Read the next word, using the given lexer mode.
2157
2158 This is a stateful wrapper for the stateless _ReadWord function.
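
Usage sketch (illustrative only, not how Oils wires it up; assumes
parse_ctx, lx, and line_reader were constructed elsewhere):

    w_parser = WordParser(parse_ctx, lx, line_reader)
    while True:
        w = w_parser.ReadWord(lex_mode_e.ShCommand)
        if word_.CommandId(w) == Id.Eof_Real:
            break
        # ... consume the word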
2159 """
2160 assert word_mode in (lex_mode_e.ShCommand,
2161 lex_mode_e.ShCommandFakeBrack,
2162 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2163
2164 if self.buffered_word: # For integration with pgen2
2165 w = self.buffered_word
2166 self.buffered_word = None
2167 else:
2168 while True:
2169 w = self._ReadWord(word_mode)
2170 if w is not None:
2171 break
2172
2173 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2174 return w
2175
2176 def ReadArithWord(self):
2177 # type: () -> word_t
2178 while True:
2179 w = self._ReadArithWord()
2180 if w is not None:
2181 break
2182 return w
2183
2184 def ReadHereDocBody(self, parts):
2185 # type: (List[word_part_t]) -> None
2186 """
2187 A here doc is like a double quoted context, except " isn't special.
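
Example (illustrative): in the body of cat <<EOF ... EOF, substitutions
like $x and $(date) still work, but a bare " stays literal.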
2188 """
2189 self._ReadLikeDQ(None, False, parts)
2190 # Returns nothing
2191
2192 def ReadForPlugin(self):
2193 # type: () -> CompoundWord
2194 """For $PS1, $PS4, etc.
2195
2196 This is just like reading a here doc line. "\n" is allowed, as
2197 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2198 """
2199 w = CompoundWord([])
2200 self._ReadLikeDQ(None, False, w.parts)
2201 return w
2202
2203 def EmitDocToken(self, b):
2204 # type: (bool) -> None
2205 self.emit_doc_token = b
2206
2207 def Multiline(self, b):
2208 # type: (bool) -> None
2209 self.multiline = b
2210
2211
2212if 0:
2213 import collections
2214 WORD_HIST = collections.Counter()
2215
2216# vim: sw=4