osh/word_parse.py

OILS / osh / word_parse.py View on Github | oilshell.org

2212 lines, 1179 significant

1	# Copyright 2016 Andy Chu. All rights reserved.
2	# Licensed under the Apache License, Version 2.0 (the "License");
3	# you may not use this file except in compliance with the License.
4	# You may obtain a copy of the License at
5	#
6	# http://www.apache.org/licenses/LICENSE-2.0
7	"""
8	word_parse.py - Parse the shell word language.
9
10	Hairy example:
11
12	hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
13
14	Substitutions can be nested, but which inner subs are allowed depends on the
15	outer sub. Notes:
16
17	lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
18	All subs and quotes are allowed:
19	$v ${v} $() `` $(()) '' "" $'' $"" <() >()
20
21	lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
22	Var, Command, Arith, but no quotes.
23	$v ${v} $() `` $(())
24	No process substitution.
25
26	lex_mode_e.Arith
27	Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
28	allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
29	need those for associative array indexing.
30
31	lex_mode_e.VSub_ArgUnquoted
32	Like ShCommand, everything is allowed (even process substitutions), but we
33	stop at }, and space is SIGNIFICANT.
34
35	Example: ${a:- b }
36
37	${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
38	${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
39
40	lex_mode_e.VSub_ArgDQ
41	In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
42	"${x:-"default"}".
43
44	In contrast, VSub_ArgUnquoted respects single quotes and process
45	substitution.
46
47	It's weird that double quotes are allowed. Space is also significant here,
48	e.g. "${x:-a "b"}".
49	"""
50
51	from _devbuild.gen import grammar_nt
52	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
53	from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
54	from _devbuild.gen.syntax_asdl import (
55	BoolParamBox,
56	Token,
57	SimpleVarSub,
58	loc,
59	source,
60	DoubleQuoted,
61	SingleQuoted,
62	BracedVarSub,
63	CommandSub,
64	ShArrayLiteral,
65	AssocPair,
66	bracket_op,
67	bracket_op_t,
68	suffix_op,
69	suffix_op_t,
70	rhs_word,
71	rhs_word_e,
72	rhs_word_t,
73	word_e,
74	word_t,
75	CompoundWord,
76	word_part,
77	word_part_t,
78	y_lhs_e,
79	arith_expr_t,
80	command,
81	expr,
82	expr_e,
83	expr_t,
84	pat_t,
85	ArgList,
86	Proc,
87	Func,
88	Subscript,
89	Attribute,
90	arith_expr,
91	)
92	from core import alloc
93	from core.error import p_die
94	from mycpp.mylib import log
95	from core import pyutil
96	from core import ui
97	from frontend import consts
98	from frontend import lexer
99	from frontend import reader
100	from osh import tdop
101	from osh import arith_parse
102	from osh import braces
103	from osh import word_
104	from osh import word_compile
105	from mycpp.mylib import tagswitch
106
107	from typing import List, Optional, Tuple, cast
108	from typing import TYPE_CHECKING
109	if TYPE_CHECKING:
110	from frontend.lexer import Lexer
111	from frontend.parse_lib import ParseContext
112	from frontend.reader import _Reader
113	from osh.cmd_parse import VarChecker
114
115	unused1 = log
116	unused2 = Id_str
117
118	KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
119
120
121	class WordEmitter(object):
122	"""Common interface for [ and [["""
123
124	def __init__(self):
125	# type: () -> None
126	"""Empty constructor for mycpp."""
127	pass
128
129	def ReadWord(self, lex_mode):
130	# type: (lex_mode_t) -> word_t
131	raise NotImplementedError()
132
133
134	class WordParser(WordEmitter):
135
136	def __init__(self, parse_ctx, lexer, line_reader):
137	# type: (ParseContext, Lexer, _Reader) -> None
138	self.parse_ctx = parse_ctx
139	self.lexer = lexer
140	self.line_reader = line_reader
141	self.arena = line_reader.arena
142
143	self.parse_opts = parse_ctx.parse_opts
144	self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
145	self.parse_opts)
146	self.Reset()
147
148	def Init(self, lex_mode):
149	# type: (lex_mode_t) -> None
150	"""Used to parse arithmetic, see ParseContext."""
151	self.next_lex_mode = lex_mode
152
153	def Reset(self):
154	# type: () -> None
155	"""Called by interactive loop."""
156	# For _GetToken()
157	self.cur_token = None # type: Token
158	self.token_kind = Kind.Undefined
159	self.token_type = Id.Undefined_Tok
160
161	self.next_lex_mode = lex_mode_e.ShCommand
162
163	# Boolean mutated by CommandParser via word_.ctx_EmitDocToken. For ### doc
164	# comments
165	self.emit_doc_token = False
166	# Boolean mutated by CommandParser via word_.ctx_Multiline. '...' starts
167	# multiline mode.
168	self.multiline = False
169
170	# For detecting invalid \n\n in multiline mode. Counts what we got
171	# directly from the lexer.
172	self.newline_state = 0
173	# For consolidating \n\n -> \n for the CALLER. This simplifies the parsers
174	# that consume words.
175	self.returned_newline = False
176
177	# For integration with pgen2
178	self.buffered_word = None # type: word_t
179
180	def _GetToken(self):
181	# type: () -> None
182	"""Call this when you need to make a decision based on any of:
183
184	self.token_type
185	self.token_kind
186	self.cur_token
187	"""
188	if self.next_lex_mode == lex_mode_e.Undefined:
189	return # _SetNext() not called, so do nothing
190
191	is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
192	real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)
193
194	self.cur_token = self.lexer.Read(real_mode)
195
196	# MUTATE TOKEN for fake lexer mode.
197	# This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
198	if (is_fake and self.cur_token.id
199	in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
200	self.cur_token.id = Id.Lit_Chars
201
202	self.token_type = self.cur_token.id
203	self.token_kind = consts.GetKind(self.token_type)
204
205	# number of consecutive newlines, ignoring whitespace
206	if self.token_type == Id.Op_Newline:
207	self.newline_state += 1
208	elif self.token_kind != Kind.WS:
209	self.newline_state = 0
210
211	self.parse_ctx.trail.AppendToken(self.cur_token) # For completion
212	self.next_lex_mode = lex_mode_e.Undefined
213
214	def _SetNext(self, lex_mode):
215	# type: (lex_mode_t) -> None
216	"""Set the next lex state, but don't actually read a token.
217
218	We need this for proper interactive parsing.
219	"""
220	self.next_lex_mode = lex_mode
221
222	def _ReadVarOpArg(self, arg_lex_mode):
223	# type: (lex_mode_t) -> rhs_word_t
224
225	# NOTE: Operators like \| and < are not treated as special, so ${a:- \| >} is
226	# valid, even when unquoted.
227	self._SetNext(arg_lex_mode)
228	self._GetToken()
229
230	w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
231	True) # empty_ok
232
233	# If the Compound has no parts, and we're in a double-quoted VarSub
234	# arg, and empty_ok, then return Empty. This is so it can evaluate to
235	# the empty string and not get elided.
236	#
237	# Examples:
238	# - "${s:-}", "${s/%pat/}"
239	# It's similar to LooksLikeShAssignment where we turn x= into x=''. And it
240	# has the same potential problem of not having Token location info.
241	#
242	# NOTE: empty_ok is False only for the PatSub pattern, which means we'll
243	# return a Compound with no parts, which is explicitly checked with a
244	# custom error message.
245	if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
246	return rhs_word.Empty
247
248	return w
249
250	def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
251	# type: (lex_mode_t, Id_t, bool) -> CompoundWord
252	"""Return a CompoundWord.
253
254	Helper function for _ReadVarOpArg and used directly by
255	_ReadPatSubVarOp.
256	"""
257	w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
258	#log('w %s', w)
259	tilde = word_.TildeDetect(w)
260	if tilde:
261	w = tilde
262	return w
263
264	def _ReadSliceVarOp(self):
265	# type: () -> suffix_op.Slice
266	"""
267	Looking token after first ':'
268
269	ArithExpr? (':' ArithExpr? )? '}'
270	"""
271	self._NextNonSpace()
272
273	cur_id = self.token_type
274
275	if cur_id in (Id.Arith_RBrace, Id.Arith_Colon): # ${a:} or ${a::}
276	begin = arith_expr.EmptyZero # type: arith_expr_t
277	else:
278	begin = self.a_parser.Parse()
279	cur_id = self.a_parser.CurrentId() # advance
280
281	if cur_id == Id.Arith_RBrace: # ${a:1} or ${@:1}
282	no_length = None # type: Optional[arith_expr_t] # No length specified
283	return suffix_op.Slice(begin, no_length)
284
285	elif cur_id == Id.Arith_Colon: # ${a:1:} or ${@:1:}
286	self._SetNext(lex_mode_e.Arith)
287	self._GetToken()
288
289	if self.token_type != Id.Arith_RBrace:
290	length = self._ReadArithExpr(Id.Arith_RBrace)
291	else:
292	# quirky bash behavior:
293	# ${a:1:} or ${a::} means length ZERO
294	# but ${a:1} or ${a:} means length N
295	length = arith_expr.EmptyZero
296
297	return suffix_op.Slice(begin, length)
298
299	else:
300	p_die("Expected : or } in slice", self.cur_token)
301
302	raise AssertionError() # for MyPy
303
304	def _ReadPatSubVarOp(self):
305	# type: () -> suffix_op.PatSub
306	"""Looking at the first '/' after VarOf:
307
308	VarSub = ...
309	\| VarOf '/' Match ( '/' WORD? )?
310	Match = '/' WORD # can't be empty
311	\| '#' WORD? # may be empty
312	\| '%' WORD?
313	"""
314	slash_tok = self.cur_token # location info
315	replace_mode = Id.Undefined_Tok # bizarre syntax / # %
316
317	self._SetNext(lex_mode_e.VSub_ArgUnquoted) # advance past /
318
319	self._GetToken()
320	if self.token_type == Id.Right_DollarBrace:
321	pat = CompoundWord([])
322	return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
323	slash_tok)
324
325	if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
326	replace_mode = self.token_type
327	self._SetNext(lex_mode_e.VSub_ArgUnquoted)
328
329	# Bash quirk:
330	# echo ${x/#/replace} has an empty pattern
331	# echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
332	empty_ok = replace_mode != Id.Lit_Slash
333	pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
334	empty_ok)
335	#log('pat 1 %r', pat)
336
337	if self.token_type == Id.Lit_Slash:
338	# read until }
339	replace = self._ReadVarOpArg(
340	lex_mode_e.VSub_ArgUnquoted) # type: rhs_word_t
341	#log('r 1 %r', replace)
342	else:
343	# e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
344	replace = rhs_word.Empty
345
346	self._GetToken()
347	if self.token_type != Id.Right_DollarBrace:
348	# This happens on invalid code
349	p_die(
350	"Expected } after replacement string, got %s" %
351	ui.PrettyId(self.token_type), self.cur_token)
352
353	return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
354
355	def _ReadSubscript(self):
356	# type: () -> bracket_op_t
357	""" Subscript = '[' ('@' \| '*' \| ArithExpr) ']' """
358	# Lookahead to see if we get @ or *. Otherwise read a full arithmetic
359	# expression.
360	next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
361	if next_id in (Id.Lit_At, Id.Arith_Star):
362	op = bracket_op.WholeArray(next_id) # type: bracket_op_t
363
364	self._SetNext(lex_mode_e.Arith) # skip past [
365	self._GetToken()
366	self._SetNext(lex_mode_e.Arith) # skip past @
367	self._GetToken()
368	else:
369	self._SetNext(lex_mode_e.Arith) # skip past [
370	anode = self._ReadArithExpr(Id.Arith_RBracket)
371	op = bracket_op.ArrayIndex(anode)
372
373	if self.token_type != Id.Arith_RBracket: # Should be looking at ]
374	p_die('Expected ] to close subscript', self.cur_token)
375
376	self._SetNext(lex_mode_e.VSub_2) # skip past ]
377	self._GetToken() # Needed to be in the same spot as no subscript
378
379	return op
380
381	def _ParseVarOf(self):
382	# type: () -> BracedVarSub
383	"""
384	VarOf = NAME Subscript?
385	\| NUMBER # no subscript allowed, none of these are arrays
386	# ${@[1]} doesn't work, even though slicing does
387	\| VarSymbol
388	"""
389	self._GetToken()
390	name_token = self.cur_token
391	self._SetNext(lex_mode_e.VSub_2)
392
393	self._GetToken() # Check for []
394	if self.token_type == Id.VOp2_LBracket:
395	bracket_op = self._ReadSubscript()
396	else:
397	bracket_op = None
398
399	part = BracedVarSub.CreateNull()
400	part.token = name_token
401	part.var_name = lexer.TokenVal(name_token)
402	part.bracket_op = bracket_op
403	return part
404
405	def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
406	# type: (lex_mode_t, bool) -> BracedVarSub
407	"""Start parsing at the op -- we already skipped past the name."""
408	part = self._ParseVarOf()
409
410	self._GetToken()
411	if self.token_type == Id.Right_DollarBrace:
412	return part # no ops
413
414	op_kind = self.token_kind
415
416	if op_kind == Kind.VTest:
417	tok = self.cur_token
418	arg_word = self._ReadVarOpArg(arg_lex_mode)
419	if self.token_type != Id.Right_DollarBrace:
420	p_die('Expected } to close ${', self.cur_token)
421
422	part.suffix_op = suffix_op.Unary(tok, arg_word)
423
424	elif op_kind == Kind.VOpYsh:
425	tok = self.cur_token
426	arg_word = self._ReadVarOpArg(arg_lex_mode)
427	if self.token_type != Id.Right_DollarBrace:
428	p_die('Expected } to close ${', self.cur_token)
429
430	UP_arg_word = arg_word
431	with tagswitch(arg_word) as case:
432	if case(rhs_word_e.Empty):
433	pass
434	elif case(rhs_word_e.Compound):
435	arg_word = cast(CompoundWord, UP_arg_word)
436	# This handles ${x\|html} and ${x %.3f} now
437	# However I think ${x %.3f} should be statically parsed? It can enter
438	# the printf lexer modes.
439	ok, arg, quoted = word_.StaticEval(arg_word)
440	if not ok or quoted:
441	p_die('Expected a constant argument',
442	loc.Word(arg_word))
443
444	part.suffix_op = suffix_op.Static(tok, arg)
445
446	elif op_kind == Kind.VOp0:
447	part.suffix_op = self.cur_token # Nullary
448	self._SetNext(lex_mode_e.VSub_2) # Expecting }
449	self._GetToken()
450
451	elif op_kind == Kind.VOp1: # % %% # ## etc.
452	tok = self.cur_token
453	# Weird exception that all shells have: these operators take a glob
454	# pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
455	arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
456	if self.token_type != Id.Right_DollarBrace:
457	p_die('Expected } to close ${', self.cur_token)
458
459	part.suffix_op = suffix_op.Unary(tok, arg_word)
460
461	elif op_kind == Kind.VOp2: # / : [ ]
462	if self.token_type == Id.VOp2_Slash:
463	patsub_op = self._ReadPatSubVarOp() # type: suffix_op_t
464	part.suffix_op = patsub_op
465
466	# Checked by the method above
467	assert self.token_type == Id.Right_DollarBrace, self.cur_token
468
469	elif self.token_type == Id.VOp2_Colon:
470	part.suffix_op = self._ReadSliceVarOp()
471	# NOTE: } in arithmetic mode.
472	if self.token_type != Id.Arith_RBrace:
473	# Token seems off; doesn't point to X in # ${a:1:2 X
474	p_die('Expected } to close ${', self.cur_token)
475
476	else:
477	# TODO: Does this ever happen?
478	p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)
479
480	elif op_kind == Kind.VOp3: # ${prefix@} etc.
481	if allow_query:
482	part.suffix_op = self.cur_token # Nullary
483	self._SetNext(lex_mode_e.VSub_2) # Expecting }
484	self._GetToken()
485	else:
486	p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)
487
488	# NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
489	# mode. It's redundantly checked above.
490	if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
491	# ${a.} or ${!a.}
492	p_die('Expected } to close ${', self.cur_token)
493
494	# Now look for ops
495	return part
496
497	def _ReadZshVarSub(self, left_token):
498	# type: (Token) -> word_part.ZshVarSub
499
500	self._SetNext(lex_mode_e.VSub_Zsh) # Move past ${(foo)
501
502	# Can be empty
503	w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
504	True)
505	self._GetToken()
506	return word_part.ZshVarSub(left_token, w, self.cur_token)
507
508	def ReadBracedVarSub(self, left_token):
509	# type: (Token) -> Tuple[BracedVarSub, Token]
510	""" For YSH expressions like var x = ${x:-"default"}. """
511	part = self._ReadBracedVarSub(left_token, d_quoted=False)
512	last_token = self.cur_token
513	return part, last_token
514
515	def _ReadBracedVarSub(self, left_token, d_quoted):
516	# type: (Token, bool) -> BracedVarSub
517	"""For the ${} expression language.
518
519	NAME = [a-zA-Z_][a-zA-Z0-9_]*
520	NUMBER = [0-9]+ # ${10}, ${11}, ...
521
522	Subscript = '[' ('@' \| '*' \| ArithExpr) ']'
523	VarSymbol = '!' \| '@' \| '#' \| ...
524	VarOf = NAME Subscript?
525	\| NUMBER # no subscript allowed, none of these are arrays
526	# ${@[1]} doesn't work, even though slicing does
527	\| VarSymbol
528
529	NULLARY_OP = '@Q' \| '@E' \| '@P' \| '@A' \| '@a' # VOp0
530
531	TEST_OP = '-' \| ':-' \| '=' \| ':=' \| '+' \| ':+' \| '?' \| ':?'
532	STRIP_OP = '#' \| '##' \| '%' \| '%%'
533	CASE_OP = ',' \| ',,' \| '^' \| '^^'
534	UnaryOp = TEST_OP \| STRIP_OP \| CASE_OP
535
536	YSH_UNARY = '\|' \| ' ' # ${x\|html} and ${x %.3f}.
537	# SPACE is operator not %
538	Match = ('/' \| '#' \| '%') WORD # match all / prefix / suffix
539	VarExpr = VarOf
540	\| VarOf NULLARY_OP
541	\| VarOf UnaryOp WORD
542	\| VarOf YSH_UNARY STATIC_WORD
543	\| VarOf ':' ArithExpr (':' ArithExpr )?
544	\| VarOf '/' Match '/' WORD
545
546	LengthExpr = '#' VarOf # can't apply operators after length
547
548	RefOrKeys = '!' VarExpr # CAN apply operators after a named ref
549	# ${!ref[0]} vs ${!keys[@]} resolved later
550
551	PrefixQuery = '!' NAME ('*' \| '@') # list variable names with a prefix
552
553	BuiltinSub = '.' WORD+ # ${.myproc 'builtin' $sub}
554
555	VarSub = LengthExpr
556	\| RefOrKeys
557	\| PrefixQuery
558	\| VarExpr
559	\| BuiltinSub
560
561	NOTES:
562	- Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
563	slicing ${a:x+1:y+2}
564	- ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
565	- @ and * are technically arithmetic expressions in this implementation
566	- We don't account for bash 4.4: ${param@operator} -- Q E P A a. Note that
567	it's also vectorized.
568
569	Strictness over bash:
570	- echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
571	grammar
572	- ! and # prefixes can't be composed, even though named refs can be
573	composed with other operators
574	- '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
575	a prefix, and it can also be a literal part of WORD.
576
577	From the parser's point of view, the prefix # can't be combined with
578	UnaryOp/slicing/matching, and the ! can. However
579
580	- ${a[@]:1:2} is not allowed
581	- ${#a[@]:1:2} is allowed, but gives the wrong answer
582	"""
583	if d_quoted:
584	arg_lex_mode = lex_mode_e.VSub_ArgDQ
585	else:
586	arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
587
588	self._SetNext(lex_mode_e.VSub_1)
589	self._GetToken()
590
591	ty = self.token_type
592	first_tok = self.cur_token
593
594	if ty == Id.VSub_Pound:
595	# Disambiguate
596	next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
597	if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
598	# e.g. a name, '#' is the prefix
599	self._SetNext(lex_mode_e.VSub_1)
600	part = self._ParseVarOf()
601
602	self._GetToken()
603	if self.token_type != Id.Right_DollarBrace:
604	p_die('Expected } after length expression', self.cur_token)
605
606	part.prefix_op = first_tok
607
608	else: # not a prefix, '#' is the variable
609	part = self._ParseVarExpr(arg_lex_mode)
610
611	elif ty == Id.VSub_Bang:
612	next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
613	if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
614	# e.g. a name, '!' is the prefix
615	# ${!a} -- this is a ref
616	# ${!3} -- this is ref
617	# ${!a[1]} -- this is a ref
618	# ${!a[@]} -- this is a keys
619	# No lookahead -- do it in a second step, or at runtime
620	self._SetNext(lex_mode_e.VSub_1)
621	part = self._ParseVarExpr(arg_lex_mode, allow_query=True)
622
623	part.prefix_op = first_tok
624
625	else: # not a prefix, '!' is the variable
626	part = self._ParseVarExpr(arg_lex_mode)
627
628	elif ty == Id.VSub_Dot:
629	# Note: this will become a new builtin_sub type, so this method must
630	# return word_part_t rather than BracedVarSub. I don't think that
631	# should cause problems.
632	p_die('TODO: ${.myproc builtin sub}', self.cur_token)
633
634	# VS_NAME, VS_NUMBER, symbol that isn't # or !
635	elif self.token_kind == Kind.VSub:
636	part = self._ParseVarExpr(arg_lex_mode)
637
638	else:
639	# e.g. ${^}
640	p_die('Unexpected token in ${}', self.cur_token)
641
642	part.left = left_token # attach the argument
643	part.right = self.cur_token
644	return part
645
646	def _ReadSingleQuoted(self, left_token, lex_mode):
647	# type: (Token, lex_mode_t) -> SingleQuoted
648	"""Internal method to read a word_part."""
649	tokens = [] # type: List[Token]
650	# In command mode, we never disallow backslashes like '\'
651	right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
652	False)
653	sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
654	node = SingleQuoted(left_token, sval, right_quote)
655	return node
656
657	def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
658	# type: (lex_mode_t, Token, List[Token], bool) -> Token
659	"""Appends to out_tokens; returns last token
660
661	Used by expr_parse.py
662	"""
663	# TODO: Remove and use out_tokens
664	tokens = [] # type: List[Token]
665
666	# echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
667	no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
668
669	expected_end_tokens = 3 if left_token.id in (
670	Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
671	Id.Left_BTSingleQuote) else 1
672	num_end_tokens = 0
673
674	while num_end_tokens < expected_end_tokens:
675	self._SetNext(lex_mode)
676	self._GetToken()
677
678	# Kind.Char emitted in lex_mode.SQ_C
679	if self.token_kind in (Kind.Lit, Kind.Char):
680	tok = self.cur_token
681	# Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
682	# r'one\two' or c'one\\two'
683	if no_backslashes and lexer.TokenContains(tok, '\\'):
684	p_die(
685	r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
686	tok)
687
688	if is_ysh_expr:
689	# Disallow var x = $'\001'. Arguably we don't need these
690	# checks because u'\u{1}' is the way to write it.
691	if self.token_type == Id.Char_Octal3:
692	p_die(
693	r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
694	tok)
695
696	if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
697	# disallow \xH
698	p_die(
699	r'Invalid hex escape in YSH string (must be \xHH)',
700	tok)
701
702	tokens.append(tok)
703
704	elif self.token_kind == Kind.Unknown:
705	tok = self.cur_token
706	assert tok.id == Id.Unknown_Backslash, tok
707
708	# x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
709	if is_ysh_expr or not self.parse_opts.parse_backslash():
710	p_die(
711	"Invalid char escape in C-style string literal (OILS-ERR-11)",
712	tok)
713
714	tokens.append(tok)
715
716	elif self.token_kind == Kind.Eof:
717	p_die('Unexpected EOF in single-quoted string that began here',
718	left_token)
719
720	elif self.token_kind == Kind.Right:
721	# assume Id.Right_SingleQuote
722	num_end_tokens += 1
723	tokens.append(self.cur_token)
724
725	else:
726	raise AssertionError(self.cur_token)
727
728	if self.token_kind != Kind.Right:
729	num_end_tokens = 0 # we need three in a ROW
730
731	if expected_end_tokens == 1:
732	tokens.pop()
733	elif expected_end_tokens == 3: # Get rid of spurious end tokens
734	tokens.pop()
735	tokens.pop()
736	tokens.pop()
737
738	# Remove space from ''' r''' $''' in both expression mode and command mode
739	if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
740	Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
741	word_compile.RemoveLeadingSpaceSQ(tokens)
742
743	# Validation after lexing - same 2 checks in j8.LexerDecoder
744	is_u_string = left_token.id in (Id.Left_USingleQuote,
745	Id.Left_UTSingleQuote)
746
747	for tok in tokens:
748	# u'\yff' is not valid, but b'\yff' is
749	if is_u_string and tok.id == Id.Char_YHex:
750	p_die(
751	r"%s escapes not allowed in u'' strings" %
752	lexer.TokenVal(tok), tok)
753
754	out_tokens.extend(tokens)
755	return self.cur_token
756
757	def _ReadDoubleQuotedLeftParts(self):
758	# type: () -> word_part_t
759	"""Read substitution parts in a double quoted context."""
760	if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
761	return self._ReadCommandSub(self.token_type, d_quoted=True)
762
763	if self.token_type == Id.Left_DollarBrace:
764	return self._ReadBracedVarSub(self.cur_token, d_quoted=True)
765
766	if self.token_type == Id.Left_DollarDParen:
767	return self._ReadArithSub()
768
769	if self.token_type == Id.Left_DollarBracket:
770	return self._ReadExprSub(lex_mode_e.DQ)
771
772	raise AssertionError(self.cur_token)
773
774	def _ReadYshSingleQuoted(self, left_id):
775	# type: (Id_t) -> CompoundWord
776	"""Read YSH style strings
777
778	r'' u'' b''
779	r''' ''' u''' ''' b''' '''
780	"""
781	#log('BEF self.cur_token %s', self.cur_token)
782	if left_id == Id.Left_RSingleQuote:
783	lexer_mode = lex_mode_e.SQ_Raw
784	triple_left_id = Id.Left_RTSingleQuote
785	elif left_id == Id.Left_USingleQuote:
786	lexer_mode = lex_mode_e.J8_Str
787	triple_left_id = Id.Left_UTSingleQuote
788	elif left_id == Id.Left_BSingleQuote:
789	lexer_mode = lex_mode_e.J8_Str
790	triple_left_id = Id.Left_BTSingleQuote
791	else:
792	raise AssertionError(left_id)
793
794	# Needed for syntax checks
795	left_tok = self.cur_token
796	left_tok.id = left_id
797
798	sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
799
800	if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
801	self._SetNext(lex_mode_e.ShCommand)
802	self._GetToken()
803
804	assert self.token_type == Id.Left_SingleQuote
805	# HACK: magically transform the third ' in u''' to
806	# Id.Left_UTSingleQuote, so that ''' is the terminator
807	left_tok = self.cur_token
808	left_tok.id = triple_left_id
809
810	# Handles stripping leading whitespace
811	sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
812
813	# Advance and validate
814	self._SetNext(lex_mode_e.ShCommand)
815
816	self._GetToken()
817	if self.token_kind not in KINDS_THAT_END_WORDS:
818	p_die('Unexpected token after YSH single-quoted string',
819	self.cur_token)
820
821	return CompoundWord([sq_part])
822
823	def _ReadUnquotedLeftParts(self, triple_out):
824	# type: (Optional[BoolParamBox]) -> word_part_t
825	"""Read substitutions and quoted strings (for lex_mode_e.ShCommand).
826
827	If triple_out is set, then we try parsing triple quoted strings,
828	and set its value to True if we got one.
829	"""
830	if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
831	# Note: $"" is a synonym for "". It might make sense if it added
832	# \n \0 \x00 \u{123} etc. But that's not what bash does!
833	dq_part = self._ReadDoubleQuoted(self.cur_token)
834	# Got empty word "" and there's a " after
835	if (triple_out and len(dq_part.parts) == 0 and
836	self.lexer.ByteLookAhead() == '"'):
837
838	self._SetNext(lex_mode_e.ShCommand)
839	self._GetToken()
840	# HACK: magically transform the third " in """ to
841	# Id.Left_TDoubleQuote, so that """ is the terminator
842	left_dq_token = self.cur_token
843	left_dq_token.id = Id.Left_TDoubleQuote
844	triple_out.b = True # let caller know we got it
845	return self._ReadDoubleQuoted(left_dq_token)
846
847	return dq_part
848
849	if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
850	Id.Left_DollarSingleQuote):
851	if self.token_type == Id.Left_SingleQuote:
852	lexer_mode = lex_mode_e.SQ_Raw
853	triple_left_id = Id.Left_TSingleQuote
854	elif self.token_type == Id.Left_RSingleQuote:
855	lexer_mode = lex_mode_e.SQ_Raw
856	triple_left_id = Id.Left_RTSingleQuote
857	else:
858	lexer_mode = lex_mode_e.SQ_C
859	# there is no such thing as $'''
860	triple_left_id = Id.Undefined_Tok
861
862	sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
863
864	# Got empty '' or r'' and there's a ' after
865	# u'' and b'' are handled in _ReadYshSingleQuoted
866	if (triple_left_id != Id.Undefined_Tok and
867	triple_out is not None and len(sq_part.sval) == 0 and
868	self.lexer.ByteLookAhead() == "'"):
869
870	self._SetNext(lex_mode_e.ShCommand)
871	self._GetToken()
872
873	# HACK: magically transform the third ' in ''' to
874	# Id.Left_TSingleQuote, so that ''' is the terminator
875	left_sq_token = self.cur_token
876	left_sq_token.id = triple_left_id
877
878	triple_out.b = True # let caller know we got it
879	return self._ReadSingleQuoted(left_sq_token, lexer_mode)
880
881	return sq_part
882
883	if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
884	Id.Left_ProcSubIn, Id.Left_ProcSubOut):
885	return self._ReadCommandSub(self.token_type, d_quoted=False)
886
887	if self.token_type == Id.Left_DollarBrace:
888	return self._ReadBracedVarSub(self.cur_token, d_quoted=False)
889
890	if self.token_type == Id.Left_DollarDParen:
891	return self._ReadArithSub()
892
893	if self.token_type == Id.Left_DollarBracket:
894	return self._ReadExprSub(lex_mode_e.ShCommand)
895
896	if self.token_type == Id.Left_DollarBraceZsh:
897	return self._ReadZshVarSub(self.cur_token)
898
899	raise AssertionError(self.cur_token)
900
901	def _ReadExtGlob(self):
902	# type: () -> word_part.ExtGlob
903	"""
904	Grammar:
905	Item = CompoundWord \| EPSILON # important: @(foo\|) is allowed
906	LEFT = '@(' \| '*(' \| '+(' \| '?(' \| '!('
907	RIGHT = ')'
908	ExtGlob = LEFT (Item '\|')* Item RIGHT # ITEM may be empty
909	Compound includes ExtGlob
910	"""
911	left_token = self.cur_token
912	right_token = None # type: Token
913	arms = [] # type: List[CompoundWord]
914
915	self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
916	self._SetNext(lex_mode_e.ExtGlob) # advance past LEFT
917
918	read_word = False # did we just a read a word? To handle @(\|\|).
919
920	while True:
921	self._GetToken()
922
923	if self.token_type == Id.Right_ExtGlob:
924	if not read_word:
925	arms.append(CompoundWord([]))
926	right_token = self.cur_token
927	break
928
929	elif self.token_type == Id.Op_Pipe:
930	if not read_word:
931	arms.append(CompoundWord([]))
932	read_word = False
933	self._SetNext(lex_mode_e.ExtGlob)
934
935	# lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
936	elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
937	Kind.ExtGlob):
938	w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
939	arms.append(w)
940	read_word = True
941
942	elif self.token_kind == Kind.Eof:
943	p_die('Unexpected EOF reading extended glob that began here',
944	left_token)
945
946	else:
947	raise AssertionError(self.cur_token)
948
949	return word_part.ExtGlob(left_token, arms, right_token)
950
951	def _ReadBashRegexGroup(self):
952	# type: () -> word_part.BashRegexGroup
953	"""
954	Grammar:
955	BashRegexGroup = '(' WORD? ')
956	"""
957	left_token = self.cur_token
958	assert left_token.id == Id.BashRegex_LParen, left_token
959
960	right_token = None # type: Token
961	arms = [] # type: List[CompoundWord]
962
963	self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
964	self._SetNext(lex_mode_e.BashRegexFakeInner) # advance past LEFT
965
966	self._GetToken()
967	if self.token_type == Id.Right_BashRegexGroup: # empty ()
968	return word_part.BashRegexGroup(left_token, None, self.cur_token)
969
970	# lex_mode_e.BashRegex should only produce these 4 kinds of tokens
971	if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
972	# Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
973	# To allow bash style [[ s =~ (a b) ]]
974	w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
975	arms.append(w)
976
977	self._GetToken()
978	if self.token_type != Id.Right_BashRegexGroup:
979	p_die('Expected ) to close bash regex group', self.cur_token)
980
981	return word_part.BashRegexGroup(left_token, w, self.cur_token)
982
983	p_die('Expected word after ( opening bash regex group', self.cur_token)
984
985	def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
986	# type: (Optional[Token], bool, List[word_part_t]) -> None
987	"""
988	Args:
989	left_token: A token if we are reading a double quoted part, or None if
990	we're reading a here doc.
991	is_ysh_expr: Whether to disallow backticks and invalid char escapes
992	out_parts: list of word_part to append to
993	"""
994	if left_token:
995	if left_token.id in (Id.Left_TDoubleQuote,
996	Id.Left_DollarTDoubleQuote):
997	expected_end_tokens = 3
998	else:
999	expected_end_tokens = 1
1000	else:
1001	expected_end_tokens = 1000 # here doc will break
1002
1003	num_end_tokens = 0
1004	while num_end_tokens < expected_end_tokens:
1005	self._SetNext(lex_mode_e.DQ)
1006	self._GetToken()
1007
1008	if self.token_kind == Kind.Lit:
1009	if self.token_type == Id.Lit_EscapedChar:
1010	tok = self.cur_token
1011	ch = lexer.TokenSliceLeft(tok, 1)
1012	part = word_part.EscapedLiteral(tok,
1013	ch) # type: word_part_t
1014	else:
1015	if self.token_type == Id.Lit_BadBackslash:
1016	# echo "\z" is OK in shell, but 'x = "\z" is a syntax error in
1017	# YSH.
1018	# Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
1019	# recursion (unless parse_backslash)
1020	if (is_ysh_expr or
1021	not self.parse_opts.parse_backslash()):
1022	p_die(
1023	"Invalid char escape in double quoted string (OILS-ERR-12)",
1024	self.cur_token)
1025	elif self.token_type == Id.Lit_Dollar:
1026	if is_ysh_expr or not self.parse_opts.parse_dollar():
1027	p_die("Literal $ should be quoted like \$",
1028	self.cur_token)
1029
1030	part = self.cur_token
1031	out_parts.append(part)
1032
1033	elif self.token_kind == Kind.Left:
1034	if self.token_type == Id.Left_Backtick and is_ysh_expr:
1035	p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
1036	self.cur_token)
1037
1038	part = self._ReadDoubleQuotedLeftParts()
1039	out_parts.append(part)
1040
1041	elif self.token_kind == Kind.VSub:
1042	tok = self.cur_token
1043	part = SimpleVarSub(tok)
1044	out_parts.append(part)
1045	# NOTE: parsing "$f(x)" would BREAK CODE. Could add a more for it
1046	# later.
1047
1048	elif self.token_kind == Kind.Right:
1049	assert self.token_type == Id.Right_DoubleQuote, self.token_type
1050	if left_token:
1051	num_end_tokens += 1
1052
1053	# In a here doc, the right quote is literal!
1054	out_parts.append(self.cur_token)
1055
1056	elif self.token_kind == Kind.Eof:
1057	if left_token:
1058	p_die(
1059	'Unexpected EOF reading double-quoted string that began here',
1060	left_token)
1061	else: # here docs will have an EOF in their token stream
1062	break
1063
1064	else:
1065	raise AssertionError(self.cur_token)
1066
1067	if self.token_kind != Kind.Right:
1068	num_end_tokens = 0 # """ must be CONSECUTIVE
1069
1070	if expected_end_tokens == 1:
1071	out_parts.pop()
1072	elif expected_end_tokens == 3:
1073	out_parts.pop()
1074	out_parts.pop()
1075	out_parts.pop()
1076
1077	# Remove space from """ in both expression mode and command mode
1078	if (left_token and left_token.id
1079	in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
1080	word_compile.RemoveLeadingSpaceDQ(out_parts)
1081
1082	# Return nothing, since we appended to 'out_parts'
1083
1084	def _ReadDoubleQuoted(self, left_token):
1085	# type: (Token) -> DoubleQuoted
1086	"""Helper function for "hello $name".
1087
1088	Args:
1089	eof_type: for stopping at }, Id.Lit_RBrace
1090	here_doc: Whether we are reading in a here doc context
1091
1092	Also ${foo%%a b c} # treat this as double quoted. until you hit
1093	"""
1094	parts = [] # type: List[word_part_t]
1095	self._ReadLikeDQ(left_token, False, parts)
1096
1097	right_quote = self.cur_token
1098	return DoubleQuoted(left_token, parts, right_quote)
1099
1100	def ReadDoubleQuoted(self, left_token, parts):
1101	# type: (Token, List[word_part_t]) -> Token
1102	"""For expression mode.
1103
1104	Read var x = "${dir:-}/$name"; etc.
1105	"""
1106	self._ReadLikeDQ(left_token, True, parts)
1107	return self.cur_token
1108
1109	def _ReadCommandSub(self, left_id, d_quoted=False):
1110	# type: (Id_t, bool) -> CommandSub
1111	"""
1112	NOTE: This is not in the grammar, because word parts aren't in the grammar!
1113
1114	command_sub = '$(' command_list ')'
1115	\| '@(' command_list ')'
1116	\| '<(' command_list ')'
1117	\| '>(' command_list ')'
1118	\| ` command_list `
1119	"""
1120	left_token = self.cur_token
1121
1122	# Set the lexer in a state so ) becomes the EOF token.
1123	if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
1124	Id.Left_ProcSubOut):
1125	self._SetNext(lex_mode_e.ShCommand) # advance past $( etc.
1126
1127	right_id = Id.Eof_RParen
1128	self.lexer.PushHint(Id.Op_RParen, right_id)
1129	c_parser = self.parse_ctx.MakeParserForCommandSub(
1130	self.line_reader, self.lexer, right_id)
1131	# NOTE: This doesn't use something like main_loop because we don't want
1132	# to interleave parsing and execution! Unlike 'source' and 'eval'.
1133	node = c_parser.ParseCommandSub()
1134
1135	right_token = c_parser.w_parser.cur_token
1136
1137	elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
1138	# NOTE: This is an APPROXIMATE solution for translation ONLY. See
1139	# test/osh2oil.
1140
1141	right_id = Id.Eof_Backtick
1142	self.lexer.PushHint(Id.Left_Backtick, right_id)
1143	c_parser = self.parse_ctx.MakeParserForCommandSub(
1144	self.line_reader, self.lexer, right_id)
1145	node = c_parser.ParseCommandSub()
1146	right_token = c_parser.w_parser.cur_token
1147
1148	elif left_id == Id.Left_Backtick:
1149	if not self.parse_opts.parse_backticks():
1150	p_die('Use $(cmd) instead of backticks (parse_backticks)',
1151	left_token)
1152
1153	self._SetNext(lex_mode_e.Backtick) # advance past `
1154
1155	parts = [] # type: List[str]
1156	while True:
1157	self._GetToken()
1158	#log("TOK %s", self.cur_token)
1159
1160	if self.token_type == Id.Backtick_Quoted:
1161	# Remove leading \
1162	parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1163
1164	elif self.token_type == Id.Backtick_DoubleQuote:
1165	# Compatibility: If backticks are double quoted, then double quotes
1166	# within them have to be \"
1167	# Shells aren't smart enough to match nested " and ` quotes (but OSH
1168	# is)
1169	if d_quoted:
1170	# Remove leading \
1171	parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1172	else:
1173	parts.append(lexer.TokenVal(self.cur_token))
1174
1175	elif self.token_type == Id.Backtick_Other:
1176	parts.append(lexer.TokenVal(self.cur_token))
1177
1178	elif self.token_type == Id.Backtick_Right:
1179	break
1180
1181	elif self.token_type == Id.Eof_Real:
1182	# Note: this parse error is in the ORIGINAL context. No code_str yet.
1183	p_die('Unexpected EOF while looking for closing backtick',
1184	left_token)
1185
1186	else:
1187	raise AssertionError(self.cur_token)
1188
1189	self._SetNext(lex_mode_e.Backtick)
1190
1191	# Calculate right SPID on CommandSub BEFORE re-parsing.
1192	right_token = self.cur_token
1193
1194	code_str = ''.join(parts)
1195	#log('code %r', code_str)
1196
1197	# NOTE: This is similar to how we parse aliases in osh/cmd_parse.py. It
1198	# won't have the same location info as MakeParserForCommandSub(), because
1199	# the lexer is different.
1200	arena = self.parse_ctx.arena
1201	#arena = alloc.Arena()
1202	line_reader = reader.StringLineReader(code_str, arena)
1203	c_parser = self.parse_ctx.MakeOshParser(line_reader)
1204	src = source.Reparsed('backticks', left_token, right_token)
1205	with alloc.ctx_SourceCode(arena, src):
1206	node = c_parser.ParseCommandSub()
1207
1208	else:
1209	raise AssertionError(left_id)
1210
1211	return CommandSub(left_token, node, right_token)
1212
1213	def _ReadExprSub(self, lex_mode):
1214	# type: (lex_mode_t) -> word_part.ExprSub
1215	"""$[d->key] $[obj.method()] etc."""
1216	left_token = self.cur_token
1217
1218	self._SetNext(lex_mode_e.Expr)
1219	enode, right_token = self.parse_ctx.ParseYshExpr(
1220	self.lexer, grammar_nt.ysh_expr_sub)
1221
1222	self._SetNext(lex_mode) # Move past ]
1223	return word_part.ExprSub(left_token, enode, right_token)
1224
1225	def ParseVarDecl(self, kw_token):
1226	# type: (Token) -> command.VarDecl
1227	"""
1228	oil_var_decl: name_type_list '=' testlist end_stmt
1229
1230	Note that assignments must end with \n ; } or EOF. Unlike shell
1231	assignments, we disallow:
1232
1233	var x = 42 \| wc -l
1234	var x = 42 && echo hi
1235	"""
1236	self._SetNext(lex_mode_e.Expr)
1237	enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
1238	# Hack to move } from what the Expr lexer modes gives to what CommandParser
1239	# wants
1240	if last_token.id == Id.Op_RBrace:
1241	last_token.id = Id.Lit_RBrace
1242
1243	# Let the CommandParser see the Op_Semi or Op_Newline.
1244	self.buffered_word = last_token
1245	self._SetNext(lex_mode_e.ShCommand) # always back to this
1246	return enode
1247
1248	def ParseMutation(self, kw_token, var_checker):
1249	# type: (Token, VarChecker) -> command.Mutation
1250	"""
1251	setvar i = 42
1252	setvar i += 1
1253	setvar a[i] = 42
1254	setvar a[i] += 1
1255	setvar d.key = 42
1256	setvar d.key += 1
1257	"""
1258	self._SetNext(lex_mode_e.Expr)
1259	enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
1260	# Hack to move } from what the Expr lexer modes gives to what CommandParser
1261	# wants
1262	if last_token.id == Id.Op_RBrace:
1263	last_token.id = Id.Lit_RBrace
1264
1265	for lhs in enode.lhs:
1266	UP_lhs = lhs
1267	with tagswitch(lhs) as case:
1268	if case(y_lhs_e.Var):
1269	lhs = cast(Token, UP_lhs)
1270	var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
1271
1272	# Note: this does not cover cases like
1273	# setvar (a[0])[1] = v
1274	# setvar (d.key).other = v
1275	# This leaks into catching all typos statically, which may be
1276	# possible if 'use' makes all names explicit.
1277	elif case(y_lhs_e.Subscript):
1278	lhs = cast(Subscript, UP_lhs)
1279	if lhs.obj.tag() == expr_e.Var:
1280	v = cast(expr.Var, lhs.obj)
1281	var_checker.Check(kw_token.id, v.name, v.left)
1282
1283	elif case(y_lhs_e.Attribute):
1284	lhs = cast(Attribute, UP_lhs)
1285	if lhs.obj.tag() == expr_e.Var:
1286	v = cast(expr.Var, lhs.obj)
1287	var_checker.Check(kw_token.id, v.name, v.left)
1288
1289	# Let the CommandParser see the Op_Semi or Op_Newline.
1290	self.buffered_word = last_token
1291	self._SetNext(lex_mode_e.ShCommand) # always back to this
1292	return enode
1293
1294	def ParseBareDecl(self):
1295	# type: () -> expr_t
1296	"""
1297	x = {name: val}
1298	"""
1299	self._SetNext(lex_mode_e.Expr)
1300	self._GetToken()
1301	enode, last_token = self.parse_ctx.ParseYshExpr(
1302	self.lexer, grammar_nt.command_expr)
1303	if last_token.id == Id.Op_RBrace:
1304	last_token.id = Id.Lit_RBrace
1305	self.buffered_word = last_token
1306	self._SetNext(lex_mode_e.ShCommand)
1307	return enode
1308
1309	def ParseYshExprForCommand(self):
1310	# type: () -> expr_t
1311
1312	# Fudge for this case
1313	# for x in(y) {
1314	# versus
1315	# for x in (y) {
1316	#
1317	# In the former case, ReadWord on 'in' puts the lexer past (.
1318	# Also see LookPastSpace in CommandParers.
1319	# A simpler solution would be nicer.
1320
1321	if self.token_type == Id.Op_LParen:
1322	self.lexer.MaybeUnreadOne()
1323
1324	enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
1325
1326	self._SetNext(lex_mode_e.ShCommand)
1327	return enode
1328
1329	def ParseCommandExpr(self):
1330	# type: () -> expr_t
1331	"""
1332	= 1+2
1333	"""
1334	enode, last_token = self.parse_ctx.ParseYshExpr(
1335	self.lexer, grammar_nt.command_expr)
1336
1337	# In some cases, such as the case statement, we expect the lexer to be
1338	# pointing at the token right after the expression. But the expression
1339	# parser must have read to the `last_token`. Unreading places the lexer
1340	# back in the expected state. Ie:
1341	#
1342	# case (x) { case (x) {
1343	# (else) { = x } (else) { = x }
1344	# ^ The lexer is here ^ Unread to here
1345	# } }
1346	assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
1347	Id.Op_RBrace), last_token
1348	if last_token.id != Id.Eof_Real:
1349	# Eof_Real is the only token we cannot unread
1350	self.lexer.MaybeUnreadOne()
1351
1352	return enode
1353
1354	def ParseProc(self, node):
1355	# type: (Proc) -> None
1356
1357	# proc name-with-hyphens() must be accepted
1358	self._SetNext(lex_mode_e.ShCommand)
1359	self._GetToken()
1360	# example: 'proc f[' gets you Lit_ArrayLhsOpen
1361	if self.token_type != Id.Lit_Chars:
1362	p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
1363	self.cur_token)
1364
1365	# TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
1366	# for shell functions. Similar to IsValidVarName().
1367	node.name = self.cur_token
1368
1369	last_token = self.parse_ctx.ParseProc(self.lexer, node)
1370
1371	# Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1372	assert last_token.id == Id.Op_LBrace
1373	last_token.id = Id.Lit_LBrace
1374	self.buffered_word = last_token
1375
1376	self._SetNext(lex_mode_e.ShCommand)
1377
1378	def ParseFunc(self, node):
1379	# type: (Func) -> None
1380	last_token = self.parse_ctx.ParseFunc(self.lexer, node)
1381
1382	# Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1383	assert last_token.id == Id.Op_LBrace
1384	last_token.id = Id.Lit_LBrace
1385	self.buffered_word = last_token
1386
1387	self._SetNext(lex_mode_e.ShCommand)
1388
1389	def ParseYshCasePattern(self):
1390	# type: () -> Tuple[pat_t, Token]
1391	pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
1392	self.lexer)
1393
1394	if last_token.id == Id.Op_LBrace:
1395	last_token.id = Id.Lit_LBrace
1396	self.buffered_word = last_token
1397
1398	return pat, left_tok
1399
1400	def NewlineOkForYshCase(self):
1401	# type: () -> Id_t
1402	"""Check for optional newline and consume it.
1403
1404	This is a special case of `_NewlineOk` which fixed some "off-by-one" issues
1405	which crop up while parsing Ysh Case Arms. For more details, see
1406	#oil-dev > Progress On YSH Case Grammar on zulip.
1407
1408	Returns a token id which is filled with the choice of
1409
1410	word { echo word }
1411	(3) { echo expr }
1412	/e/ { echo eggex }
1413	} # right brace
1414	"""
1415	while True:
1416	next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
1417
1418	# Cannot lookahead past lines
1419	if next_id == Id.Unknown_Tok:
1420	self.lexer.MoveToNextLine()
1421	continue
1422
1423	next_kind = consts.GetKind(next_id)
1424	if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
1425	break
1426
1427	self.lexer.Read(lex_mode_e.Expr)
1428
1429	if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
1430	self._SetNext(lex_mode_e.Expr) # Continue in expression mode
1431	else:
1432	# Consume the trailing Op_Newline
1433	self._SetNext(lex_mode_e.ShCommand)
1434	self._GetToken()
1435
1436	return next_id
1437
1438	def _ReadArithExpr(self, end_id):
1439	# type: (Id_t) -> arith_expr_t
1440	"""Read and parse an arithmetic expression in various contexts.
1441
1442	$(( 1+2 ))
1443	(( a=1+2 ))
1444	${a[ 1+2 ]}
1445	${a : 1+2 : 1+2}
1446
1447	See tests/arith-context.test.sh for ambiguous cases.
1448
1449	${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
1450
1451	${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
1452
1453	See the assertion in ArithParser.Parse() -- unexpected extra input.
1454	"""
1455	# calls self.ReadWord(lex_mode_e.Arith)
1456	anode = self.a_parser.Parse()
1457	cur_id = self.a_parser.CurrentId()
1458	if end_id != Id.Undefined_Tok and cur_id != end_id:
1459	p_die(
1460	'Unexpected token after arithmetic expression (%s != %s)' %
1461	(ui.PrettyId(cur_id), ui.PrettyId(end_id)),
1462	loc.Word(self.a_parser.cur_word))
1463	return anode
1464
1465	def _ReadArithSub(self):
1466	# type: () -> word_part.ArithSub
1467	"""Read an arith substitution, which contains an arith expression, e.g.
1468
1469	$((a + 1)).
1470	"""
1471	left_tok = self.cur_token
1472
1473	# The second one needs to be disambiguated in stuff like stuff like:
1474	# $(echo $(( 1+2 )) )
1475	self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
1476
1477	# NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
1478	# could save the lexer/reader state here, and retry if the arithmetic parse
1479	# fails. But we can almost always catch this at parse time. There could
1480	# be some exceptions like:
1481	# $((echo * foo)) # looks like multiplication
1482	# $((echo / foo)) # looks like division
1483
1484	# $(( )) is valid
1485	anode = arith_expr.EmptyZero # type: arith_expr_t
1486
1487	self._NextNonSpace()
1488	if self.token_type != Id.Arith_RParen:
1489	anode = self._ReadArithExpr(Id.Arith_RParen)
1490
1491	self._SetNext(lex_mode_e.ShCommand)
1492
1493	# Ensure we get closing )
1494	self._GetToken()
1495	if self.token_type != Id.Right_DollarDParen:
1496	p_die('Expected second ) to end arith sub', self.cur_token)
1497
1498	right_tok = self.cur_token
1499	return word_part.ArithSub(left_tok, anode, right_tok)
1500
1501	def ReadDParen(self):
1502	# type: () -> Tuple[arith_expr_t, Token]
1503	"""Read ((1+ 2)) -- command context.
1504
1505	We're using the word parser because it's very similar to _ReadArithExpr
1506	above.
1507
1508	This also returns the terminating Id.Op_DRightParen token for location
1509	info.
1510	"""
1511	# (( )) is valid
1512	anode = arith_expr.EmptyZero # type: arith_expr_t
1513
1514	self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
1515
1516	self._NextNonSpace()
1517	if self.token_type != Id.Arith_RParen:
1518	anode = self._ReadArithExpr(Id.Arith_RParen)
1519
1520	self._SetNext(lex_mode_e.ShCommand)
1521
1522	# Ensure we get the second )
1523	self._GetToken()
1524	right = self.cur_token
1525	if right.id != Id.Op_DRightParen:
1526	p_die('Expected second ) to end arith statement', right)
1527
1528	self._SetNext(lex_mode_e.ShCommand)
1529
1530	return anode, right
1531
1532	def _NextNonSpace(self):
1533	# type: () -> None
1534	"""Advance in lex_mode_e.Arith until non-space token.
1535
1536	Same logic as _ReadWord, but used in
1537	$(( ))
1538	(( ))
1539	for (( ))
1540
1541	You can read self.token_type after this, without calling _GetToken.
1542	"""
1543	while True:
1544	self._SetNext(lex_mode_e.Arith)
1545	self._GetToken()
1546	if self.token_kind not in (Kind.Ignored, Kind.WS):
1547	break
1548
1549	def ReadForExpression(self):
1550	# type: () -> command.ForExpr
1551	"""Read ((i=0; i<5; ++i)) -- part of command context."""
1552	self._NextNonSpace() # skip over ((
1553	cur_id = self.token_type # for end of arith expressions
1554
1555	if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
1556	init_node = arith_expr.EmptyZero # type: arith_expr_t
1557	else:
1558	init_node = self.a_parser.Parse()
1559	cur_id = self.a_parser.CurrentId()
1560	self._NextNonSpace()
1561
1562	# It's odd to keep track of both cur_id and self.token_type in this
1563	# function, but it works, and is tested in 'test/parse_error.sh
1564	# arith-integration'
1565	if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
1566	p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1567
1568	self._GetToken()
1569	cur_id = self.token_type
1570
1571	if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
1572	# empty condition is TRUE
1573	cond_node = arith_expr.EmptyOne # type: arith_expr_t
1574	else:
1575	cond_node = self.a_parser.Parse()
1576	cur_id = self.a_parser.CurrentId()
1577
1578	if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
1579	p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1580
1581	self._NextNonSpace()
1582	if self.token_type == Id.Arith_RParen: # for (( ; ; ))
1583	update_node = arith_expr.EmptyZero # type: arith_expr_t
1584	else:
1585	update_node = self._ReadArithExpr(Id.Arith_RParen)
1586
1587	self._NextNonSpace()
1588	if self.token_type != Id.Arith_RParen:
1589	p_die('Expected ) to end for loop expression', self.cur_token)
1590	self._SetNext(lex_mode_e.ShCommand)
1591
1592	# redirects is None, will be assigned in CommandEvaluator
1593	node = command.ForExpr.CreateNull()
1594	node.init = init_node
1595	node.cond = cond_node
1596	node.update = update_node
1597	return node
1598
1599	def _ReadArrayLiteral(self):
1600	# type: () -> word_part_t
1601	"""a=(1 2 3)
1602
1603	TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1604
1605	We want:
1606
1607	A=(['x']=1 ["x"]=2 [$x$y]=3)
1608
1609	Maybe allow this as a literal string? Because I think I've seen it before?
1610	Or maybe force people to patch to learn the rule.
1611
1612	A=([x]=4)
1613
1614	Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1615	Maybe enforce that ALL have keys or NONE of have keys.
1616	"""
1617	self._SetNext(lex_mode_e.ShCommand) # advance past (
1618	self._GetToken()
1619	if self.cur_token.id != Id.Op_LParen:
1620	p_die('Expected ( after =', self.cur_token)
1621	left_token = self.cur_token
1622	right_token = None # type: Token
1623
1624	# MUST use a new word parser (with same lexer).
1625	w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1626	words = [] # type: List[CompoundWord]
1627	done = False
1628	while not done:
1629	w = w_parser.ReadWord(lex_mode_e.ShCommand)
1630	with tagswitch(w) as case:
1631	if case(word_e.Operator):
1632	tok = cast(Token, w)
1633	if tok.id == Id.Right_ShArrayLiteral:
1634	right_token = tok
1635	done = True # can't use break here
1636	# Unlike command parsing, array parsing allows embedded \n.
1637	elif tok.id == Id.Op_Newline:
1638	continue
1639	else:
1640	p_die('Unexpected token in array literal', loc.Word(w))
1641
1642	elif case(word_e.Compound):
1643	words.append(cast(CompoundWord, w))
1644
1645	else:
1646	raise AssertionError()
1647
1648	if len(words) == 0: # a=() is empty indexed array
1649	# Needed for type safety, doh
1650	no_words = [] # type: List[word_t]
1651	node = ShArrayLiteral(left_token, no_words, right_token)
1652	return node
1653
1654	pairs = [] # type: List[AssocPair]
1655	# If the first one is a key/value pair, then the rest are assumed to be.
1656	pair = word_.DetectAssocPair(words[0])
1657	if pair:
1658	pairs.append(pair)
1659
1660	n = len(words)
1661	for i in xrange(1, n):
1662	w2 = words[i]
1663	pair = word_.DetectAssocPair(w2)
1664	if not pair:
1665	p_die("Expected associative array pair", loc.Word(w2))
1666
1667	pairs.append(pair)
1668
1669	# invariant List?
1670	return word_part.BashAssocLiteral(left_token, pairs, right_token)
1671
1672	# Brace detection for arrays but NOT associative arrays
1673	words2 = braces.BraceDetectAll(words)
1674	words3 = word_.TildeDetectAll(words2)
1675	return ShArrayLiteral(left_token, words3, right_token)
1676
1677	def ParseProcCallArgs(self, start_symbol):
1678	# type: (int) -> ArgList
1679	""" json write (x) """
1680	self.lexer.MaybeUnreadOne()
1681
1682	arg_list = ArgList.CreateNull(alloc_lists=True)
1683	arg_list.left = self.cur_token
1684	self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1685	return arg_list
1686
1687	def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1688	# type: (bool, lex_mode_t, List[word_part_t]) -> bool
1689	"""Helper for _ReadCompoundWord3."""
1690	done = False
1691
1692	if self.token_type == Id.Lit_EscapedChar:
1693	tok = self.cur_token
1694	assert tok.length == 2
1695	ch = lexer.TokenSliceLeft(tok, 1)
1696	if not self.parse_opts.parse_backslash():
1697	if not pyutil.IsValidCharEscape(ch):
1698	p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1699	self.cur_token)
1700
1701	part = word_part.EscapedLiteral(self.cur_token,
1702	ch) # type: word_part_t
1703	else:
1704	part = self.cur_token
1705
1706	if is_first and self.token_type == Id.Lit_VarLike: # foo=
1707	parts.append(part)
1708	# Unfortunately it's awkward to pull the check for a=(1 2) up to
1709	# _ReadWord.
1710	next_id = self.lexer.LookPastSpace(lex_mode)
1711	if next_id == Id.Op_LParen:
1712	self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1713	part2 = self._ReadArrayLiteral()
1714	parts.append(part2)
1715
1716	# Array literal must be the last part of the word.
1717	self._SetNext(lex_mode)
1718	self._GetToken()
1719	# EOF, whitespace, newline, Right_Subshell
1720	if self.token_kind not in KINDS_THAT_END_WORDS:
1721	p_die('Unexpected token after array literal',
1722	self.cur_token)
1723	done = True
1724
1725	elif (is_first and self.parse_opts.parse_at() and
1726	self.token_type == Id.Lit_Splice):
1727
1728	splice_tok = self.cur_token
1729	part2 = word_part.Splice(splice_tok,
1730	lexer.TokenSliceLeft(splice_tok, 1))
1731
1732	parts.append(part2)
1733
1734	# @words must be the last part of the word
1735	self._SetNext(lex_mode)
1736	self._GetToken()
1737	# EOF, whitespace, newline, Right_Subshell
1738	if self.token_kind not in KINDS_THAT_END_WORDS:
1739	p_die('Unexpected token after array splice', self.cur_token)
1740	done = True
1741
1742	elif (is_first and self.parse_opts.parse_at() and
1743	self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1744	part2 = self._ReadExprSub(lex_mode_e.DQ)
1745	parts.append(part2)
1746
1747	# @[split(x)]
1748	self._SetNext(lex_mode)
1749	self._GetToken()
1750	# EOF, whitespace, newline, Right_Subshell
1751	if self.token_kind not in KINDS_THAT_END_WORDS:
1752	p_die('Unexpected token after Expr splice', self.cur_token)
1753	done = True
1754
1755	elif (is_first and self.parse_opts.parse_at() and
1756	self.token_type == Id.Lit_AtLBraceDot):
1757	p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1758
1759	elif (is_first and self.parse_opts.parse_at_all() and
1760	self.token_type == Id.Lit_At):
1761	# Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1762	# at the beginning of a word to be reserved.
1763
1764	# Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1765	# @_argv and
1766	p_die('Literal @ starting a word must be quoted (parse_at_all)',
1767	self.cur_token)
1768
1769	else:
1770	# not a literal with lookahead; append it
1771	parts.append(part)
1772
1773	return done
1774
1775	def _ReadCompoundWord(self, lex_mode):
1776	# type: (lex_mode_t) -> CompoundWord
1777	return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1778
1779	def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1780	# type: (lex_mode_t, Id_t, bool) -> CompoundWord
1781	"""
1782	Precondition: Looking at the first token of the first word part
1783	Postcondition: Looking at the token after, e.g. space or operator
1784
1785	NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1786	could be an operator delimiting a compound word. Can we change lexer modes
1787	and remove this special case?
1788	"""
1789	w = CompoundWord([])
1790	num_parts = 0
1791	brace_count = 0
1792	done = False
1793	is_triple_quoted = None # type: Optional[BoolParamBox]
1794
1795	while not done:
1796	self._GetToken()
1797
1798	allow_done = empty_ok or num_parts != 0
1799	if allow_done and self.token_type == eof_type:
1800	done = True # e.g. for ${foo//pat/replace}
1801
1802	# Keywords like "for" are treated like literals
1803	elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1804	Kind.ControlFlow, Kind.BoolUnary,
1805	Kind.BoolBinary):
1806
1807	# Syntax error for { and }
1808	if self.token_type == Id.Lit_LBrace:
1809	brace_count += 1
1810	elif self.token_type == Id.Lit_RBrace:
1811	brace_count -= 1
1812	elif self.token_type == Id.Lit_Dollar:
1813	if not self.parse_opts.parse_dollar():
1814	if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1815	next_byte = self.lexer.ByteLookAhead()
1816	# TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1817	if next_byte == '/':
1818	#log('next_byte %r', next_byte)
1819	pass
1820
1821	p_die('Literal $ should be quoted like \$',
1822	self.cur_token)
1823
1824	done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1825	w.parts)
1826
1827	elif self.token_kind == Kind.VSub:
1828	vsub_token = self.cur_token
1829
1830	part = SimpleVarSub(vsub_token) # type: word_part_t
1831	w.parts.append(part)
1832
1833	elif self.token_kind == Kind.ExtGlob:
1834	# If parse_at, we can take over @( to start @(seq 3)
1835	# Users can also use look at ,(.py\|.sh)
1836	if (self.parse_opts.parse_at() and
1837	self.token_type == Id.ExtGlob_At and num_parts == 0):
1838	cs_part = self._ReadCommandSub(Id.Left_AtParen,
1839	d_quoted=False)
1840	# RARE mutation of tok.id!
1841	cs_part.left_token.id = Id.Left_AtParen
1842	part = cs_part # for type safety
1843
1844	# Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1845	# a=(one two)x and @arrayfunc(3)x.
1846	self._GetToken()
1847	if self.token_kind not in KINDS_THAT_END_WORDS:
1848	p_die('Unexpected token after @()', self.cur_token)
1849	done = True
1850
1851	else:
1852	part = self._ReadExtGlob()
1853	w.parts.append(part)
1854
1855	elif self.token_kind == Kind.BashRegex:
1856	if self.token_type == Id.BashRegex_LParen: # Opening (
1857	part = self._ReadBashRegexGroup()
1858	w.parts.append(part)
1859	else:
1860	assert self.token_type == Id.BashRegex_AllowedInParens
1861	p_die('Invalid token in bash regex', self.cur_token)
1862
1863	elif self.token_kind == Kind.Left:
1864	try_triple_quote = (self.parse_opts.parse_triple_quote() and
1865	lex_mode == lex_mode_e.ShCommand and
1866	num_parts == 0)
1867
1868	# Save allocation
1869	if try_triple_quote:
1870	is_triple_quoted = BoolParamBox(False)
1871
1872	part = self._ReadUnquotedLeftParts(is_triple_quoted)
1873	w.parts.append(part)
1874
1875	# NOT done yet, will advance below
1876	elif self.token_kind == Kind.Right:
1877	# Still part of the word; will be done on the next iter.
1878	if self.token_type == Id.Right_DoubleQuote:
1879	pass
1880	# Never happens, no PushHint for this case.
1881	#elif self.token_type == Id.Right_DollarParen:
1882	# pass
1883	elif self.token_type == Id.Right_Subshell:
1884	# LEXER HACK for (case x in x) ;; esac )
1885	# Rewind before it's used
1886	assert self.next_lex_mode == lex_mode_e.Undefined
1887	if self.lexer.MaybeUnreadOne():
1888	self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1889	self._SetNext(lex_mode)
1890	done = True
1891	else:
1892	done = True
1893
1894	elif self.token_kind == Kind.Ignored:
1895	done = True
1896
1897	else:
1898	# LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1899	# so to test for ESAC, we can read ) before getting a chance to
1900	# PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1901	# token and do it again.
1902
1903	# We get Id.Op_RParen at top level: case x in x) ;; esac
1904	# We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1905	if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1906	# Rewind before it's used
1907	assert self.next_lex_mode == lex_mode_e.Undefined
1908	if self.lexer.MaybeUnreadOne():
1909	if self.token_type == Id.Eof_RParen:
1910	# Redo translation
1911	self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1912	self._SetNext(lex_mode)
1913
1914	done = True # anything we don't recognize means we're done
1915
1916	if not done:
1917	self._SetNext(lex_mode)
1918	num_parts += 1
1919
1920	if (self.parse_opts.parse_brace() and num_parts > 1 and
1921	brace_count != 0):
1922	# accept { and }, but not foo{
1923	p_die(
1924	'Word has unbalanced { }. Maybe add a space or quote it like \{',
1925	loc.Word(w))
1926
1927	if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1928	p_die('Unexpected parts after triple quoted string',
1929	loc.WordPart(w.parts[-1]))
1930
1931	if 0:
1932	from _devbuild.gen.syntax_asdl import word_part_str
1933	word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1934	WORD_HIST[word_key] += 1
1935	return w
1936
1937	def _ReadArithWord(self):
1938	# type: () -> Optional[word_t]
1939	""" Helper for ReadArithWord() """
1940	self._GetToken()
1941
1942	if self.token_kind == Kind.Unknown:
1943	# e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1944	p_die(
1945	'Unexpected token while parsing arithmetic: %r' %
1946	lexer.TokenVal(self.cur_token), self.cur_token)
1947
1948	elif self.token_kind == Kind.Eof:
1949	return self.cur_token
1950
1951	elif self.token_kind == Kind.Ignored:
1952	# Space should be ignored.
1953	self._SetNext(lex_mode_e.Arith)
1954	return None
1955
1956	elif self.token_kind in (Kind.Arith, Kind.Right):
1957	# Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1958	self._SetNext(lex_mode_e.Arith)
1959	return self.cur_token
1960
1961	elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1962	return self._ReadCompoundWord(lex_mode_e.Arith)
1963
1964	else:
1965	raise AssertionError(self.cur_token)
1966
1967	def _ReadWord(self, word_mode):
1968	# type: (lex_mode_t) -> Optional[word_t]
1969	"""Helper function for ReadWord()."""
1970
1971	# Change the pseudo lexer mode to a real lexer mode
1972	if word_mode == lex_mode_e.ShCommandFakeBrack:
1973	lex_mode = lex_mode_e.ShCommand
1974	else:
1975	lex_mode = word_mode
1976
1977	self._GetToken()
1978
1979	if self.token_kind == Kind.Eof:
1980	# No advance
1981	return self.cur_token
1982
1983	# Allow Arith for ) at end of for loop?
1984	elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1985	self._SetNext(lex_mode)
1986
1987	# Newlines are complicated. See 3x2 matrix in the comment about
1988	# self.multiline and self.newline_state above.
1989	if self.token_type == Id.Op_Newline:
1990	if self.multiline:
1991	if self.newline_state > 1:
1992	# This points at a blank line, but at least it gives the line number
1993	p_die('Invalid blank line in multiline mode',
1994	self.cur_token)
1995	return None
1996
1997	if self.returned_newline: # skip
1998	return None
1999
2000	return self.cur_token
2001
2002	elif self.token_kind == Kind.Right:
2003	if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2004	Id.Right_CasePat,
2005	Id.Right_ShArrayLiteral):
2006	raise AssertionError(self.cur_token)
2007
2008	self._SetNext(lex_mode)
2009	return self.cur_token
2010
2011	elif self.token_kind in (Kind.Ignored, Kind.WS):
2012	self._SetNext(lex_mode)
2013	return None
2014
2015	else:
2016	assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2017	Kind.Left, Kind.KW, Kind.ControlFlow,
2018	Kind.BoolUnary, Kind.BoolBinary,
2019	Kind.ExtGlob,
2020	Kind.BashRegex), 'Unhandled token kind'
2021
2022	if (word_mode == lex_mode_e.ShCommandFakeBrack and
2023	self.parse_opts.parse_bracket() and
2024	self.token_type == Id.Lit_LBracket):
2025	# Change [ from Kind.Lit -> Kind.Op
2026	# So CommandParser can treat
2027	# assert [42 === x]
2028	# like
2029	# json write (x)
2030	bracket_word = self.cur_token
2031	bracket_word.id = Id.Op_LBracket
2032
2033	self._SetNext(lex_mode)
2034	return bracket_word
2035
2036	# We're beginning a word. If we see Id.Lit_Pound, change to
2037	# lex_mode_e.Comment and read until end of line.
2038	if self.token_type == Id.Lit_Pound:
2039	self._SetNext(lex_mode_e.Comment)
2040	self._GetToken()
2041
2042	# NOTE: The # could be the last character in the file. It can't be
2043	# Eof_{RParen,Backtick} because #) and #` are comments.
2044	assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2045	self.cur_token
2046
2047	# The next iteration will go into Kind.Ignored and set lex state to
2048	# lex_mode_e.ShCommand/etc.
2049	return None # tell ReadWord() to try again after comment
2050
2051	elif self.token_type == Id.Lit_TPound: ### doc comment
2052	self._SetNext(lex_mode_e.Comment)
2053	self._GetToken()
2054
2055	if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2056	return self.cur_token
2057
2058	return None # tell ReadWord() to try again after comment
2059
2060	else:
2061	# r'' u'' b''
2062	if (self.token_type == Id.Lit_Chars and
2063	self.lexer.LookAheadOne(
2064	lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2065
2066	# When shopt -s parse_raw_string:
2067	# echo r'hi' is like echo 'hi'
2068	#
2069	# echo u'\u{3bc}' b'\yff' works
2070
2071	tok = self.cur_token
2072	if self.parse_opts.parse_ysh_string():
2073	if lexer.TokenEquals(tok, 'r'):
2074	left_id = Id.Left_RSingleQuote
2075	elif lexer.TokenEquals(tok, 'u'):
2076	left_id = Id.Left_USingleQuote
2077	elif lexer.TokenEquals(tok, 'b'):
2078	left_id = Id.Left_BSingleQuote
2079	else:
2080	left_id = Id.Undefined_Tok
2081
2082	if left_id != Id.Undefined_Tok:
2083	# skip the r, and then 'foo' will be read as normal
2084	self._SetNext(lex_mode_e.ShCommand)
2085
2086	self._GetToken()
2087	assert self.token_type == Id.Left_SingleQuote, self.token_type
2088
2089	# Read the word in a different lexer mode
2090	return self._ReadYshSingleQuoted(left_id)
2091
2092	return self._ReadCompoundWord(lex_mode)
2093
2094	def ParseVarRef(self):
2095	# type: () -> BracedVarSub
2096	"""DYNAMIC parsing of what's inside ${!ref}
2097
2098	# Same as VarOf production
2099	VarRefExpr = VarOf EOF
2100	"""
2101	self._SetNext(lex_mode_e.VSub_1)
2102
2103	self._GetToken()
2104	if self.token_kind != Kind.VSub:
2105	p_die('Expected var name', self.cur_token)
2106
2107	part = self._ParseVarOf()
2108	# NOTE: no ${ } means no part.left and part.right
2109	part.left = part.token # cheat to make test pass
2110	part.right = part.token
2111
2112	self._GetToken()
2113	if self.token_type != Id.Eof_Real:
2114	p_die('Expected end of var ref expression', self.cur_token)
2115	return part
2116
2117	def LookPastSpace(self):
2118	# type: () -> Id_t
2119	"""Look ahead to the next token.
2120
2121	For the CommandParser to recognize
2122	array= (1 2 3)
2123	YSH for ( versus bash for ((
2124	YSH if ( versus if test
2125	YSH while ( versus while test
2126	YSH bare assignment 'grep =' versus 'grep foo'
2127	"""
2128	assert self.token_type != Id.Undefined_Tok
2129	if self.cur_token.id == Id.WS_Space:
2130	id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2131	else:
2132	id_ = self.cur_token.id
2133	return id_
2134
2135	def LookAheadFuncParens(self):
2136	# type: () -> bool
2137	"""Special lookahead for f( ) { echo hi; } to check for ( )"""
2138	assert self.token_type != Id.Undefined_Tok
2139
2140	# We have to handle 2 cases because we buffer a token
2141	if self.cur_token.id == Id.Op_LParen: # saw funcname(
2142	return self.lexer.LookAheadFuncParens(1) # go back one char
2143
2144	elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2145	return self.lexer.LookAheadFuncParens(0)
2146
2147	else:
2148	return False
2149
2150	def ReadWord(self, word_mode):
2151	# type: (lex_mode_t) -> word_t
2152	"""Read the next word, using the given lexer mode.
2153
2154	This is a stateful wrapper for the stateless _ReadWord function.
2155	"""
2156	assert word_mode in (lex_mode_e.ShCommand,
2157	lex_mode_e.ShCommandFakeBrack,
2158	lex_mode_e.DBracket, lex_mode_e.BashRegex)
2159
2160	if self.buffered_word: # For integration with pgen2
2161	w = self.buffered_word
2162	self.buffered_word = None
2163	else:
2164	while True:
2165	w = self._ReadWord(word_mode)
2166	if w is not None:
2167	break
2168
2169	self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2170	return w
2171
2172	def ReadArithWord(self):
2173	# type: () -> word_t
2174	while True:
2175	w = self._ReadArithWord()
2176	if w is not None:
2177	break
2178	return w
2179
2180	def ReadHereDocBody(self, parts):
2181	# type: (List[word_part_t]) -> None
2182	"""
2183	A here doc is like a double quoted context, except " isn't special.
2184	"""
2185	self._ReadLikeDQ(None, False, parts)
2186	# Returns nothing
2187
2188	def ReadForPlugin(self):
2189	# type: () -> CompoundWord
2190	"""For $PS1, $PS4, etc.
2191
2192	This is just like reading a here doc line. "\n" is allowed, as
2193	well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2194	"""
2195	w = CompoundWord([])
2196	self._ReadLikeDQ(None, False, w.parts)
2197	return w
2198
2199	def EmitDocToken(self, b):
2200	# type: (bool) -> None
2201	self.emit_doc_token = b
2202
2203	def Multiline(self, b):
2204	# type: (bool) -> None
2205	self.multiline = b
2206
2207
2208	if 0:
2209	import collections
2210	WORD_HIST = collections.Counter()
2211
2212	# vim: sw=4