OILS / opy / _regtest / src / osh / word_parse.py

1239 lines, 679 significant
#!/usr/bin/env python
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.
"""

from asdl import const

from osh.meta import Id, Kind, LookupKind
from core import braces
from core import word
from core import tdop
from core import util

from osh import arith_parse
from osh.meta import ast, types

word_part_e = ast.word_part_e
word_e = ast.word_e
lex_mode_e = types.lex_mode_e

p_die = util.p_die
log = util.log

# Substitutions can be nested, but which inner subs are allowed depends on the
# outer sub.  See _ReadLeftParts vs. _ReadDoubleQuotedLeftParts.

# lex_mode_e.OUTER
#   All subs and quotes are allowed --
#   $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()
#
# lex_mode_e.DQ
#   Var, Command, Arith, but no quotes
#   $v ${v}   $() ``   $(())
#   No process substitution.
#
# lex_mode_e.ARITH
#   Similar to DQ: Var, Command, Arith sub.  No process sub.  bash has no
#   quotes, but we are changing this in oil.  We are adding ALL FOUR kinds of
#   quotes, because we need those for associative array indexing.
46#
47# lex_mode_e.VS_ARG_UNQ
48# Like UNQUOTED, except we stop at }. Everything is allowed, even process
49# substitution.
50#
51# ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
52# ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
53#
54# But space is SIGNIFICANT. ${a:- b }
55# So you should NOT just read a bunch of words after :-, unless you also
56# preserve the space tokens between.
57# In other words, like DS_VS_ARG, except SINGLE Quotes allowed?
58#
59# lex_mode_e.VS_ARG_DQ
60# Can't be lex_mode_e.DQ because here we respect $' and $" tokens, while <(
61# token is not respected.
62#
63# Like VS_ARG_UNQ, but single quotes are NOT respected (they appear
64# literally), and process substitution is not respected (ditto).
65#
66# "" and $'' and $"" are respected, but not ''. I need a matrix for this.
67#
68# Like DQ, except nested "" and $'' and $"" are RESPECTED.
69#
#   It's weird that double quotes are allowed.  Not sure why that would be.
#   Unquoted is also allowed, so " a "b" c ", $'' and $"" are lame, because
#   they don't appear in the DQ context.  I think I should parse those but
#   DISALLOW.  You should always make $'' and $"" a separate var!

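# A rough sketch of that matrix (derived from the notes above; the dispatch
# in _ReadLeftParts and _ReadDoubleQuotedLeftParts below is authoritative):
#
#                  $v ${v}   $() ``   $(())   ''    ""    $'' $""   <() >()
#   OUTER          yes       yes      yes     yes   yes   yes       yes
#   DQ             yes       yes      yes     no    n/a   no        no
#   VS_ARG_UNQ     yes       yes      yes     yes   yes   yes       yes
#   VS_ARG_DQ      yes       yes      yes     no    yes   yes       no
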
class WordParser(object):

  def __init__(self, lexer, line_reader, lex_mode=lex_mode_e.OUTER):
    self.lexer = lexer
    self.line_reader = line_reader
    self.Reset(lex_mode=lex_mode)

  def _Peek(self):
    """Helper method."""
    if self.next_lex_mode is not None:
      self.prev_token = self.cur_token  # for completion
      self.cur_token = self.lexer.Read(self.next_lex_mode)
      self.token_kind = LookupKind(self.cur_token.id)
      self.token_type = self.cur_token.id

      self.next_lex_mode = None
    return self.cur_token

  def _Next(self, lex_mode):
    """Set the next lex state, but don't actually read a token.

    We need this for proper interactive parsing.
    """
    self.next_lex_mode = lex_mode

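  # Illustrative note (a sketch, not in the original file): _Next() only
  # schedules a lex mode; _Peek() performs the deferred read.  A typical
  # sequence inside this class is:
  #
  #   self._Next(lex_mode_e.ARITH)   # next token will be lexed in ARITH mode
  #   self._Peek()                   # reads it; sets cur_token, token_kind...
  #
  # Deferring the read like this is what makes interactive parsing work: no
  # token is pulled from the line reader before it is needed.
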
  def Reset(self, lex_mode=lex_mode_e.OUTER):
    """Called by interactive loop."""
    # For _Peek()
    self.prev_token = None  # for completion
    self.cur_token = None
    self.token_kind = Kind.Undefined
    self.token_type = Id.Undefined_Tok

    self.next_lex_mode = lex_mode

    # For newline.  TODO: I think we can do this iteratively, without member
    # state.
    self.cursor = None
    self.cursor_was_newline = False

    self.error_stack = []

  def AddErrorContext(self, msg, *args, **kwargs):
    err = util.ParseError(msg, *args, **kwargs)
    self.error_stack.append(err)

  def Error(self):
    return self.error_stack

  def _BadToken(self, msg, token):
    """
    Args:
      msg: format string with a single %s token
      token: Token
    """
    self.AddErrorContext(msg, token, token=token)

  def PrevToken(self):
    """Inspect state.  Used by completion.

    cur_token is usually Id.Op_Newline \n, so we need the previous one.
    """
    return self.prev_token

  def _ReadVarOpArg(self, arg_lex_mode, eof_type=Id.Undefined_Tok,
                    empty_ok=True):
    # NOTE: Operators like | and < are not treated as special, so ${a:- | >}
    # is valid, even when unquoted.
    self._Next(arg_lex_mode)
    self._Peek()

    w = self._ReadCompoundWord(
        lex_mode=arg_lex_mode, eof_type=eof_type, empty_ok=empty_ok)
    # This is for "${s:-}", ${s/a//}, etc.  It is analogous to
    # LooksLikeAssignment where we turn x= into x=''.  It has the same
    # potential problem of not having spids.
    #
    # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
    # return a CompoundWord with no parts, which is explicitly checked with a
    # custom error message.
    if not w.parts and arg_lex_mode == lex_mode_e.VS_ARG_DQ and empty_ok:
      w.parts.append(ast.EmptyPart())
    return w

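  # Illustrative example (a sketch, not in the original file): in the double
  # quoted arg mode, an empty operator argument still yields one part:
  #
  #   "${s:-}"  ->  the arg parses as CompoundWord([EmptyPart()])
  #
  # With empty_ok=False (the PatSub pattern), an empty arg instead parses as
  # a CompoundWord with no parts, which the caller checks and rejects.
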
  def _ReadSliceArg(self):
    """Read an arithmetic expression for either part of ${a : i+1 : i+2}."""
    anode = self._ReadArithExpr(do_next=False)
    return anode

  def _ReadSliceVarOp(self):
    """ VarOf ':' ArithExpr (':' ArithExpr )? """
    self._Next(lex_mode_e.ARITH)
    self._Peek()
    if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
      begin = None  # no beginning specified
    else:
      begin = self._ReadSliceArg()
      if not begin: return None
      #print('BEGIN', begin)
      #print('BVS2', self.cur_token)

    if self.token_type == Id.Arith_RBrace:
      return ast.Slice(begin, None)  # No length specified

    # Id.Arith_Colon is a pun for Id.VOp2_Colon
    elif self.token_type == Id.Arith_Colon:
      self._Next(lex_mode_e.ARITH)
      length = self._ReadSliceArg()
      if not length: return None

      #print('after colon', self.cur_token)
      return ast.Slice(begin, length)

    else:
      self.AddErrorContext("Unexpected token in slice: %s", self.cur_token)
      return None

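  # Illustrative examples (a sketch, not in the original file):
  #
  #   ${a:1:2}  ->  ast.Slice(begin=1, length=2)
  #   ${a:1}    ->  ast.Slice(begin=1, length=None)   # stopped at }
  #   ${a::2}   ->  ast.Slice(begin=None, length=2)   # leading : = no begin
  #
  # Both begin and length are full arithmetic expressions, as in
  # ${a : i+1 : i+2}.
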
  def _ReadPatSubVarOp(self, lex_mode):
    """
    Match  = ('/' | '#' | '%') WORD
    VarSub = ...
           | VarOf '/' Match '/' WORD
    """
    do_all = False
    do_prefix = False
    do_suffix = False

    pat = self._ReadVarOpArg(lex_mode, eof_type=Id.Lit_Slash, empty_ok=False)
    if not pat: return None

    if len(pat.parts) == 1:
      ok, s, quoted = word.StaticEval(pat)
      if ok and s == '/' and not quoted:  # Looks like ${a////c}, read again
        self._Next(lex_mode)
        self._Peek()
        p = ast.LiteralPart(self.cur_token)
        pat.parts.append(p)

    if len(pat.parts) == 0:
      self._BadToken("Pattern must not be empty: %r", token=self.cur_token)
      return None
    else:
      first_part = pat.parts[0]
      if first_part.tag == word_part_e.LiteralPart:
        lit_id = first_part.token.id
        if lit_id == Id.Lit_Slash:
          do_all = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Pound:
          do_prefix = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Percent:
          do_suffix = True
          pat.parts.pop(0)

    #self._Peek()
    if self.token_type == Id.Right_VarSub:
      # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
      return ast.PatSub(pat, None, do_all, do_prefix, do_suffix)

    elif self.token_type == Id.Lit_Slash:
      replace = self._ReadVarOpArg(lex_mode)  # do not stop at /
      if not replace: return None

      self._Peek()
      if self.token_type == Id.Right_VarSub:
        return ast.PatSub(pat, replace, do_all, do_prefix, do_suffix)

      else:
        self._BadToken("Expected } after pat sub, got %s", self.cur_token)
        return None

    else:
      self._BadToken("Expected } after pat sub, got %s", self.cur_token)
      return None

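  # Illustrative examples (a sketch, not in the original file):
  #
  #   ${v/a/b}   ->  PatSub(pat a, replace b)      replace first match
  #   ${v//a/b}  ->  do_all=True     leading / stripped from the pattern
  #   ${v/#a/b}  ->  do_prefix=True  leading # stripped from the pattern
  #   ${v/%a/b}  ->  do_suffix=True  leading % stripped from the pattern
  #   ${v/a}     ->  PatSub(pat a, replace None)   empty replacement
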
  def _ReadSubscript(self):
    """ Subscript = '[' ('@' | '*' | ArithExpr) ']'
    """
    # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
    # expression.
    t2 = self.lexer.LookAhead(lex_mode_e.ARITH)
    if t2.id in (Id.Lit_At, Id.Arith_Star):
      op = ast.WholeArray(t2.id)

      self._Next(lex_mode_e.ARITH)  # skip past [
      self._Peek()
      self._Next(lex_mode_e.ARITH)  # skip past @
      self._Peek()
    else:
      anode = self._ReadArithExpr()
      if not anode:
        return None
      op = ast.ArrayIndex(anode)

    #self._Peek()  # Can't do this here.  Should the test go elsewhere?
    if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
      self._BadToken('Expected ] after subscript, got %s', self.cur_token)
      return None

    self._Next(lex_mode_e.VS_2)  # skip past ]
    self._Peek()  # Needed to be in the same spot as no subscript

    return op

  def _ParseVarOf(self):
    """
    VarOf = NAME Subscript?
          | NUMBER      # no subscript allowed, none of these are arrays
                        # ${@[1]} doesn't work, even though slicing does
          | VarSymbol
    """
    self._Peek()
    name_token = self.cur_token
    self._Next(lex_mode_e.VS_2)

    self._Peek()  # Check for []
    if self.token_type == Id.VOp2_LBracket:
      bracket_op = self._ReadSubscript()
      if not bracket_op: return None
    else:
      bracket_op = None

    part = ast.BracedVarSub(name_token)
    part.bracket_op = bracket_op
    return part

  def _ParseVarExpr(self, arg_lex_mode):
    """
    Start parsing at the op -- we already skipped past the name.
    """
    part = self._ParseVarOf()
    if not part: return None

    self._Peek()
    if self.token_type == Id.Right_VarSub:
      return part  # no ops

    # Or maybe this is a VarOpKind

    op_kind = self.token_kind

    if op_kind == Kind.VTest:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after test arg: %s', self.cur_token)
        return None

      part.suffix_op = ast.StringUnary(op_id, arg_word)

    elif op_kind == Kind.VOp1:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after unary op: %s', self.cur_token)
        return None

      op = ast.StringUnary(op_id, arg_word)
      part.suffix_op = op

    elif op_kind == Kind.VOp2:
      if self.token_type == Id.VOp2_Slash:
        op = self._ReadPatSubVarOp(arg_lex_mode)
        if not op: return None
        # Checked by the method above
        assert self.token_type == Id.Right_VarSub, self.cur_token

      elif self.token_type == Id.VOp2_Colon:
        op = self._ReadSliceVarOp()
        if not op: return None
        if self.token_type != Id.Arith_RBrace:
          self._BadToken('Unexpected token after slice: %s', self.cur_token)
          return None

      else:
        p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

      part.suffix_op = op

    # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
    # mode.  It's redundantly checked above.
    if self.token_type not in (Id.Right_VarSub, Id.Arith_RBrace):
      self._BadToken('Unexpected token after var sub: %s', self.cur_token)
      return None

    # Now look for ops
    return part

  def _ReadBracedBracedVarSub(self, d_quoted=False):
    """For the ${} expression language.

    NAME        = [a-zA-Z_][a-zA-Z0-9_]*
    NUMBER      = [0-9]+                    # ${10}, ${11}, ...

    Subscript   = '[' ('@' | '*' | ArithExpr) ']'
    VarSymbol   = '!' | '@' | '#' | ...
    VarOf       = NAME Subscript?
                | NUMBER      # no subscript allowed; none of these are arrays
                              # ${@[1]} doesn't work, even though slicing does
                | VarSymbol

    TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
    STRIP_OP    = '#' | '##' | '%' | '%%'
    CASE_OP     = ',' | ',,' | '^' | '^^'

    UnaryOp     = TEST_OP | STRIP_OP | CASE_OP | ...
    Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
    VarExpr     = VarOf
                | VarOf UnaryOp WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

    LengthExpr  = '#' VarOf    # can't apply operators after length

    RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                               # ${!ref[0]} vs ${!keys[@]} resolved later

    PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

    VarSub      = LengthExpr
                | RefOrKeys
                | PrefixQuery
                | VarExpr

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
      that it's also vectorized.

    Strictness over bash:
    - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
      grammar
    - ! and # prefixes can't be composed, even though named refs can be
      composed with other operators
    - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
      strip a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, and the ! can.  However:

    ${a[@]:1:2} is not allowed
    ${#a[@]:1:2} is allowed, but gives the wrong answer
    """
    left_spid = self.cur_token.span_id

    if d_quoted:
      arg_lex_mode = lex_mode_e.VS_ARG_DQ
    else:
      arg_lex_mode = lex_mode_e.VS_ARG_UNQ

    self._Next(lex_mode_e.VS_1)
    self._Peek()

    ty = self.token_type

    if ty == Id.VSub_Pound:
      # Disambiguate
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t# LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name, '#' is the prefix
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarOf()

        self._Peek()
        if self.token_type != Id.Right_VarSub:
          self._BadToken("Expected } after length expression, got %r",
                         self.cur_token)
          return None

        part.prefix_op = Id.VSub_Pound  # length

      else:  # not a prefix, '#' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    elif ty == Id.VSub_Bang:
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t! LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name, '!' is the prefix
        # ${!a} -- this is a ref
        # ${!3} -- this is a ref
        # ${!a[1]} -- this is a ref
        # ${!a[@]} -- this is a keys query
        # No lookahead -- do it in a second step, or at runtime
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

        part.prefix_op = Id.VSub_Bang

      else:  # not a prefix, '!' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    # VS_NAME, VS_NUMBER, symbol that isn't # or !
    elif self.token_kind == Kind.VSub:
      part = self._ParseVarExpr(arg_lex_mode)
      if not part: return None

    else:
      # e.g. ${^}
      p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

    part.spids.append(left_spid)

    # Does this work?
    right_spid = self.cur_token.span_id
    part.spids.append(right_spid)

    return part

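  # Illustrative examples (a sketch, not in the original file):
  #
  #   ${#s}    ->  prefix_op = Id.VSub_Pound  (length of s)
  #   ${#}     ->  no prefix; '#' itself is the variable
  #   ${!ref}  ->  prefix_op = Id.VSub_Bang   (${!a[@]} vs. ${!a[1]} is
  #                resolved in a later step, or at runtime)
  #   ${s:-x}  ->  suffix_op = StringUnary for the ':-' test op
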
  def _ReadSingleQuotedPart(self, lex_mode):
    left = self.cur_token
    tokens = []

    done = False
    while not done:
      self._Next(lex_mode)
      self._Peek()

      # Kind.Char emitted in DOLLAR_SQ state
      if self.token_kind in (Kind.Lit, Kind.Char):
        tokens.append(self.cur_token)

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext('Unexpected EOF in single-quoted string')
        return False

      elif self.token_kind == Kind.Right:
        done = True  # assume Id.Right_SingleQuote

      else:
        raise AssertionError(
            'Unhandled token in single-quoted part %s (%d)' %
            (self.cur_token, self.token_kind))

    return ast.SingleQuotedPart(left, tokens)

  def _ReadDoubleQuotedLeftParts(self):
    """Read substitution parts in a double quoted context."""
    if self.token_type in (Id.Left_CommandSub, Id.Left_Backtick):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=True)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError(self.cur_token)

  def _ReadLeftParts(self):
    """Read substitutions and quoted strings."""

    if self.token_type == Id.Left_DoubleQuote:
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_DollarDoubleQuote:
      # NOTE: $"" is treated as "" for now.  Does it make sense to add the
      # token to the part?
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_SingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.SQ)

    if self.token_type == Id.Left_DollarSingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.DOLLAR_SQ)

    if self.token_type in (
        Id.Left_CommandSub, Id.Left_Backtick, Id.Left_ProcSubIn,
        Id.Left_ProcSubOut):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=False)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError('%s not handled' % self.cur_token)

  def _ReadExtGlobPart(self):
    """
    Grammar:
      Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
      LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
      RIGHT   = ')'
      ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty

    CompoundWord includes ExtGlobPart.
    """
    left_token = self.cur_token
    arms = []
    part = ast.ExtGlobPart(left_token, arms)  # return value
    part.spids.append(left_token.span_id)

    self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
    self._Next(lex_mode_e.EXTGLOB)  # advance past LEFT

    read_word = False  # did we just read a word?  To handle @(||).

    while True:
      self._Peek()
      #log('t %r', self.cur_token)

      if self.token_type == Id.Right_ExtGlob:
        if not read_word:
          arms.append(ast.CompoundWord())
        part.spids.append(self.cur_token.span_id)
        break

      elif self.token_type == Id.Op_Pipe:
        if not read_word:
          arms.append(ast.CompoundWord())
        read_word = False
        self._Next(lex_mode_e.EXTGLOB)

      # lex mode EXTGLOB should only produce these 4 kinds of tokens
      elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
        w = self._ReadCompoundWord(lex_mode=lex_mode_e.EXTGLOB)
        arms.append(w)
        read_word = True

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext(
            'Unexpected EOF reading extended glob that began here',
            token=left_token)
        return None

      else:
        raise AssertionError('Unexpected token %r' % self.cur_token)

    return part

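  # Illustrative examples (a sketch, not in the original file):
  #
  #   @(foo|bar)  ->  ExtGlobPart with arms [foo, bar]
  #   @(foo|)     ->  arms [foo, <empty CompoundWord>]; empty arms are legal
  #   @(||)       ->  three empty arms, tracked via the read_word flag above
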
  def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
    """
    Args:
      eof_type: for stopping at }, Id.Lit_RBrace
      here_doc: Whether we are reading in a here doc context

    Also handles ${foo%%a b c} -- treat the operator argument as double
    quoted until you hit the closing }.
    """
    quoted_part = ast.DoubleQuotedPart()
    left_spid = const.NO_INTEGER
    right_spid = const.NO_INTEGER  # gets set later

    if self.cur_token is not None:  # None in here doc case
      left_spid = self.cur_token.span_id

    done = False
    while not done:
      self._Next(lex_mode_e.DQ)
      self._Peek()
      #print(self.cur_token)

      if self.token_type == eof_type:  # e.g. stop at }
        done = True
        continue

      elif self.token_kind == Kind.Lit:
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Left:
        part = self._ReadDoubleQuotedLeftParts()
        if not part:
          return None
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Right:
        assert self.token_type == Id.Right_DoubleQuote
        if here_doc:
          # Turn Id.Right_DoubleQuote into a literal part
          quoted_part.parts.append(ast.LiteralPart(self.cur_token))
        else:
          done = True  # assume Id.Right_DoubleQuote
          right_spid = self.cur_token.span_id

      elif self.token_kind == Kind.Eof:
        if here_doc:  # here docs will have an EOF in their token stream
          done = True
        else:
          self.AddErrorContext(
              'Unexpected EOF reading double-quoted string that began here',
              span_id=left_spid)
          return False

      else:
        raise AssertionError(self.cur_token)

    quoted_part.spids.extend((left_spid, right_spid))
    return quoted_part

  def _ReadCommandSubPart(self, token_type):
    """
    NOTE: This is not in the grammar, because word parts aren't in the
    grammar!

    command_sub = '$(' command_list ')'
    """
    left_token = self.cur_token
    left_spid = left_token.span_id

    #print('_ReadCommandSubPart', self.cur_token)
    self._Next(lex_mode_e.OUTER)  # advance past $( or `

    # Set the lexer in a state so ) becomes the EOF token.
    #print('_ReadCommandSubPart lexer.PushHint ) -> EOF')
    if token_type in (
        Id.Left_CommandSub, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
      self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    elif token_type == Id.Left_Backtick:
      self.lexer.PushHint(Id.Left_Backtick, Id.Eof_Backtick)
    else:
      raise AssertionError(self.token_type)

    from osh import parse_lib
    c_parser = parse_lib.MakeParserForCommandSub(self.line_reader, self.lexer)

    node = c_parser.ParseWholeFile()  # `` and $() allowed
    if not node:
      # Example of parse error:
      #   echo $(cat |)  OR
      #   echo `cat |`
      error_stack = c_parser.Error()
      self.error_stack.extend(error_stack)
      print(self.error_stack)
      self.AddErrorContext('Error parsing command list in command sub')
      return None

    # Hm this creates its own word parser, which is thrown away?
    #print('X', self.cur_token)
    right_spid = c_parser.w_parser.cur_token.span_id

    cs_part = ast.CommandSubPart(node, left_token)
    cs_part.spids.append(left_spid)
    cs_part.spids.append(right_spid)
    return cs_part

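  # Illustrative note (a sketch, not in the original file): PushHint is how
  # the parser finds the end of $( ... ).  For input like
  #
  #   echo $(ls)
  #
  # the hint retranslates the unbalanced ) from Id.Op_RParen to
  # Id.Eof_RParen, so the recursively invoked command parser sees EOF exactly
  # at the closing paren and parses `ls` as a complete command list.
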
  def _ReadArithExpr(self, do_next=True):
    """Read and parse an arithmetic expression in various contexts.

    $(( 1+2 ))
    (( a=1+2 ))
    ${a[ 1+2 ]}
    ${a : 1+2 : 1+2}

    See tests/arith-context.test.sh for ambiguous cases.

    ${a[a[0]]} is valid   # VS_RBRACKET vs Id.Arith_RBracket

    ${s : a<b?0:1 : 1}    # VS_COLON vs Id.Arith_Colon

    TODO: Instead of having an eof_type, I think we should just run the arith
    parser until it's done.  That will take care of both : and ].  Then we
    switch the state back.

    See the assertion in ArithParser.Parse() -- unexpected extra input.
    """
    if do_next:
      self._Next(lex_mode_e.ARITH)
    # calls self.ReadWord(lex_mode_e.ARITH)
    a_parser = tdop.TdopParser(arith_parse.SPEC, self)
    anode = a_parser.Parse()
    if not anode:
      error_stack = a_parser.Error()
      self.error_stack.extend(error_stack)
    return anode  # could be None

  def _ReadArithSubPart(self):
    """
    Read an arith substitution, which contains an arith expression, e.g.
    $((a + 1)).
    """
    left_span_id = self.cur_token.span_id

    # The second ) needs to be disambiguated in stuff like:
    #   $(echo $(( 1+2 )) )
    self.lexer.PushHint(Id.Op_RParen, Id.Right_ArithSub)

    # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
    # could save the lexer/reader state here, and retry if the arithmetic
    # parse fails.  But we can almost always catch this at parse time.  There
    # could be some exceptions like:
    #   $((echo * foo))  # looks like multiplication
    #   $((echo / foo))  # looks like division

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None

    self._Next(lex_mode_e.OUTER)  # TODO: This could be DQ or ARITH too

    # PROBLEM: $(echo $(( 1 + 2 )) )
    # Two right parens break the Id.Eof_RParen scheme
    self._Peek()
    if self.token_type != Id.Right_ArithSub:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def _ReadArithSub2Part(self):
    """Non-standard arith sub $[a + 1]."""
    left_span_id = self.cur_token.span_id

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RBracket:
      self.AddErrorContext("Expected ], got %s", self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def ReadDParen(self):
    """Read ((1+ 2)) -- command context.

    We're using the word parser because it's very similar to _ReadArithExpr
    above.
    """
    # The second ) of )) needs to be disambiguated from a plain Op_RParen.
    # TODO: Be consistent with ReadForExpression below and use
    # lex_mode_e.ARITH?  Then you can get rid of this.
    self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing dparen statement")
      return None

    #print('xx ((', self.cur_token)
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    # PROBLEM: $(echo $(( 1 + 2 )) )
    self._Peek()
    if self.token_type != Id.Op_DRightParen:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return anode

  def ReadForExpression(self):
    """Read ((i=0; i<5; ++i)) -- part of command context."""
    # No PushHint because we're in arith state.
    #self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    self._Next(lex_mode_e.ARITH)  # skip over ((

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty init')
      init_node = None
    else:
      init_node = self._ReadArithExpr(do_next=False)
      if not init_node:
        self.AddErrorContext("Error parsing for init")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('INIT', init_node)

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty condition')
      cond_node = None
    else:
      cond_node = self._ReadArithExpr(do_next=False)
      if not cond_node:
        self.AddErrorContext("Error parsing for cond")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('COND', cond_node)

    self._Peek()
    if self.token_type == Id.Arith_RParen:
      #print('Got empty update')
      update_node = None
    else:
      update_node = self._ReadArithExpr(do_next=False)
      if not update_node:
        self.AddErrorContext("Error parsing for update")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('UPDATE', update_node)

    #print('TT', self.cur_token)
    # Second paren
    self._Peek()
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected right paren to end for loop expression, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return ast.ForExpr(init_node, cond_node, update_node)

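  # Illustrative examples (a sketch, not in the original file):
  #
  #   for ((i=0; i<5; ++i)); do ...  ->  ForExpr(init, cond, update)
  #   for ((;;)); do ...             ->  ForExpr(None, None, None)
  #
  # Each of the three sections may independently be empty, as handled above.
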
  def _ReadArrayLiteralPart(self):
    self._Next(lex_mode_e.OUTER)  # advance past (
    self._Peek()
    if self.cur_token.id != Id.Op_LParen:
      self.AddErrorContext('Expected ( after =', token=self.cur_token)
      return None

    # MUST use a new word parser (with same lexer).
    w_parser = WordParser(self.lexer, self.line_reader)
    words = []
    while True:
      w = w_parser.ReadWord(lex_mode_e.OUTER)
      if not w:
        self.error_stack.extend(w_parser.Error())
        return None

      if w.tag == word_e.TokenWord:
        word_id = word.CommandId(w)
        if word_id == Id.Right_ArrayLiteral:
          break
        # Unlike command parsing, array parsing allows embedded \n.
        elif word_id == Id.Op_Newline:
          continue
        else:
          self.AddErrorContext(
              'Unexpected word in array literal: %s', w, word=w)
          return None

      words.append(w)

    words2 = braces.BraceDetectAll(words)
    words3 = word.TildeDetectAll(words2)

    return ast.ArrayLiteralPart(words3)

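  # Illustrative example (a sketch, not in the original file):
  #
  #   a=(1 2 3)   ->  ArrayLiteralPart with words [1, 2, 3]
  #   a=(1
  #      2 3)     ->  the same; newlines between elements are allowed
  #
  # Brace detection and tilde detection run over the collected words
  # afterward (BraceDetectAll, TildeDetectAll).
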
  def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
                        lex_mode=lex_mode_e.OUTER, empty_ok=True):
    """
    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
    could be an operator delimiting a compound word.  Can we change lexer
    modes and remove this special case?
    """
    #print('_ReadCompoundWord', lex_mode)
    word = ast.CompoundWord()

    num_parts = 0
    done = False
    while not done:
      allow_done = empty_ok or num_parts != 0
      self._Peek()
      #print('CW', self.cur_token)
      if allow_done and self.token_type == eof_type:
        done = True  # e.g. for ${foo//pat/replace}

      # Keywords like "for" are treated like literals
      elif self.token_kind in (
          Kind.Lit, Kind.KW, Kind.Assign, Kind.ControlFlow, Kind.BoolUnary,
          Kind.BoolBinary):
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
          #part.xspans.append(self.cur_token.span_id)

        word.parts.append(part)

        if self.token_type == Id.Lit_VarLike:
          #print('@', self.cursor)
          #print('@', self.cur_token)

          t = self.lexer.LookAhead(lex_mode_e.OUTER)
          if t.id == Id.Op_LParen:
            self.lexer.PushHint(Id.Op_RParen, Id.Right_ArrayLiteral)
            part2 = self._ReadArrayLiteralPart()
            if not part2:
              self.AddErrorContext('_ReadArrayLiteralPart failed')
              return False
            word.parts.append(part2)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        word.parts.append(part)

      elif self.token_kind == Kind.ExtGlob:
        part = self._ReadExtGlobPart()
        if not part:
          return None
        word.parts.append(part)

      elif self.token_kind == Kind.Left:
        #print('_ReadLeftParts')
        part = self._ReadLeftParts()
        if not part:
          return None
        word.parts.append(part)

      # NOT done yet, will advance below
      elif self.token_kind == Kind.Right:
        # Still part of the word; will be done on the next iter.
        if self.token_type == Id.Right_DoubleQuote:
          pass
        elif self.token_type == Id.Right_CommandSub:
          pass
        elif self.token_type == Id.Right_Subshell:
          # LEXER HACK for (case x in x) ;; esac )
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
            self._Next(lex_mode)
          done = True
        else:
          done = True

      elif self.token_kind == Kind.Ignored:
        done = True

      else:
        # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
        # so to test for ESAC, we can read ) before getting a chance to
        # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
        # token and do it again.

        # We get Id.Op_RParen at top level: case x in x) ;; esac
        # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
        if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            if self.token_type == Id.Eof_RParen:
              # Redo translation
              self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
            self._Next(lex_mode)

        done = True  # anything we don't recognize means we're done

      if not done:
        self._Next(lex_mode)
        num_parts += 1
    return word

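  # Illustrative example (a sketch, not in the original file): a compound
  # word is a sequence of adjacent parts.  For instance, foo"bar"$x parses
  # roughly as:
  #
  #   CompoundWord([LiteralPart(foo), DoubleQuotedPart(bar),
  #                 SimpleVarSub($x)])
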
  def _ReadArithWord(self):
    """Helper function for ReadArithWord."""
    #assert self.token_type != Id.Undefined_Tok
    self._Peek()
    #print('_ReadArithWord', self.cur_token)

    if self.token_kind == Kind.Unknown:
      self.AddErrorContext("Unknown token in arith context: %s",
                           self.cur_token, token=self.cur_token)
      return None, False

    elif self.token_kind == Kind.Eof:
      # Just return EOF token
      w = ast.TokenWord(self.cur_token)
      return w, False
      #self.AddErrorContext("Unexpected EOF in arith context: %s",
      #                     self.cur_token, token=self.cur_token)
      #return None, False

    elif self.token_kind == Kind.Ignored:
      # Space should be ignored.  TODO: change this to SPACE_SPACE and
      # SPACE_NEWLINE?  or SPACE_TOK.
      self._Next(lex_mode_e.ARITH)
      return None, True  # Tell wrapper to try again

    elif self.token_kind in (Kind.Arith, Kind.Right):
      # Id.Right_ArithSub IS just a normal token, handled by ArithParser
      self._Next(lex_mode_e.ARITH)
      w = ast.TokenWord(self.cur_token)
      return w, False

    elif self.token_kind in (Kind.Lit, Kind.Left):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ARITH)
      if not w:
        return None, True
      return w, False

    elif self.token_kind == Kind.VSub:
      part = ast.SimpleVarSub(self.cur_token)
      self._Next(lex_mode_e.ARITH)
      w = ast.CompoundWord([part])
      return w, False

    else:
      self._BadToken("Unexpected token parsing arith sub: %s", self.cur_token)
      return None, False

    raise AssertionError("Shouldn't get here")

  def _ReadWord(self, lex_mode):
    """Helper function for Read().

    Returns:
      2-tuple (word, need_more)
        word: Word, or None if there was an error, or need_more is set
        need_more: True if the caller should call us again
    """
    #print('_Read', lex_mode, self.cur_token)
    self._Peek()

    if self.token_kind == Kind.Eof:
      # No advance
      return ast.TokenWord(self.cur_token), False

    # Allow Arith for ) at end of for loop?
    elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
      self._Next(lex_mode)
      if self.token_type == Id.Op_Newline:
        if self.cursor_was_newline:
          #print('SKIP(nl)', self.cur_token)
          return None, True

      return ast.TokenWord(self.cur_token), False

    elif self.token_kind == Kind.Right:
      #print('WordParser.Read: Kind.Right', self.cur_token)
      if self.token_type not in (
          Id.Right_Subshell, Id.Right_FuncDef, Id.Right_CasePat,
          Id.Right_ArrayLiteral):
        raise AssertionError(self.cur_token)

      self._Next(lex_mode)
      return ast.TokenWord(self.cur_token), False

    elif self.token_kind in (Kind.Ignored, Kind.WS):
      self._Next(lex_mode)
      return None, True  # tell Read() to try again

    elif self.token_kind in (
        Kind.VSub, Kind.Lit, Kind.Left, Kind.KW, Kind.Assign,
        Kind.ControlFlow, Kind.BoolUnary, Kind.BoolBinary, Kind.ExtGlob):
      # We're beginning a word.  If we see Id.Lit_Pound, change to
      # lex_mode_e.COMMENT and read until end of line.  (TODO: How to add
      # comments to AST?)

      # TODO: Can we do the same thing for Tilde here?  Enter a state where
      # we look for / too.
      if self.token_type == Id.Lit_Pound:
        self._Next(lex_mode_e.COMMENT)
        self._Peek()

        # NOTE: The # could be the last character in the file.  It can't be
        # Eof_{RParen,Backtick} because #) and #` are comments.
        assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
            self.cur_token

        # The next iteration will go into Kind.Ignored and set lex state to
        # lex_mode_e.OUTER/etc.
        return None, True  # tell Read() to try again after comment

      else:
        w = self._ReadCompoundWord(lex_mode=lex_mode)
        if not w:
          self.AddErrorContext(
              'Error reading command word', token=self.cur_token)
          return None, False
        return w, False

    else:
      raise AssertionError(
          'Unhandled: %s (%s)' % (self.cur_token, self.token_kind))

    raise AssertionError("Shouldn't get here")

  def LookAhead(self):
    """Look ahead to the next token.

    For the command parser to recognize func () { } and array= (1 2 3).  And
    probably coprocesses.
    """
    assert self.token_type != Id.Undefined_Tok
    if self.cur_token.id == Id.WS_Space:
      t = self.lexer.LookAhead(lex_mode_e.OUTER)
    else:
      t = self.cur_token
    return t.id

  def ReadWord(self, lex_mode):
    """Read the next Word.

    Returns:
      Word, or None if there was an error
    """
    # Implementation note: This is a stateful/iterative function that calls
    # the stateless _ReadWord() function.
    while True:
      if lex_mode == lex_mode_e.ARITH:
        # TODO: Can this be unified?
        w, need_more = self._ReadArithWord()
      elif lex_mode in (
          lex_mode_e.OUTER, lex_mode_e.DBRACKET, lex_mode_e.BASH_REGEX):
        w, need_more = self._ReadWord(lex_mode)
      else:
        raise AssertionError('Invalid lex state %s' % lex_mode)
      if not need_more:
        break

    if not w:  # Assumes AddErrorContext was already called
      return None

    self.cursor = w

    # TODO: Do consolidation of newlines in the lexer?
    # Note that there can be an infinite (Id.Ignored_Comment Id.Op_Newline
    # Id.Ignored_Comment Id.Op_Newline) sequence, so we have to keep track of
    # the last non-ignored token.
    self.cursor_was_newline = (word.CommandId(self.cursor) == Id.Op_Newline)
    return self.cursor

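  # Illustrative usage (a sketch, not in the original file; assumes a lexer
  # and line_reader constructed elsewhere, e.g. by parse_lib):
  #
  #   w_parser = WordParser(lexer, line_reader)
  #   while True:
  #     w = w_parser.ReadWord(lex_mode_e.OUTER)
  #     if not w:
  #       break  # parse error; details are in w_parser.Error()
  #     if word.CommandId(w) == Id.Eof_Real:
  #       break
  #     # ... process the word ...
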
  def ReadHereDocBody(self):
    """
    Sort of like Read(), except we're in a double quoted context, but not
    using double quotes.

    Returns:
      CompoundWord.  NOTE: We could also just use a DoubleQuotedPart for both
      cases?
    """
    w = ast.CompoundWord()
    dq = self._ReadDoubleQuotedPart(here_doc=True)
    if not dq:
      self.AddErrorContext('Error parsing here doc body')
      return False
    w.parts.append(dq)
    return w