#!/usr/bin/env python
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.
"""

from asdl import const

from osh.meta import Id, Kind, LookupKind
from core import braces
from core import word
from core import tdop
from core import util

from osh import arith_parse
from osh.meta import ast, types

word_part_e = ast.word_part_e
word_e = ast.word_e
lex_mode_e = types.lex_mode_e

p_die = util.p_die
log = util.log

# Substitutions can be nested, but which inner subs are allowed depends on the
# outer sub.  See _ReadLeftParts vs. _ReadDoubleQuotedLeftParts.

# lex_mode_e.OUTER
#   All subs and quotes are allowed:
#   $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()
#
# lex_mode_e.DQ
#   Var, Command, and Arith sub, but no quotes:
#   $v ${v}   $() ``   $(())
#   No process substitution.
#
# lex_mode_e.ARITH
#   Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash has
#   no quotes here, but we are changing this in oil.  We are adding ALL FOUR
#   kinds of quotes, because we need them for associative array indexing.
#
# lex_mode_e.VS_ARG_UNQ
#   Like lex_mode_e.OUTER, except we stop at }.  Everything is allowed, even
#   process substitution.
#
#   ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
#   ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
#
#   But space is SIGNIFICANT:  ${a:- b }
#   So you should NOT just read a bunch of words after :-, unless you also
#   preserve the space tokens between them.
#   In other words, like DS_VS_ARG, except single quotes are allowed?
#
# lex_mode_e.VS_ARG_DQ
#   Can't be lex_mode_e.DQ, because here we respect $' and $" tokens, while
#   the <( token is not respected.
#
#   Like VS_ARG_UNQ, but single quotes are NOT respected (they appear
#   literally), and process substitution is not respected (ditto).
#
#   "" and $'' and $"" are respected, but not ''.  I need a matrix for this.
#
#   Like DQ, except nested "" and $'' and $"" are RESPECTED.
#
#   It's weird that double quotes are allowed.  Not sure why that would be.
#   Unquoted is also allowed, so " a "b" c " $'' and $"" are lame, because
#   they don't appear in the DQ context.  I think I should parse those but
#   DISALLOW them.  You should always put $'' and $"" in a separate var!

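# A rough sketch of the mode matrix above (illustrative examples of my own,
# inferred from the notes; not exhaustive):
#
#   echo $v "$v ${v:-d}" $(( x + 1 )) <(cat)    # OUTER: everything allowed
#   echo "$v $(date) `date` $(( 1+2 ))"         # DQ: subs, but no quotes
#   echo ${X:-$(echo hi)} ${X:-'sq'} ${X:-<(cat)}   # VS_ARG_UNQ
#   echo "${X:-$'\n'} ${X:-"nested"}"           # VS_ARG_DQ: '' would be literal
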
class WordParser(object):

  def __init__(self, lexer, line_reader, lex_mode=lex_mode_e.OUTER):
    self.lexer = lexer
    self.line_reader = line_reader
    self.Reset(lex_mode=lex_mode)

  def _Peek(self):
    """Helper method."""
    if self.next_lex_mode is not None:
      self.prev_token = self.cur_token  # for completion
      self.cur_token = self.lexer.Read(self.next_lex_mode)
      self.token_kind = LookupKind(self.cur_token.id)
      self.token_type = self.cur_token.id

      self.next_lex_mode = None
    return self.cur_token

  def _Next(self, lex_mode):
    """Set the next lex state, but don't actually read a token.

    We need this for proper interactive parsing.
    """
    self.next_lex_mode = lex_mode

  def Reset(self, lex_mode=lex_mode_e.OUTER):
    """Called by the interactive loop."""
    # For _Peek()
    self.prev_token = None  # for completion
    self.cur_token = None
    self.token_kind = Kind.Undefined
    self.token_type = Id.Undefined_Tok

    self.next_lex_mode = lex_mode

    # For newline.  TODO: I think we can do this iteratively, without member
    # state.
    self.cursor = None
    self.cursor_was_newline = False

    self.error_stack = []

  def AddErrorContext(self, msg, *args, **kwargs):
    err = util.ParseError(msg, *args, **kwargs)
    self.error_stack.append(err)

  def Error(self):
    return self.error_stack

  def _BadToken(self, msg, token):
    """
    Args:
      msg: format string with a single %s token
      token: Token
    """
    self.AddErrorContext(msg, token, token=token)

  def PrevToken(self):
    """Inspect state.  Used by completion.

    cur_token is usually Id.Op_Newline \n, so we need the previous one.
    """
    return self.prev_token

  def _ReadVarOpArg(self, arg_lex_mode, eof_type=Id.Undefined_Tok,
                    empty_ok=True):
    # NOTE: Operators like | and < are not treated as special, so ${a:- | >}
    # is valid, even when unquoted.
    self._Next(arg_lex_mode)
    self._Peek()

    w = self._ReadCompoundWord(
        lex_mode=arg_lex_mode, eof_type=eof_type, empty_ok=empty_ok)
    # This is for "${s:-}", ${s/a//}, etc.  It is analogous to
    # LooksLikeAssignment, where we turn x= into x=''.  It has the same
    # potential problem of not having spids.
    #
    # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
    # return a CompoundWord with no parts, which is explicitly checked with a
    # custom error message.
    if not w.parts and arg_lex_mode == lex_mode_e.VS_ARG_DQ and empty_ok:
      w.parts.append(ast.EmptyPart())
    return w

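  # Illustrative sketch (my own examples, inferred from the code above).
  # _ReadVarOpArg is called after an operator like :- or / has been read:
  #
  #   ${s:-}     -> CompoundWord(parts=[EmptyPart()])    # empty_ok=True
  #   ${s:-x$y}  -> CompoundWord(parts=[LiteralPart(x), SimpleVarSub($y)])
  #   ${s/a//}   -> the pattern 'a' stops at the second / because eof_type
  #                 is Id.Lit_Slash
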
  def _ReadSliceArg(self):
    """Read an arithmetic expression for either part of ${a : i+1 : i+2}."""
    anode = self._ReadArithExpr(do_next=False)
    return anode

  def _ReadSliceVarOp(self):
    """ VarOf ':' ArithExpr (':' ArithExpr )? """
    self._Next(lex_mode_e.ARITH)
    self._Peek()
    if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
      begin = None  # no beginning specified
    else:
      begin = self._ReadSliceArg()
      if not begin: return None
      #print('BEGIN', begin)
      #print('BVS2', self.cur_token)

    if self.token_type == Id.Arith_RBrace:
      return ast.Slice(begin, None)  # no length specified

    # Id.Arith_Colon is a pun for Id.VOp2_Colon
    elif self.token_type == Id.Arith_Colon:
      self._Next(lex_mode_e.ARITH)
      length = self._ReadSliceArg()
      if not length: return None

      #print('after colon', self.cur_token)
      return ast.Slice(begin, length)

    else:
      self.AddErrorContext("Unexpected token in slice: %s", self.cur_token)
      return None

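  # Illustrative sketch (my own examples, inferred from the grammar above):
  #
  #   ${a : i+1 : i+2}  -> Slice(begin=<i+1>, length=<i+2>)
  #   ${a:1}            -> Slice(begin, None)   # stopped at }
  #   ${a::2}           -> Slice(None, length)  # leading : means no begin
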
  def _ReadPatSubVarOp(self, lex_mode):
    """
    Match = ('/' | '#' | '%') WORD
    VarSub = ...
           | VarOf '/' Match '/' WORD
    """
    do_all = False
    do_prefix = False
    do_suffix = False

    pat = self._ReadVarOpArg(lex_mode, eof_type=Id.Lit_Slash, empty_ok=False)
    if not pat: return None

    if len(pat.parts) == 1:
      ok, s, quoted = word.StaticEval(pat)
      if ok and s == '/' and not quoted:  # Looks like ${a////c}, read again
        self._Next(lex_mode)
        self._Peek()
        p = ast.LiteralPart(self.cur_token)
        pat.parts.append(p)

    if len(pat.parts) == 0:
      self._BadToken("Pattern must not be empty: %r", token=self.cur_token)
      return None
    else:
      first_part = pat.parts[0]
      if first_part.tag == word_part_e.LiteralPart:
        lit_id = first_part.token.id
        if lit_id == Id.Lit_Slash:
          do_all = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Pound:
          do_prefix = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Percent:
          do_suffix = True
          pat.parts.pop(0)

    #self._Peek()
    if self.token_type == Id.Right_VarSub:
      # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
      return ast.PatSub(pat, None, do_all, do_prefix, do_suffix)

    elif self.token_type == Id.Lit_Slash:
      replace = self._ReadVarOpArg(lex_mode)  # do not stop at /
      if not replace: return None

      self._Peek()
      if self.token_type == Id.Right_VarSub:
        return ast.PatSub(pat, replace, do_all, do_prefix, do_suffix)

      else:
        self._BadToken("Expected } after pat sub, got %s", self.cur_token)
        return None

    else:
      self._BadToken("Expected } after pat sub, got %s", self.cur_token)
      return None

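  # Illustrative sketch of the flags computed above (my own examples):
  #
  #   ${v/x/y}   -> PatSub(pat=x, replace=y)                 # first match
  #   ${v//x/y}  -> PatSub(..., do_all=True)     # leading / popped from pat
  #   ${v/#x/y}  -> PatSub(..., do_prefix=True)  # leading # popped from pat
  #   ${v/%x/y}  -> PatSub(..., do_suffix=True)  # leading % popped from pat
  #   ${v/x}     -> PatSub(pat=x, replace=None)  # empty replacement
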
  def _ReadSubscript(self):
    """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
    # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
    # expression.
    t2 = self.lexer.LookAhead(lex_mode_e.ARITH)
    if t2.id in (Id.Lit_At, Id.Arith_Star):
      op = ast.WholeArray(t2.id)

      self._Next(lex_mode_e.ARITH)  # skip past [
      self._Peek()
      self._Next(lex_mode_e.ARITH)  # skip past @
      self._Peek()
    else:
      anode = self._ReadArithExpr()
      if not anode:
        return None
      op = ast.ArrayIndex(anode)

    #self._Peek()  # Can't do this here.  Should the test go elsewhere?
    if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
      self._BadToken('Expected ] after subscript, got %s', self.cur_token)
      return None

    self._Next(lex_mode_e.VS_2)  # skip past ]
    self._Peek()  # Needed to be in the same spot as no subscript

    return op

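  # Illustrative sketch (my own examples, inferred from the branches above):
  #
  #   ${a[@]}    -> WholeArray(Id.Lit_At)
  #   ${a[*]}    -> WholeArray(Id.Arith_Star)
  #   ${a[i+1]}  -> ArrayIndex(<arith expression i+1>)
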
  def _ParseVarOf(self):
    """
    VarOf = NAME Subscript?
          | NUMBER      # no subscript allowed, none of these are arrays
                        # ${@[1]} doesn't work, even though slicing does
          | VarSymbol
    """
    self._Peek()
    name_token = self.cur_token
    self._Next(lex_mode_e.VS_2)

    self._Peek()  # Check for []
    if self.token_type == Id.VOp2_LBracket:
      bracket_op = self._ReadSubscript()
      if not bracket_op: return None
    else:
      bracket_op = None

    part = ast.BracedVarSub(name_token)
    part.bracket_op = bracket_op
    return part

  def _ParseVarExpr(self, arg_lex_mode):
    """
    Start parsing at the op -- we already skipped past the name.
    """
    part = self._ParseVarOf()
    if not part: return None

    self._Peek()
    if self.token_type == Id.Right_VarSub:
      return part  # no ops

    # Or maybe this is a VarOpKind

    op_kind = self.token_kind

    if op_kind == Kind.VTest:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after test arg: %s', self.cur_token)
        return None

      part.suffix_op = ast.StringUnary(op_id, arg_word)

    elif op_kind == Kind.VOp1:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after unary op: %s', self.cur_token)
        return None

      op = ast.StringUnary(op_id, arg_word)
      part.suffix_op = op

    elif op_kind == Kind.VOp2:
      if self.token_type == Id.VOp2_Slash:
        op = self._ReadPatSubVarOp(arg_lex_mode)
        if not op: return None
        # Checked by the method above
        assert self.token_type == Id.Right_VarSub, self.cur_token

      elif self.token_type == Id.VOp2_Colon:
        op = self._ReadSliceVarOp()
        if not op: return None
        if self.token_type != Id.Arith_RBrace:
          self._BadToken('Unexpected token after slice: %s', self.cur_token)
          return None

      else:
        p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

      part.suffix_op = op

    # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
    # mode.  It's redundantly checked above.
    if self.token_type not in (Id.Right_VarSub, Id.Arith_RBrace):
      self._BadToken('Unexpected token after var sub: %s', self.cur_token)
      return None

    # Now look for ops
    return part

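  # Illustrative sketch of how suffix ops map to token kinds (my own
  # examples, inferred from the branches above):
  #
  #   ${x:-default}  -> Kind.VTest -> part.suffix_op = StringUnary(...)
  #   ${x##*/}       -> Kind.VOp1  -> part.suffix_op = StringUnary(...)
  #   ${x/pat/rep}   -> Kind.VOp2, Id.VOp2_Slash -> PatSub(...)
  #   ${x:1:2}       -> Kind.VOp2, Id.VOp2_Colon -> Slice(...)
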
  def _ReadBracedBracedVarSub(self, d_quoted=False):
    """For the ${} expression language.

    NAME        = [a-zA-Z_][a-zA-Z0-9_]*
    NUMBER      = [0-9]+                    # ${10}, ${11}, ...

    Subscript   = '[' ('@' | '*' | ArithExpr) ']'
    VarSymbol   = '!' | '@' | '#' | ...
    VarOf       = NAME Subscript?
                | NUMBER     # no subscript allowed, none of these are arrays
                             # ${@[1]} doesn't work, even though slicing does
                | VarSymbol

    TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
    STRIP_OP    = '#' | '##' | '%' | '%%'
    CASE_OP     = ',' | ',,' | '^' | '^^'

    UnaryOp     = TEST_OP | STRIP_OP | CASE_OP | ...
    Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
    VarExpr     = VarOf
                | VarOf UnaryOp WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

    LengthExpr  = '#' VarOf    # can't apply operators after length

    RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                               # ${!ref[0]} vs ${!keys[@]} resolved later

    PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

    VarSub      = LengthExpr
                | RefOrKeys
                | PrefixQuery
                | VarExpr

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
      that it's also vectorized.

    Strictness over bash:
    - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
      grammar
    - ! and # prefixes can't be composed, even though named refs can be
      composed with other operators
    - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
      strip a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, but the ! can.  However:

    - ${a[@]:1:2} is not allowed
    - ${#a[@]:1:2} is allowed, but gives the wrong answer
    """
    left_spid = self.cur_token.span_id

    if d_quoted:
      arg_lex_mode = lex_mode_e.VS_ARG_DQ
    else:
      arg_lex_mode = lex_mode_e.VS_ARG_UNQ

    self._Next(lex_mode_e.VS_1)
    self._Peek()

    ty = self.token_type

    if ty == Id.VSub_Pound:
      # Disambiguate
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t# LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name; '#' is the prefix
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarOf()

        self._Peek()
        if self.token_type != Id.Right_VarSub:
          self._BadToken("Expected } after length expression, got %r",
                         self.cur_token)
          return None

        part.prefix_op = Id.VSub_Pound  # length

      else:  # not a prefix; '#' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    elif ty == Id.VSub_Bang:
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t! LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name; '!' is the prefix
        # ${!a}    -- this is a ref
        # ${!3}    -- this is a ref
        # ${!a[1]} -- this is a ref
        # ${!a[@]} -- this is a keys query
        # No lookahead -- do it in a second step, or at runtime
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

        part.prefix_op = Id.VSub_Bang

      else:  # not a prefix; '!' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    # VS_NAME, VS_NUMBER, or a symbol that isn't # or !
    elif self.token_kind == Kind.VSub:
      part = self._ParseVarExpr(arg_lex_mode)
      if not part: return None

    else:
      # e.g. ${^}
      p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

    part.spids.append(left_spid)

    # Does this work?
    right_spid = self.cur_token.span_id
    part.spids.append(right_spid)

    return part

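  # Illustrative sketch of the # and ! disambiguation above (my own
  # examples):
  #
  #   ${#}   -> '#' is the variable itself (number of args)
  #   ${#a}  -> '#' is the length prefix: the lookahead sees a name
  #   ${!}   -> '!' is the variable itself (last background PID)
  #   ${!x}  -> '!' is the prefix: a named ref
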
  def _ReadSingleQuotedPart(self, lex_mode):
    left = self.cur_token
    tokens = []

    done = False
    while not done:
      self._Next(lex_mode)
      self._Peek()

      # Kind.Char emitted in DOLLAR_SQ state
      if self.token_kind in (Kind.Lit, Kind.Char):
        tokens.append(self.cur_token)

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext('Unexpected EOF in single-quoted string')
        return False

      elif self.token_kind == Kind.Right:
        done = True  # assume Id.Right_SingleQuote

      else:
        raise AssertionError(
            'Unhandled token in single-quoted part %s (%d)' %
            (self.cur_token, self.token_kind))

    return ast.SingleQuotedPart(left, tokens)

  def _ReadDoubleQuotedLeftParts(self):
    """Read substitution parts in a double quoted context."""
    if self.token_type in (Id.Left_CommandSub, Id.Left_Backtick):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=True)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError(self.cur_token)

  def _ReadLeftParts(self):
    """Read substitutions and quoted strings."""

    if self.token_type == Id.Left_DoubleQuote:
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_DollarDoubleQuote:
      # NOTE: $"" is treated as "" for now.  Does it make sense to add the
      # token to the part?
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_SingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.SQ)

    if self.token_type == Id.Left_DollarSingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.DOLLAR_SQ)

    if self.token_type in (
        Id.Left_CommandSub, Id.Left_Backtick, Id.Left_ProcSubIn,
        Id.Left_ProcSubOut):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=False)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError('%s not handled' % self.cur_token)

  def _ReadExtGlobPart(self):
    """
    Grammar:
      Item         = CompoundWord | EPSILON  # important: @(foo|) is allowed
      LEFT         = '@(' | '*(' | '+(' | '?(' | '!('
      RIGHT        = ')'
      ExtGlob      = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
      CompoundWord includes ExtGlobPart
    """
    left_token = self.cur_token
    arms = []
    part = ast.ExtGlobPart(left_token, arms)  # return value
    part.spids.append(left_token.span_id)

    self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
    self._Next(lex_mode_e.EXTGLOB)  # advance past LEFT

    read_word = False  # did we just read a word?  To handle @(||).

    while True:
      self._Peek()
      #log('t %r', self.cur_token)

      if self.token_type == Id.Right_ExtGlob:
        if not read_word:
          arms.append(ast.CompoundWord())
        part.spids.append(self.cur_token.span_id)
        break

      elif self.token_type == Id.Op_Pipe:
        if not read_word:
          arms.append(ast.CompoundWord())
        read_word = False
        self._Next(lex_mode_e.EXTGLOB)

      # lex mode EXTGLOB should only produce these 4 kinds of tokens
      elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
        w = self._ReadCompoundWord(lex_mode=lex_mode_e.EXTGLOB)
        arms.append(w)
        read_word = True

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext(
            'Unexpected EOF reading extended glob that began here',
            token=left_token)
        return None

      else:
        raise AssertionError('Unexpected token %r' % self.cur_token)

    return part

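  # Illustrative sketch (my own examples): each arm is a CompoundWord, and
  # empty arms are filled in with empty words.
  #
  #   @(foo|bar)  -> ExtGlobPart(arms=[foo, bar])
  #   @(foo|)     -> ExtGlobPart(arms=[foo, CompoundWord()])
  #   @(||)       -> ExtGlobPart(arms=[CompoundWord()] * 3)
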
  def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
    """
    Args:
      eof_type: for stopping at }, e.g. Id.Lit_RBrace
      here_doc: Whether we are reading in a here doc context

    Also used for ${foo%%a b c} -- the argument is treated as double quoted
    until you hit the closing }.
    """
    quoted_part = ast.DoubleQuotedPart()
    left_spid = const.NO_INTEGER
    right_spid = const.NO_INTEGER  # gets set later

    if self.cur_token is not None:  # None in the here doc case
      left_spid = self.cur_token.span_id

    done = False
    while not done:
      self._Next(lex_mode_e.DQ)
      self._Peek()
      #print(self.cur_token)

      if self.token_type == eof_type:  # e.g. stop at }
        done = True
        continue

      elif self.token_kind == Kind.Lit:
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Left:
        part = self._ReadDoubleQuotedLeftParts()
        if not part:
          return None
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Right:
        assert self.token_type == Id.Right_DoubleQuote
        if here_doc:
          # Turn Id.Right_DoubleQuote into a literal part
          quoted_part.parts.append(ast.LiteralPart(self.cur_token))
        else:
          done = True  # assume Id.Right_DoubleQuote
          right_spid = self.cur_token.span_id

      elif self.token_kind == Kind.Eof:
        if here_doc:  # here docs will have an EOF in their token stream
          done = True
        else:
          self.AddErrorContext(
              'Unexpected EOF reading double-quoted string that began here',
              span_id=left_spid)
          return False

      else:
        raise AssertionError(self.cur_token)

    quoted_part.spids.extend((left_spid, right_spid))
    return quoted_part

  def _ReadCommandSubPart(self, token_type):
    """
    NOTE: This is not in the grammar, because word parts aren't in the
    grammar!

    command_sub = '$(' command_list ')'
    """
    left_token = self.cur_token
    left_spid = left_token.span_id

    #print('_ReadCommandSubPart', self.cur_token)
    self._Next(lex_mode_e.OUTER)  # advance past $( or `

    # Set the lexer in a state so ) becomes the EOF token.
    #print('_ReadCommandSubPart lexer.PushHint ) -> EOF')
    if token_type in (
        Id.Left_CommandSub, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
      self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    elif token_type == Id.Left_Backtick:
      self.lexer.PushHint(Id.Left_Backtick, Id.Eof_Backtick)
    else:
      raise AssertionError(self.token_type)

    from osh import parse_lib
    c_parser = parse_lib.MakeParserForCommandSub(self.line_reader, self.lexer)

    node = c_parser.ParseWholeFile()  # `` and $() allowed
    if not node:
      # Examples of parse errors:
      #   echo $(cat |)  OR
      #   echo `cat |`
      error_stack = c_parser.Error()
      self.error_stack.extend(error_stack)
      print(self.error_stack)
      self.AddErrorContext('Error parsing command list in command sub')
      return None

    # Hm this creates its own word parser, which is thrown away?
    #print('X', self.cur_token)
    right_spid = c_parser.w_parser.cur_token.span_id

    cs_part = ast.CommandSubPart(node, left_token)
    cs_part.spids.append(left_spid)
    cs_part.spids.append(right_spid)
    return cs_part

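  # Illustrative sketch (my own example): for echo $(ls), the PushHint above
  # retranslates the closing ) to Id.Eof_RParen, so the recursive command
  # parser stops there as if it had hit end-of-file:
  #
  #   echo $(ls)
  #        ^        Id.Left_CommandSub
  #            ^    Id.Op_RParen -> Id.Eof_RParen
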
  def _ReadArithExpr(self, do_next=True):
    """Read and parse an arithmetic expression in various contexts.

    $(( 1+2 ))
    (( a=1+2 ))
    ${a[ 1+2 ]}
    ${a : 1+2 : 1+2}

    See tests/arith-context.test.sh for ambiguous cases.

    ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

    ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

    TODO: Instead of having an eof_type, I think we should just run the arith
    parser until it's done.  That will take care of both : and ].  We switch
    the state back.

    See the assertion in ArithParser.Parse() -- unexpected extra input.
    """
    if do_next:
      self._Next(lex_mode_e.ARITH)
    # calls self.ReadWord(lex_mode_e.ARITH)
    a_parser = tdop.TdopParser(arith_parse.SPEC, self)
    anode = a_parser.Parse()
    if not anode:
      error_stack = a_parser.Error()
      self.error_stack.extend(error_stack)
    return anode  # could be None

  def _ReadArithSubPart(self):
    """
    Read an arith substitution, which contains an arith expression, e.g.
    $((a + 1)).
    """
    left_span_id = self.cur_token.span_id

    # The second ) needs to be disambiguated in stuff like:
    #   $(echo $(( 1+2 )) )
    self.lexer.PushHint(Id.Op_RParen, Id.Right_ArithSub)

    # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
    # could save the lexer/reader state here, and retry if the arithmetic
    # parse fails.  But we can almost always catch this at parse time.  There
    # could be some exceptions like:
    #   $((echo * foo))  # looks like multiplication
    #   $((echo / foo))  # looks like division
    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None

    self._Next(lex_mode_e.OUTER)  # TODO: This could be DQ or ARITH too

    # PROBLEM: $(echo $(( 1 + 2 )) )
    # Two right parens break the Id.Eof_RParen scheme
    self._Peek()
    if self.token_type != Id.Right_ArithSub:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

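  # Illustrative sketch (my own example): closing an arith sub takes two
  # steps, which is why the code above checks two tokens.
  #
  #   $(( 1+2 ))
  #           ^^   first ) is Id.Arith_RParen, second is Id.Right_ArithSub
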
  def _ReadArithSub2Part(self):
    """Non-standard arith sub $[a + 1]."""
    left_span_id = self.cur_token.span_id

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RBracket:
      self.AddErrorContext("Expected ], got %s", self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def ReadDParen(self):
    """Read ((1+ 2)) -- command context.

    We're using the word parser because it's very similar to _ReadArithExpr
    above.
    """
    # The second ) needs to be disambiguated, as in _ReadArithSubPart above.
    # TODO: Be consistent with ReadForExpression below and use
    # lex_mode_e.ARITH?  Then you can get rid of this.
    self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing dparen statement")
      return None

    #print('xx ((', self.cur_token)
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end dparen statement, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    # PROBLEM: $(echo $(( 1 + 2 )) )
    self._Peek()
    if self.token_type != Id.Op_DRightParen:
      self._BadToken('Expected second paren to end dparen statement, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return anode

  def ReadForExpression(self):
    """Read ((i=0; i<5; ++i)) -- part of command context."""
    # No PushHint because we're in arith state.
    #self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    self._Next(lex_mode_e.ARITH)  # skip over ((

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty init')
      init_node = None
    else:
      init_node = self._ReadArithExpr(do_next=False)
      if not init_node:
        self.AddErrorContext("Error parsing for init")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('INIT', init_node)

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty condition')
      cond_node = None
    else:
      cond_node = self._ReadArithExpr(do_next=False)
      if not cond_node:
        self.AddErrorContext("Error parsing for cond")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('COND', cond_node)

    self._Peek()
    if self.token_type == Id.Arith_RParen:
      #print('Got empty update')
      update_node = None
    else:
      update_node = self._ReadArithExpr(do_next=False)
      if not update_node:
        self.AddErrorContext("Error parsing for update")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('UPDATE', update_node)

    #print('TT', self.cur_token)
    # Second paren
    self._Peek()
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected right paren to end for loop expression, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return ast.ForExpr(init_node, cond_node, update_node)

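  # Illustrative sketch (my own examples): any of the three sections may be
  # empty, yielding None for that field.
  #
  #   ((i=0; i<5; ++i))  -> ForExpr(init, cond, update)
  #   ((;;))             -> ForExpr(None, None, None)   # infinite loop
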
  def _ReadArrayLiteralPart(self):
    self._Next(lex_mode_e.OUTER)  # advance to (
    self._Peek()
    if self.cur_token.id != Id.Op_LParen:
      self.AddErrorContext('Expected ( after =', token=self.cur_token)
      return None

    # MUST use a new word parser (with the same lexer).
    w_parser = WordParser(self.lexer, self.line_reader)
    words = []
    while True:
      w = w_parser.ReadWord(lex_mode_e.OUTER)
      if not w:
        self.error_stack.extend(w_parser.Error())
        return None

      if w.tag == word_e.TokenWord:
        word_id = word.CommandId(w)
        if word_id == Id.Right_ArrayLiteral:
          break
        # Unlike command parsing, array parsing allows embedded \n.
        elif word_id == Id.Op_Newline:
          continue
        else:
          self.AddErrorContext(
              'Unexpected word in array literal: %s', w, word=w)
          return None

      words.append(w)

    words2 = braces.BraceDetectAll(words)
    words3 = word.TildeDetectAll(words2)

    return ast.ArrayLiteralPart(words3)

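  # Illustrative sketch (my own example): newlines are allowed inside the
  # parens, and brace/tilde detection runs over the collected words.
  #
  #   a=(1 2
  #      {3,4} ~/dir)   # 4 words; {3,4} and ~/dir get detected afterward
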
  def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
                        lex_mode=lex_mode_e.OUTER, empty_ok=True):
    """
    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but
    it could be an operator delimiting a compound word.  Can we change lexer
    modes and remove this special case?
    """
    #print('_ReadCompoundWord', lex_mode)
    word = ast.CompoundWord()

    num_parts = 0
    done = False
    while not done:
      allow_done = empty_ok or num_parts != 0
      self._Peek()
      #print('CW', self.cur_token)
      if allow_done and self.token_type == eof_type:
        done = True  # e.g. for ${foo//pat/replace}

      # Keywords like "for" are treated like literals
      elif self.token_kind in (
          Kind.Lit, Kind.KW, Kind.Assign, Kind.ControlFlow, Kind.BoolUnary,
          Kind.BoolBinary):
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
          #part.xspans.append(self.cur_token.span_id)

        word.parts.append(part)

        if self.token_type == Id.Lit_VarLike:
          #print('@', self.cursor)
          #print('@', self.cur_token)

          t = self.lexer.LookAhead(lex_mode_e.OUTER)
          if t.id == Id.Op_LParen:
            self.lexer.PushHint(Id.Op_RParen, Id.Right_ArrayLiteral)
            part2 = self._ReadArrayLiteralPart()
            if not part2:
              self.AddErrorContext('_ReadArrayLiteralPart failed')
              return False
            word.parts.append(part2)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        word.parts.append(part)

      elif self.token_kind == Kind.ExtGlob:
        part = self._ReadExtGlobPart()
        if not part:
          return None
        word.parts.append(part)

      elif self.token_kind == Kind.Left:
        #print('_ReadLeftParts')
        part = self._ReadLeftParts()
        if not part:
          return None
        word.parts.append(part)

      # NOT done yet, will advance below
      elif self.token_kind == Kind.Right:
        # Still part of the word; will be done on the next iter.
        if self.token_type == Id.Right_DoubleQuote:
          pass
        elif self.token_type == Id.Right_CommandSub:
          pass
        elif self.token_type == Id.Right_Subshell:
          # LEXER HACK for (case x in x) ;; esac )
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
            self._Next(lex_mode)
          done = True
        else:
          done = True

      elif self.token_kind == Kind.Ignored:
        done = True

      else:
        # LEXER HACK for unbalanced case clause.  'case foo in esac' is
        # valid, so to test for ESAC, we can read ) before getting a chance
        # to PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
        # token and do it again.

        # We get Id.Op_RParen at top level:      case x in x) ;; esac
        # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
        if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            if self.token_type == Id.Eof_RParen:
              # Redo translation
              self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
            self._Next(lex_mode)

        done = True  # anything we don't recognize means we're done

      if not done:
        self._Next(lex_mode)
        num_parts += 1
    return word

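  # Illustrative sketch (my own example): a compound word is a sequence of
  # adjacent parts with no spaces between them.
  #
  #   foo'bar'"$x"$(date)  ->
  #     CompoundWord([LiteralPart(foo), SingleQuotedPart('bar'),
  #                   DoubleQuotedPart($x), CommandSubPart(date)])
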
  def _ReadArithWord(self):
    """Helper function for ReadWord() in arith state."""
    #assert self.token_type != Id.Undefined_Tok
    self._Peek()
    #print('_ReadArithWord', self.cur_token)

    if self.token_kind == Kind.Unknown:
      self.AddErrorContext("Unknown token in arith context: %s",
                           self.cur_token, token=self.cur_token)
      return None, False

    elif self.token_kind == Kind.Eof:
      # Just return the EOF token
      w = ast.TokenWord(self.cur_token)
      return w, False
      #self.AddErrorContext("Unexpected EOF in arith context: %s",
      #                     self.cur_token, token=self.cur_token)
      #return None, False

    elif self.token_kind == Kind.Ignored:
      # Space should be ignored.  TODO: change this to SPACE_SPACE and
      # SPACE_NEWLINE?  or SPACE_TOK.
      self._Next(lex_mode_e.ARITH)
      return None, True  # Tell the wrapper to try again

    elif self.token_kind in (Kind.Arith, Kind.Right):
      # Id.Right_ArithSub IS just a normal token, handled by ArithParser
      self._Next(lex_mode_e.ARITH)
      w = ast.TokenWord(self.cur_token)
      return w, False

    elif self.token_kind in (Kind.Lit, Kind.Left):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ARITH)
      if not w:
        return None, True
      return w, False

    elif self.token_kind == Kind.VSub:
      part = ast.SimpleVarSub(self.cur_token)
      self._Next(lex_mode_e.ARITH)
      w = ast.CompoundWord([part])
      return w, False

    else:
      self._BadToken("Unexpected token parsing arith sub: %s", self.cur_token)
      return None, False

    raise AssertionError("Shouldn't get here")

  def _ReadWord(self, lex_mode):
    """Helper function for ReadWord().

    Returns:
      2-tuple (word, need_more)
        word: Word, or None if there was an error, or need_more is set
        need_more: True if the caller should call us again
    """
    #print('_Read', lex_mode, self.cur_token)
    self._Peek()

    if self.token_kind == Kind.Eof:
      # No advance
      return ast.TokenWord(self.cur_token), False

    # Allow Arith for ) at end of for loop?
    elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
      self._Next(lex_mode)
      if self.token_type == Id.Op_Newline:
        if self.cursor_was_newline:
          #print('SKIP(nl)', self.cur_token)
          return None, True

      return ast.TokenWord(self.cur_token), False

    elif self.token_kind == Kind.Right:
      #print('WordParser.Read: Kind.Right', self.cur_token)
      if self.token_type not in (
          Id.Right_Subshell, Id.Right_FuncDef, Id.Right_CasePat,
          Id.Right_ArrayLiteral):
        raise AssertionError(self.cur_token)

      self._Next(lex_mode)
      return ast.TokenWord(self.cur_token), False

    elif self.token_kind in (Kind.Ignored, Kind.WS):
      self._Next(lex_mode)
      return None, True  # tell ReadWord() to try again

    elif self.token_kind in (
        Kind.VSub, Kind.Lit, Kind.Left, Kind.KW, Kind.Assign,
        Kind.ControlFlow, Kind.BoolUnary, Kind.BoolBinary, Kind.ExtGlob):
      # We're beginning a word.  If we see Id.Lit_Pound, change to
      # lex_mode_e.COMMENT and read until the end of the line.  (TODO: How to
      # add comments to the AST?)

      # TODO: Can we do the same thing for Tilde here?  Enter a state where
      # we look for / too.
      if self.token_type == Id.Lit_Pound:
        self._Next(lex_mode_e.COMMENT)
        self._Peek()

        # NOTE: The # could be the last character in the file.  It can't be
        # Eof_{RParen,Backtick} because #) and #` are comments.
        assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
            self.cur_token

        # The next iteration will go into Kind.Ignored and set the lex state
        # to lex_mode_e.OUTER/etc.
        return None, True  # tell ReadWord() to try again after the comment

      else:
        w = self._ReadCompoundWord(lex_mode=lex_mode)
        if not w:
          self.AddErrorContext(
              'Error reading command word', token=self.cur_token)
          return None, False
        return w, False

    else:
      raise AssertionError(
          'Unhandled: %s (%s)' % (self.cur_token, self.token_kind))

    raise AssertionError("Shouldn't get here")

  def LookAhead(self):
    """Look ahead to the next token.

    For the command parser to recognize func () { } and array=(1 2 3).  And
    probably coprocesses.
    """
    assert self.token_type != Id.Undefined_Tok
    if self.cur_token.id == Id.WS_Space:
      t = self.lexer.LookAhead(lex_mode_e.OUTER)
    else:
      t = self.cur_token
    return t.id

  def ReadWord(self, lex_mode):
    """Read the next Word.

    Returns:
      Word, or None if there was an error
    """
    # Implementation note: This is a stateful/iterative function that calls
    # the stateless _ReadWord() function.
    while True:
      if lex_mode == lex_mode_e.ARITH:
        # TODO: Can this be unified?
        w, need_more = self._ReadArithWord()
      elif lex_mode in (
          lex_mode_e.OUTER, lex_mode_e.DBRACKET, lex_mode_e.BASH_REGEX):
        w, need_more = self._ReadWord(lex_mode)
      else:
        raise AssertionError('Invalid lex state %s' % lex_mode)
      if not need_more:
        break

    if not w:  # Assumes AddErrorContext was already called
      return None

    self.cursor = w

    # TODO: Do consolidation of newlines in the lexer?
    # Note that there can be an infinite (Id.Ignored_Comment Id.Op_Newline
    # Id.Ignored_Comment Id.Op_Newline) sequence, so we have to keep track of
    # the last non-ignored token.
    self.cursor_was_newline = (word.CommandId(self.cursor) == Id.Op_Newline)
    return self.cursor

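  # A minimal usage sketch (hypothetical driver loop; assumes a lexer and
  # line_reader have already been constructed elsewhere):
  #
  #   w_parser = WordParser(lexer, line_reader)
  #   while True:
  #     w = w_parser.ReadWord(lex_mode_e.OUTER)
  #     if not w:
  #       break  # parse error; inspect w_parser.Error()
  #     if word.CommandId(w) == Id.Eof_Real:
  #       break  # end of input
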
  def ReadHereDocBody(self):
    """
    Sort of like Read(), except we're in a double quoted context, but not
    using double quotes.

    Returns:
      CompoundWord.  NOTE: We could also just use a DoubleQuotedPart for both
      cases?
    """
    w = ast.CompoundWord()
    dq = self._ReadDoubleQuotedPart(here_doc=True)
    if not dq:
      self.AddErrorContext('Error parsing here doc body')
      return False
    w.parts.append(dq)
    return w
|