OILS / frontend / lexer.py

# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str
from mycpp.mylib import log
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader
#
# Optimized Token functions that use str.find(substr, start, end) to avoid
# allocating a temporary slice
#
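
# A minimal sketch of the idea (illustration only, not part of the original
# file): str.find() with start/end bounds searches within the string in
# place, while slicing first copies the substring.
#
#   s = 'foo+=bar'
#   s.find('+', 3, 4) != -1    # True; searches within s[3:4] without copying
#   '+' in s[3:4]              # same answer, but allocates a 1-char string

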
def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+=."""
    i = tok.col + tok.length - 2  # 'foo+='[-2] is '+'
    return tok.line.content.find('+', i, i + 1) != -1
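
# Worked example (hypothetical values, for illustration): a token covering
# 'foo+=' at the start of a line has tok.col == 0 and tok.length == 5, so
# i == 3 and content.find('+', 3, 4) lands on the '+'.  For 'foo=' (length
# 4), i == 2 points at 'o', so the search fails.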


def TokenContains(tok, substr):
    # type: (Token, str) -> bool
    return tok.line.content.find(substr, tok.col, tok.col + tok.length) != -1


def TokenEquals(tok, s):
    # type: (Token, str) -> bool
    if len(s) != tok.length:
        return False
    return TokenContains(tok, s)


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool
    return tok.line.content.find(s, tok.col, tok.col + len(s)) != -1


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool
    end = tok.col + tok.length
    return tok.line.content.find(s, end - len(s), end) != -1
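
# Illustration (hypothetical token): for a token whose text is 'hello',
#   TokenEquals(tok, 'hello')     => True
#   TokenStartsWith(tok, 'he')    => True
#   TokenEndsWith(tok, 'lo')      => True
#   TokenContains(tok, 'ell')     => True
# None of these allocate the token's string.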


# Also: IsWhitespace, IsLeadingSpace


def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert left_index > 0
    start = tok.col + left_index
    return tok.line.content[start:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert right_index < 0
    end = tok.col + tok.length + right_index
    return tok.line.content[tok.col:end]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert left > 0, left
    assert right < 0, right
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]
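
# Illustration (hypothetical single-quoted token): if the token's text is
# "'abc'", then TokenSliceLeft(tok, 1) == "abc'", TokenSliceRight(tok, -1)
# == "'abc", and TokenSlice(tok, 1, -1) == 'abc' -- each computed with one
# slice of the underlying line, not two.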


def LazyStr(tok):
    # type: (Token) -> str
    """Materialize the tval on demand, with special case for $myvar.

    Most tokens do NOT need strings. We avoid allocating them in the lexer.

    Note: SingleQuoted could have lazy sval, NOT at the token level.
    """
    if 0:
        LAZY_ID_HIST[tok.id] += 1

    if tok.tval is None:
        if tok.id in (Id.VSub_DollarName, Id.VSub_Number):  # $x or $2
            # Special case for SimpleVarSub - completion also relies on this
            tok.tval = TokenSliceLeft(tok, 1)
        else:
            tok.tval = TokenVal(tok)

    return tok.tval
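
# Illustration: for a hypothetical Id.VSub_DollarName token covering
# '$myvar', LazyStr() computes and caches 'myvar' (the '$' is sliced off);
# for most other token types it caches the full token text via TokenVal().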


def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, length, col, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is
        # subtly incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True
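
    # Illustration: MaybeUnreadOne() gives one byte of backtracking, e.g.
    # after reading a single-character token the parser can back up and
    # re-read that byte in a different lex mode; replace_last_token then
    # tells Read() to replace the last arena token rather than append one.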

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now. Add a dummy line 0 so the Token has
            # a source to display errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line)

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at the end of the
                # line, because it would involve interacting with the line
                # reader, and we never need it. In lex_mode_e.ShCommand there
                # is an explicit newline token, but lex_mode_e.Arith doesn't
                # have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these tokens, we could pass them in:
            #   LookPastSpace(lex_mode, past_token_type)
            # - WS_Space is only given in lex_mode_e.ShCommand
            # - Id.Ignored_Space is given in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type
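
    # Illustration: with the cursor before '   foo' in lex_mode_e.ShCommand,
    # LookPastSpace() skips the Id.WS_Space match and returns the Id of the
    # 'foo' token; at the very end of a line it returns Id.Unknown_Tok.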

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case. This is not recognized as a function:

        foo\
        () {}

        whereas this is:

        foo()
        {}
        """
        pos = self.line_pos - unread
        assert pos > 0, pos
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens
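
    # Illustration: after the lexer has read the word 'f' in
    # 'f ( ) { echo hi; }', the parser calls LookAheadFuncParens() and the
    # FuncParens mode matches the '( )' that follows, so 'f' can be treated
    # as a function name rather than a command.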

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
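
    # Illustration: right after reading the '(' in 'json write (x)',
    # line_pos is one past the paren, so line_pos - 2 indexes the character
    # before it; ByteLookBack() returns 32 (ord(' ')), whereas for
    # 'json write(x)' it would return ord('e'), letting the caller reject it.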

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # NOTE: We're putting the arena hook in LineLexer and not Lexer
        # because we want it to be "low level". The only thing fabricated
        # here is a newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, returning them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Use cases: disambiguate Id.Op_RParen -> Id.Right_Subshell and
        Id.Op_RParen -> Id.Eof_RParen.

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('  PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
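
    # Illustration: on entering $(, the parser can call
    # PushHint(Id.Op_RParen, Id.Eof_RParen); the next ')' returned by
    # _Read() is then translated to Id.Eof_RParen, which terminates the
    # command substitution.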

    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line. For
        example, take this case:

          case (x) {
               ^--- We are here

            (else) {
            ^--- We want lookahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek the '('
        without consuming it. And consuming it would be a problem once we
        want to hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            if t.id != Id.Ignored_LineCont:
                break

        if 0:
            ID_HIST[t.id] += 1
        return t
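
    # Note (illustration): Id.Ignored_LineCont is the token for a
    # backslash-newline line continuation, so Read() loops past it and
    # callers never see it.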


if 0:  # mylib.PYTHON: note: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
    LAZY_ID_HIST = collections.Counter()  # type: Counter[Id_t]