# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""
 | 
from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str
from mycpp.mylib import log
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader
 | 
#
# Optimized Token functions that use str.find(substr, start, end) to avoid
# allocating a temporary slice
#
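
# For example, instead of allocating a temporary slice:
#
#     tok.line.content[tok.col : tok.col + tok.length] == s
#
# these helpers search within explicit bounds:
#
#     tok.line.content.find(s, tok.col, tok.col + tok.length) != -1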
 | 

def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+=."""
    i = tok.col + tok.length - 2  # 'foo+='[-2] is '+'
    return tok.line.content.find('+', i, i + 1) != -1
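
# e.g. for the token 'foo+=' the checked byte is '+', so this returns True;
# for 'foo=' it is 'o', so it returns False.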
 | 

def TokenContains(tok, substr):
    # type: (Token, str) -> bool
    return tok.line.content.find(substr, tok.col, tok.col + tok.length) != -1
 | 

def TokenEquals(tok, s):
    # type: (Token, str) -> bool
    if len(s) != tok.length:
        return False
    # Equal length plus containment implies equality
    return TokenContains(tok, s)
 | 

def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool
    return tok.line.content.find(s, tok.col, tok.col + len(s)) != -1
 | 

def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool
    end = tok.col + tok.length
    return tok.line.content.find(s, end - len(s), end) != -1
 | 

# Also: IsWhitespace, IsLeadingSpace
 | 

def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]
 | 

def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left_index > 0
    start = tok.col + left_index
    return tok.line.content[start:tok.col + tok.length]
 | 

def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert right_index < 0
    end = tok.col + tok.length + right_index
    return tok.line.content[tok.col:end]
 | 

def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left > 0, left
    assert right < 0, right
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]
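
# e.g. (hypothetical usage) TokenSlice(tok, 2, -1) would extract x from a
# ${x} token without allocating the intermediate '${x}' string.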
 | 

def LazyStr(tok):
    # type: (Token) -> str
    """Materialize the tval on demand, with a special case for $myvar.

    Most tokens do NOT need strings.  We avoid allocating them in the lexer.

    Note: SingleQuoted could have a lazy sval, NOT at the token level.
    """
    if 0:
        LAZY_ID_HIST[tok.id] += 1

    if tok.tval is None:
        if tok.id in (Id.VSub_DollarName, Id.VSub_Number):  # $x or $2
            # Special case for SimpleVarSub - completion also relies on this
            tok.tval = TokenSliceLeft(tok, 1)
        else:
            tok.tval = TokenVal(tok)

    return tok.tval
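
# Note: LazyStr() caches its result in tok.tval, so repeated calls allocate
# at most one string per token.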
 | 

def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, length, col, None, val)
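
# Note: col and length are -1 and line is None because a dummy token is not
# backed by any SourceLine; it only carries an Id and a string value.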
 | 

class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        self.src_line = src_line
        self.line_pos = line_pos
 | 
    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True
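
    # Hedged usage sketch: a caller that consumed a one-character token and
    # wants to re-lex it in a different mode can do:
    #
    #     if lexer.MaybeUnreadOne():
    #         tok = lexer.Read(other_lex_mode)  # re-reads the same character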
 | 
    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has
            # a source line for displaying errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line)
 | 
    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type
 | 
    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)
 | 
    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at the end of the
                # line, because it would involve interacting with the line
                # reader, and we never need it.  In lex_mode_e.ShCommand there
                # is an explicit newline token, but lex_mode_e.Arith doesn't
                # have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these tokens, we could pass them in:
            # LookPastSpace(lex_mode, past_token_type)
            # - WS_Space is only emitted in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type
 | 
    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is not recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - unread
        assert pos > 0, pos
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens
 | 
    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]
 | 
    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
         No: json write(x)
        """
        # line_pos - 1 is the last byte of the token just read, so
        # line_pos - 2 is the byte before it.
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
 | 
    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # NOTE: We put the arena hook in LineLexer and not Lexer because we
        # want it to be "low level".  The only thing fabricated here is a
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line)

        self.line_pos = end_pos
        return t
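
# Minimal wiring sketch (hedged; assumes core/alloc.py's Arena and a _Reader
# implementation such as frontend/reader.py's StringLineReader):
#
#     arena = alloc.Arena()
#     line_lexer = LineLexer(arena)
#     line_reader = reader.StringLineReader('echo hi\n', arena)
#     lexer = Lexer(line_lexer, line_reader)
#     while True:
#         tok = lexer.Read(lex_mode_e.ShCommand)
#         if tok.id == Id.Eof_Real:
#             break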
 | 

class Lexer(object):
    """Read lines from the line_reader and split them into tokens with the
    line_lexer, returning them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False
 | 
    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()
 | 
    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True
 | 
    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Push a translation hint, e.g. to disambiguate
        Id.Op_RParen -> Id.Right_Subshell or Id.Op_RParen -> Id.Eof_RParen.

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('   PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
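
    # Sketch: when the parser enters $(, it can push
    #
    #     lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    #
    # so that the matching ')' comes back as an EOF for the nested parser;
    # _Read() below pops the stack when the translation fires.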
 | 
    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.  For
        example, take this case:

          case (x) {
                   ^--- We are here

            (else) {
            ^--- We want lookahead to here

                echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek at the '('
        without consuming it.  And consuming it would be a problem once we
        want to hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
 | 
| 407 | 
 | 
| 408 |     def _Read(self, lex_mode):
 | 
| 409 |         # type: (lex_mode_t) -> Token
 | 
| 410 |         """Read from the normal line buffer, not an alias."""
 | 
| 411 |         t = self.line_lexer.Read(lex_mode)
 | 
| 412 |         if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
 | 
| 413 |             src_line, line_pos = self.line_reader.GetLine()
 | 
| 414 | 
 | 
| 415 |             if src_line is None:  # no more lines
 | 
| 416 |                 if self.emit_comp_dummy:
 | 
| 417 |                     id_ = Id.Lit_CompDummy
 | 
| 418 |                     self.emit_comp_dummy = False  # emit EOF the next time
 | 
| 419 |                 else:
 | 
| 420 |                     id_ = Id.Eof_Real
 | 
| 421 |                 return self.line_lexer.GetEofToken(id_)
 | 
| 422 | 
 | 
| 423 |             self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
 | 
| 424 |             t = self.line_lexer.Read(lex_mode)
 | 
| 425 | 
 | 
| 426 |         # e.g. translate ) or ` into EOF
 | 
| 427 |         if len(self.translation_stack):
 | 
| 428 |             old_id, new_id = self.translation_stack[-1]  # top
 | 
| 429 |             if t.id == old_id:
 | 
| 430 |                 #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
 | 
| 431 |                 self.translation_stack.pop()
 | 
| 432 |                 t.id = new_id
 | 
| 433 | 
 | 
| 434 |         return t
 | 
    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            if t.id != Id.Ignored_LineCont:
                break

        if 0:
            ID_HIST[t.id] += 1
        return t
 | 


if 0:  # mylib.PYTHON: note: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
    LAZY_ID_HIST = collections.Counter()  # type: Counter[Id_t]
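
# When the 'if 0' blocks above are enabled, the histograms can be inspected
# at exit, e.g. (hypothetical):
#
#     for id_, count in ID_HIST.most_common(10):
#         print('%8d %s' % (count, Id_str(id_)))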
 |