| 1 | """
|
| 2 | match.py - lexer primitives, implemented with re2c or Python regexes.
|
| 3 | """
|
| 4 |
|
| 5 | from _devbuild.gen.id_kind_asdl import Id, Id_t
|
| 6 | from _devbuild.gen.types_asdl import lex_mode_t
|
| 7 | from frontend import lexer_def
|
| 8 |
|
| 9 | from typing import Tuple, Callable, Dict, List, Any, TYPE_CHECKING
|
| 10 |
|
| 11 | # bin/osh should work without compiling fastlex? But we want all the unit
|
| 12 | # tests to run with a known version of it.
|
| 13 | try:
|
| 14 | import fastlex
|
| 15 | except ImportError:
|
| 16 | fastlex = None
|
| 17 |
|
| 18 | if fastlex:
|
| 19 | re = None # re module isn't in CPython slice
|
| 20 | else:
|
| 21 | import re # type: ignore
|
| 22 |
|
| 23 | if TYPE_CHECKING:
|
| 24 | SRE_Pattern = Any # Do we need a .pyi file for re or _sre?
|
| 25 | SimpleMatchFunc = Callable[[str, int], Tuple[Id_t, int]]
|
| 26 | LexerPairs = List[Tuple[SRE_Pattern, Id_t]]
|
| 27 |
|
| 28 |
|
| 29 | def _LongestMatch(re_list, line, start_pos):
|
| 30 | # type: (LexerPairs, str, int) -> Tuple[Id_t, int]
|
| 31 |
|
| 32 | # Simulate the rule for \x00, which we generate in frontend/match.re2c.h
|
| 33 | if start_pos >= len(line):
|
| 34 | return Id.Eol_Tok, start_pos
|
| 35 | # Simulate C-style string handling: \x00 is empty string.
|
| 36 | if line[start_pos] == '\0':
|
| 37 | return Id.Eol_Tok, start_pos
|
| 38 |
|
| 39 | matches = []
|
| 40 | for regex, tok_type in re_list:
|
| 41 | m = regex.match(line, start_pos) # left-anchored
|
| 42 | if m:
|
| 43 | matches.append((m.end(0), tok_type, m.group(0)))
|
| 44 | if not matches:
|
| 45 | raise AssertionError('no match at position %d: %r' % (start_pos, line))
|
| 46 | end_pos, tok_type, tok_val = max(matches, key=lambda m: m[0])
|
| 47 | #util.log('%s %s', tok_type, end_pos)
|
| 48 | return tok_type, end_pos
|
| 49 |
|
| 50 |
|
| 51 | def _CompileAll(pat_list):
|
| 52 | # type: (List[Tuple[bool, str, Id_t]]) -> LexerPairs
|
| 53 | result = []
|
| 54 | for is_regex, pat, token_id in pat_list:
|
| 55 | if not is_regex:
|
| 56 | pat = re.escape(pat) # type: ignore # turn $ into \$
|
| 57 | result.append((re.compile(pat), token_id)) # type: ignore
|
| 58 | return result
|
| 59 |
|
| 60 |
|
class _MatchOshToken_Slow(object):
    """An abstract matcher that doesn't depend on OSH.

    Pure-Python fallback used when the fastlex extension isn't compiled:
    every lex mode's pattern list is compiled once at construction time.
    """

    def __init__(self, lexer_def):
        # type: (Dict[lex_mode_t, List[Tuple[bool, str, Id_t]]]) -> None
        # Pre-compile the patterns for every lex mode up front.
        self.lexer_def = {
            mode: _CompileAll(pats) for mode, pats in lexer_def.items()
        }  # type: Dict[lex_mode_t, LexerPairs]

    def __call__(self, lex_mode, line, start_pos):
        # type: (lex_mode_t, str, int) -> Tuple[Id_t, int]
        """Returns (id, end_pos)."""
        return _LongestMatch(self.lexer_def[lex_mode], line, start_pos)
|
| 76 |
|
| 77 |
|
def _MatchOshToken_Fast(lex_mode, line, start_pos):
    # type: (lex_mode_t, str, int) -> Tuple[Id_t, int]
    """Returns (Id, end_pos), using the re2c-generated matcher."""
    # IMPORTANT: We're reusing Id instances here.  Ids are very common, so
    # this saves memory.
    tok_id, end_index = fastlex.MatchOshToken(lex_mode, line, start_pos)
    return tok_id, end_index
|
| 85 |
|
| 86 |
|
class _MatchTokenSlow(object):
    """Callable matcher over a single pattern list, using Python regexes.

    Slow-path counterpart of the individual fastlex.Match*Token functions.
    """

    def __init__(self, pat_list):
        # type: (List[Tuple[bool, str, Id_t]]) -> None
        # Compile once; __call__ may run many times per line.
        self.pat_list = _CompileAll(pat_list)

    def __call__(self, line, start_pos):
        # type: (str, int) -> Tuple[Id_t, int]
        """Returns (id, end_pos)."""
        return _LongestMatch(self.pat_list, line, start_pos)
|
| 96 |
|
| 97 |
|
def _MatchEchoToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one echo token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchEchoToken(line, start_pos)
    return tok_id, end_index
|
| 102 |
|
| 103 |
|
def _MatchGlobToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one glob token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchGlobToken(line, start_pos)
    return tok_id, end_index
|
| 108 |
|
| 109 |
|
def _MatchPS1Token_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one PS1 token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchPS1Token(line, start_pos)
    return tok_id, end_index
|
| 114 |
|
| 115 |
|
def _MatchHistoryToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one history token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchHistoryToken(line, start_pos)
    return tok_id, end_index
|
| 120 |
|
| 121 |
|
def _MatchBraceRangeToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one brace-range token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchBraceRangeToken(line, start_pos)
    return tok_id, end_index
|
| 126 |
|
| 127 |
|
def _MatchJ8Token_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one J8 token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchJ8Token(line, start_pos)
    return tok_id, end_index
|
| 132 |
|
| 133 |
|
def _MatchJ8LinesToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one J8 lines token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchJ8LinesToken(line, start_pos)
    return tok_id, end_index
|
| 138 |
|
| 139 |
|
def _MatchJ8StrToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one J8 string token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchJ8StrToken(line, start_pos)
    return tok_id, end_index
|
| 144 |
|
| 145 |
|
def _MatchJsonStrToken_Fast(line, start_pos):
    # type: (str, int) -> Tuple[Id_t, int]
    """Match one JSON string token with the native matcher; returns (Id, end_pos)."""
    tok_id, end_index = fastlex.MatchJsonStrToken(line, start_pos)
    return tok_id, end_index
|
| 150 |
|
| 151 |
|
# Choose implementations at import time: when the fastlex C extension is
# available, bind the re2c-generated matchers; otherwise fall back to the
# pure-Python regex matchers defined above.  Both branches bind the same set
# of module-level names, so callers are unaffected by the choice.
if fastlex:
    OneToken = _MatchOshToken_Fast
    ECHO_MATCHER = _MatchEchoToken_Fast
    GLOB_MATCHER = _MatchGlobToken_Fast
    PS1_MATCHER = _MatchPS1Token_Fast
    HISTORY_MATCHER = _MatchHistoryToken_Fast
    BRACE_RANGE_MATCHER = _MatchBraceRangeToken_Fast

    MatchJ8Token = _MatchJ8Token_Fast
    MatchJ8LinesToken = _MatchJ8LinesToken_Fast
    MatchJ8StrToken = _MatchJ8StrToken_Fast
    MatchJsonStrToken = _MatchJsonStrToken_Fast

    # These predicates are implemented directly in the extension.
    IsValidVarName = fastlex.IsValidVarName
    ShouldHijack = fastlex.ShouldHijack
    LooksLikeInteger = fastlex.LooksLikeInteger
    LooksLikeFloat = fastlex.LooksLikeFloat
else:
    # Slow path: compile the same pattern definitions with the `re` module
    # (which is only imported when fastlex is absent).
    OneToken = _MatchOshToken_Slow(lexer_def.LEXER_DEF)
    ECHO_MATCHER = _MatchTokenSlow(lexer_def.ECHO_E_DEF)
    GLOB_MATCHER = _MatchTokenSlow(lexer_def.GLOB_DEF)
    PS1_MATCHER = _MatchTokenSlow(lexer_def.PS1_DEF)
    HISTORY_MATCHER = _MatchTokenSlow(lexer_def.HISTORY_DEF)
    BRACE_RANGE_MATCHER = _MatchTokenSlow(lexer_def.BRACE_RANGE_DEF)

    MatchJ8Token = _MatchTokenSlow(lexer_def.J8_DEF)
    MatchJ8LinesToken = _MatchTokenSlow(lexer_def.J8_LINES_DEF)
    MatchJ8StrToken = _MatchTokenSlow(lexer_def.J8_STR_DEF)
    MatchJsonStrToken = _MatchTokenSlow(lexer_def.JSON_STR_DEF)

    # Used by osh/cmd_parse.py to validate for loop name.  Note it must be
    # anchored on the right (with '$'), so the whole string must match.
    _VAR_NAME_RE = re.compile(lexer_def.VAR_NAME_RE + '$')  # type: ignore

    def IsValidVarName(s):
        # type: (str) -> bool
        """Return True if s is a valid shell variable name."""
        return bool(_VAR_NAME_RE.match(s))

    # yapf: disable
    _SHOULD_HIJACK_RE = re.compile(lexer_def.SHOULD_HIJACK_RE + '$')  # type: ignore

    def ShouldHijack(s):
        # type: (str) -> bool
        """Return True if s (entirely) matches the shebang-hijack pattern."""
        return bool(_SHOULD_HIJACK_RE.match(s))

    _LOOKS_LIKE_INTEGER_RE = re.compile(lexer_def.LOOKS_LIKE_INTEGER + '$')  # type: ignore

    def LooksLikeInteger(s):
        # type: (str) -> bool
        """Return True if the whole string matches the integer pattern."""
        return bool(_LOOKS_LIKE_INTEGER_RE.match(s))

    _LOOKS_LIKE_FLOAT_RE = re.compile(lexer_def.LOOKS_LIKE_FLOAT + '$')  # type: ignore
    # yapf: enable

    def LooksLikeFloat(s):
        # type: (str) -> bool
        """Return True if the whole string matches the float pattern."""
        return bool(_LOOKS_LIKE_FLOAT_RE.match(s))
|
| 210 |
|
| 211 |
|
class SimpleLexer(object):
    """Stateful cursor that repeatedly applies a match function to a string."""

    def __init__(self, match_func, s):
        # type: (SimpleMatchFunc, str) -> None
        self.match_func = match_func
        self.s = s
        self.pos = 0  # current offset into s

    def Next(self):
        # type: () -> Tuple[Id_t, str]
        """Advance past one token and return (id, value).

        Note: match_func will return Id.Eol_Tok repeatedly the terminating NUL
        """
        tok_id, end_index = self.match_func(self.s, self.pos)
        piece = self.s[self.pos:end_index]
        self.pos = end_index
        return tok_id, piece

    def Tokens(self):
        # type: () -> List[Tuple[Id_t, str]]
        """Consume the whole string and return the list of (id, value) pairs."""
        results = []  # type: List[Tuple[Id_t, str]]
        tok_id, val = self.Next()
        while tok_id != Id.Eol_Tok:  # NUL terminator ends the stream
            results.append((tok_id, val))
            tok_id, val = self.Next()
        return results
|
| 239 |
|
| 240 |
|
| 241 | # Iterated over in builtin/io_osh.py
|
def EchoLexer(s):
    # type: (str) -> SimpleLexer
    """Construct a SimpleLexer over s using the echo matcher."""
    lexer = SimpleLexer(ECHO_MATCHER, s)
    return lexer
|
| 245 |
|
| 246 |
|
def BraceRangeLexer(s):
    # type: (str) -> SimpleLexer
    """Construct a SimpleLexer over s using the brace-range matcher."""
    lexer = SimpleLexer(BRACE_RANGE_MATCHER, s)
    return lexer
|
| 250 |
|
| 251 |
|
def GlobLexer(s):
    # type: (str) -> SimpleLexer
    """Construct a SimpleLexer over s using the glob matcher."""
    lexer = SimpleLexer(GLOB_MATCHER, s)
    return lexer
|
| 255 |
|
| 256 |
|
| 257 | # These tokens are "slurped"
|
| 258 |
|
| 259 |
|
def HistoryTokens(s):
    # type: (str) -> List[Tuple[Id_t, str]]
    """Lex the entire string with the history matcher; return all tokens."""
    return SimpleLexer(HISTORY_MATCHER, s).Tokens()
|
| 264 |
|
| 265 |
|
def Ps1Tokens(s):
    # type: (str) -> List[Tuple[Id_t, str]]
    """Lex the entire string with the PS1 matcher; return all tokens."""
    return SimpleLexer(PS1_MATCHER, s).Tokens()
|
| 270 |
|
| 271 |
|
| 272 | #
|
| 273 | # builtin/bracket_osh.py
|
| 274 | #
|
| 275 |
|
| 276 |
|
def BracketUnary(s):
    # type: (str) -> Id_t
    """Look up a [ unary operator; Id.Undefined_Tok if s isn't one."""
    from _devbuild.gen.id_kind import TEST_UNARY_LOOKUP  # break circular dep
    try:
        return TEST_UNARY_LOOKUP[s]
    except KeyError:
        return Id.Undefined_Tok
|
| 281 |
|
| 282 |
|
def BracketBinary(s):
    # type: (str) -> Id_t
    """Look up a [ binary operator; Id.Undefined_Tok if s isn't one."""
    from _devbuild.gen.id_kind import TEST_BINARY_LOOKUP  # break circular dep
    try:
        return TEST_BINARY_LOOKUP[s]
    except KeyError:
        return Id.Undefined_Tok
|
| 287 |
|
| 288 |
|
def BracketOther(s):
    # type: (str) -> Id_t
    """Look up another [ operator; Id.Undefined_Tok if s isn't one."""
    from _devbuild.gen.id_kind import TEST_OTHER_LOOKUP  # break circular dep
    try:
        return TEST_OTHER_LOOKUP[s]
    except KeyError:
        return Id.Undefined_Tok
|