| 1 | """
 | 
| 2 | lexer_def.py - Lexing for OSH, YSH, and J8 Notation.
 | 
| 3 | 
 | 
| 4 | The OSH/YSH lexer has lexer modes, each with a regex -> Id mapping.
 | 
| 5 | 
 | 
| 6 | After changing this file, run:
 | 
| 7 | 
 | 
| 8 |     build/py.sh all
 | 
| 9 | 
 | 
| 10 | or at least:
 | 
| 11 | 
 | 
| 12 |     build/py.sh fastlex
 | 
| 13 | 
 | 
| 14 | Input Handling
 | 
| 15 | --------------
 | 
| 16 | 
 | 
| 17 | Every line is NUL terminated:
 | 
| 18 | 
 | 
| 19 |     'one\n\0' 'last line\0'
 | 
| 20 | 
 | 
| 21 | which means that no regexes below should match \0.
 | 
| 22 | 
 | 
| 23 | For example, use [^'\0]+ instead of [^']+ .
 | 
| 24 | 
 | 
| 25 | If this rule isn't followed, we would read uninitialized memory past the
 | 
| 26 | sentinel.  Python's regex engine knows where the end of the input string is, so
 | 
| 27 | it doesn't require need a sentinel like \0.
 | 
| 28 | 
 | 
| 29 | The frontend/lexer_gen.py generator adds a pattern mapping \0 to Id.Eol_Tok.
 | 
| 30 | """

from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
from _devbuild.gen.types_asdl import lex_mode_e

from frontend import id_kind_def

from typing import Tuple

# Initialize spec that the lexer depends on.
ID_SPEC = id_kind_def.IdSpec({}, {})

id_kind_def.AddKinds(ID_SPEC)
id_kind_def.AddBoolKinds(ID_SPEC)  # must come second
id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})


def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star)"""
    return (False, pat, tok_type)


def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a regex string, e.g. R('\$[0-9]', VSub_Number)"""
    return (True, pat, tok_type)

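
# A minimal sketch of how the (is_regex, pat, id) rule tuples are applied,
# mimicking re2c's longest-match behavior.  This is an illustration only, with
# a hypothetical Id.Unknown_Tok fallback; the real matchers are generated by
# frontend/lexer_gen.py and live in frontend/match.py.
def _NaiveLongestMatch(rules, line, pos):
    # Returns (token Id, end position).  'rules' is a list of (is_regex, pat,
    # tok_id) tuples as built by C() and R() above.
    import re
    best_id, best_end = Id.Unknown_Tok, pos  # hypothetical fallback
    for is_regex, pat, tok_id in rules:
        if not is_regex:
            pat = re.escape(pat)
        m = re.match(pat, line[pos:])
        if m and pos + m.end() > best_end:  # strict > keeps the earlier rule
            best_id, best_end = tok_id, pos + m.end()
    if best_end == pos:  # nothing matched; skip one byte
        best_end = pos + 1
    return best_id, best_end


# e.g. _NaiveLongestMatch([C('|', Id.Op_Pipe), C('||', Id.Op_DPipe)], 'a||b', 1)
# gives (Id.Op_DPipe, 3) -- the longer constant wins regardless of rule order.
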
# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
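
# For example (illustration with Python's re, not the re2c build):
#
#     >>> import re
#     >>> bool(re.match(SHOULD_HIJACK_RE, '#!/bin/sh\necho hi\n'))
#     True
#     >>> bool(re.match(SHOULD_HIJACK_RE, '#!/usr/bin/env bash\necho hi\n'))
#     True
#     >>> bool(re.match(SHOULD_HIJACK_RE, '#!/usr/bin/python\nprint(1)\n'))
#     False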

# Separates words (\r is not whitespace here)
_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)

_BACKSLASH = [
    # To be conservative, we could deny a set of chars similar to
    # _LITERAL_WHITELIST_REGEX, rather than allowing all the operator characters
    # like \( and \;.
    #
    # strict_backslash makes this stricter.
    R(r'\\[^\n\0]', Id.Lit_EscapedChar),
    C('\\\n', Id.Ignored_LineCont),
]

# Only 4 characters are backslash escaped inside "".
# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
_DQ_BACKSLASH = [
    R(r'\\[$`"\\]', Id.Lit_EscapedChar),
    C('\\', Id.Lit_BadBackslash),  # syntax error in YSH, but NOT in OSH
]

VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# All Kind.VSub
_VARS = [
    # Unbraced variables
    R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
    R(r'\$[0-9]', Id.VSub_Number),
    C(r'$!', Id.VSub_Bang),
    C(r'$@', Id.VSub_At),
    C(r'$#', Id.VSub_Pound),
    C(r'$$', Id.VSub_Dollar),
    C(r'$*', Id.VSub_Star),
    C(r'$-', Id.VSub_Hyphen),
    C(r'$?', Id.VSub_QMark),
]

# Kind.Left that are valid in double-quoted modes.
_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    # Parse zsh syntax, but don't execute it.
    # The examples we've seen so far are like ${(%):-} and ${(m)
    R(r'\$\{\([^)\0]+\)', Id.Left_DollarBraceZsh),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    C("'", Id.Left_SingleQuote),
    C('$"', Id.Left_DollarDoubleQuote),
    C("$'", Id.Left_DollarSingleQuote),
]

_LEFT_PROCSUB = [
    C('<(', Id.Left_ProcSubIn),
    C('>(', Id.Left_ProcSubOut),
]

# The regexes below are in Python syntax, but are translated to re2c syntax by
# frontend/lexer_gen.py.
#
# http://re2c.org/manual/syntax/syntax.html
# https://docs.python.org/2/library/re.html
#
# We use a limited set of constructs:
# - + and * for repetition
# - Character classes [] with simple ranges and negation
# - Escapes like \n \0

LEXER_DEF = {}  # TODO: Should be a list so we enforce order.

# Anything until the end of the line is a comment.  Does not match the newline
# itself.  We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]

# A whitelist to make bigger Lit_Chars tokens.  We don't want one byte at a time.
#
# The shell language says that "any other byte" is a literal character --
# for example, unquoted $ \ ! are literal, not a syntax error.
#
# That is, a literal is defined NEGATIVELY, for a single character.  But here
# we define a SUBSET of literal chars POSITIVELY.

# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_.\-]+'
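
# For instance (illustration with Python's re):
#
#     >>> import re
#     >>> re.findall(_LITERAL_WHITELIST_REGEX, 'echo hi-there $x')
#     ['echo', 'hi-there', 'x']
#
# In the byte-oriented re2c lexer, every byte of a multi-byte UTF-8 sequence
# falls in \x80-\xff, so such sequences stay inside a single Lit_Chars token.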

_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class?  So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    C('~', Id.Lit_Tilde),  # for tilde sub
    C('/', Id.Lit_Slash),  # also for tilde sub
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    # Case terminators
    C(';;', Id.Op_DSemi),
    C(';&', Id.Op_SemiAmp),
    C(';;&', Id.Op_DSemiAmp),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]

# In ShCommand and DBracket states.
_EXTGLOB_BEGIN = [
    C(',(', Id.ExtGlob_Comma),  # YSH synonym for @(...)
    C('@(', Id.ExtGlob_At),
    C('*(', Id.ExtGlob_Star),
    C('+(', Id.ExtGlob_Plus),
    C('?(', Id.ExtGlob_QMark),
    C('!(', Id.ExtGlob_Bang),
]

KEYWORDS = [
    # NOTE: { is matched elsewhere
    C('[[', Id.KW_DLeftBracket),
    C('!', Id.KW_Bang),
    C('for', Id.KW_For),
    C('while', Id.KW_While),
    C('until', Id.KW_Until),
    C('do', Id.KW_Do),
    C('done', Id.KW_Done),
    C('in', Id.KW_In),
    C('case', Id.KW_Case),
    C('esac', Id.KW_Esac),
    C('if', Id.KW_If),
    C('fi', Id.KW_Fi),
    C('then', Id.KW_Then),
    C('else', Id.KW_Else),
    C('elif', Id.KW_Elif),
    C('function', Id.KW_Function),
    C('time', Id.KW_Time),

    # YSH
    C('const', Id.KW_Const),  # maybe remove this
    C('var', Id.KW_Var),
    C('setvar', Id.KW_SetVar),
    C('setglobal', Id.KW_SetGlobal),
    C('call', Id.KW_Call),
    C('proc', Id.KW_Proc),
    C('typed', Id.KW_Typed),
    C('func', Id.KW_Func),
]

# These are treated like builtins in bash, but keywords in OSH.  However, we
# maintain compatibility with bash for the 'type' builtin.
CONTROL_FLOW = [
    C('break', Id.ControlFlow_Break),
    C('continue', Id.ControlFlow_Continue),
    C('return', Id.ControlFlow_Return),
    C('exit', Id.ControlFlow_Exit),
]

# Used by ysh/grammar_gen.py too
EXPR_WORDS = [
    C('null', Id.Expr_Null),
    C('true', Id.Expr_True),
    C('false', Id.Expr_False),
    C('and', Id.Expr_And),
    C('or', Id.Expr_Or),
    C('not', Id.Expr_Not),
    C('for', Id.Expr_For),
    C('is', Id.Expr_Is),
    C('in', Id.Expr_In),
    C('if', Id.Expr_If),
    C('else', Id.Expr_Else),

    # Unused: could be for function literals, although we also have
    # |x| x+1 lambdas
    C('func', Id.Expr_Func),

    # / <capture d+/
    C('capture', Id.Expr_Capture),
    # / <capture d+ as date> /
    C('as', Id.Expr_As),
]

FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'

# file descriptors can only have two digits, like mksh
# dash/zsh/etc. can have one
FD_NUM = r'[0-9]?[0-9]?'
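
# For example (illustration with Python's re): FD_NUM is optional, so both
# plain and numbered redirects match the same rules below.
#
#     >>> import re
#     >>> re.match(FD_NUM + r'>', '2> err.txt').group()
#     '2>'
#     >>> re.match(FD_NUM + r'>', '> out.txt').group()
#     '>'
#     >>> re.match(FD_VAR_NAME + r'>', '{fd}> out.txt').group()
#     '{fd}>'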

# These must be recognized in the ShCommand state, but can't be nested within
# [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    R(FD_NUM + r'>\|', Id.Redir_Clobber),
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + KEYWORDS + CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN
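
# For example (illustration with Python's re), assignment-like prefixes win
# because they are longer matches than a plain name:
#
#     >>> import re
#     >>> re.match(VAR_NAME_RE + r'\+?=', 'PATH+=/bin').group()
#     'PATH+='
#     >>> re.match(VAR_NAME_RE + r'\+?=', 'x=1').group()
#     'x='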

# Preprocessing before ShCommand
LEXER_DEF[lex_mode_e.Backtick] = [
    C(r'`', Id.Backtick_Right),
    # A backslash, and then $ or ` or \
    R(r'\\[$`\\]', Id.Backtick_Quoted),
    # \" treated specially, depending on whether backticks are double-quoted!
    R(r'\\"', Id.Backtick_DoubleQuote),
    R(r'[^`\\\0]+', Id.Backtick_Other),  # contiguous run of literals
    R(r'[^\0]', Id.Backtick_Other),  # anything else
]

# DBRACKET: can be like ShCommand, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBracket] = [
    C(']]', Id.Lit_DRightBracket),
    # Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
    # in addition to [[ ! a && b ]]
    C('!', Id.KW_Bang),
    C('<', Id.Op_Less),
    C('>', Id.Op_Great),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces and
# punctuation.  We also accept \, $var, ${var}, "", etc.  They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
    R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
    C('|', Id.Op_Pipe),
    C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
    R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
# matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
# normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
# expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function?  It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp.  I've only seen constant regexes.
#
# bash code: ( | ) are special

LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
    # Like lex_mode_e.ShCommand
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),

    # Tokens for Tilde sub.  bash weirdness: RHS of [[ x =~ ~ ]] is expanded
    C('~', Id.Lit_Tilde),
    C('/', Id.Lit_Slash),

    # Id.WS_Space delimits words.  In lex_mode_e.BashRegexFakeInner, we
    # translate them to Id.Lit_Chars.
    _SIGNIFICANT_SPACE,

    # Analogous to Id.ExtGlob_* - we need to change lexer modes when we hit this
    C('(', Id.BashRegex_LParen),

    # Not special, this is like lex_mode_e.Outer
    C(')', Id.Op_RParen),

    # Copied and adapted from _UNQUOTED
    # \n & ; < > are parse errors OUTSIDE a group   [[ s =~ ; ]]
    #            but become allowed INSIDE a group  [[ s =~ (;) ]]
    C('\n', Id.BashRegex_AllowedInParens),
    C('&', Id.BashRegex_AllowedInParens),
    C(';', Id.BashRegex_AllowedInParens),
    C('>', Id.BashRegex_AllowedInParens),
    C('<', Id.BashRegex_AllowedInParens),

    # e.g. | is Id.Lit_Other, not pipe operator
    R(r'[^\0]', Id.Lit_Other),  # like _UNQUOTED, any other byte is literal
] + _BACKSLASH  # These have to come after RegexMeta

LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
    C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
    R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
    # NOTE: When parsing here doc line, this token doesn't end it.
    C('"', Id.Right_DoubleQuote),
]

_VS_ARG_COMMON = [
    C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
    C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
    C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
    C('}', Id.Right_DollarBrace),  # For var sub "${a}"
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
]

# We don't execute zsh var subs, but to find the closing } properly, we need
# to recognize \} and '}' and "}" $'}' etc.
LEXER_DEF[lex_mode_e.VSub_Zsh] = \
  _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
  [
    C('}', Id.Right_DollarBrace),  # For var sub "${a}"
    R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
  _BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
  _VARS + _EXTGLOB_BEGIN + [

    # Token for Tilde sub
    C('~', Id.Lit_Tilde),

    # - doesn't match ~ for tilde sub
    # - doesn't match < and > so it doesn't eat <()
    # - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
    #   not enough
    R(r'[^$`~/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
    R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
  _DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [

    C(r'\}', Id.Lit_EscapedChar),  # For "${var-\}}"

    R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most

    # Weird wart: even in double quoted state, double quotes are allowed
    C('"', Id.Left_DoubleQuote),

    # Another weird wart of bash/mksh: $'' is recognized but NOT ''!
    C("$'", Id.Left_DollarSingleQuote),
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ_Raw] = [
    R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
    C("'", Id.Right_SingleQuote),
]

# The main purpose for EXPR_CHARS is in regex literals, e.g. [a-z \t \n].
#
# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
# 0x1234.  And \0 is 0x0.

# In Python:
# chr(0x00012345) == u'\U00012345'
#
# In YSH:
# 0x00012345 == \u{12345}
# chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'

_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)

_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex)  # bash
_X_CHAR_STRICT = R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex)  # YSH

_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4)  # bash

_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4)  # JSON-only

EXPR_CHARS = [
    # This is like Rust.  We don't have the legacy C escapes like \b.

    # NOTE: \' and \" are more readable versions of '"' and "'" in regexes
    R(r'\\[0rtn\\"%s]' % "'", Id.Char_OneChar),
    _X_CHAR_STRICT,

    # Because 'a' is a string, we use the syntax #'a' for char literals.
    # We explicitly leave out #''' because it's confusing.
    # Note: we're not doing utf-8 validation here.
    R(r"#'[^'\0]'", Id.Char_Pound),
    _U_BRACED_CHAR,
]
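

# A minimal decoding sketch for Id.Char_UBraced (hypothetical helper, not the
# real evaluator -- shown only to make the token's payload concrete):
def _DecodeUBraced(tok_str):
    # type: (str) -> str
    # e.g. the token text \u{1F600} -> '\U0001F600', assuming Python 3 chr()
    assert tok_str[:3] in ('\\u{', '\\U{') and tok_str.endswith('}')
    return chr(int(tok_str[3:-1], 16))


# e.g. _DecodeUBraced(r'\u{1F600}') == '\U0001F600'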

# Shared between echo -e and $''.
_C_STRING_COMMON = [

    # \x6 is valid in bash
    _X_CHAR_LOOSE,
    _U4_CHAR_LOOSE,
    R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
    R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

    # e.g. \A is not an escape, and \x doesn't match a hex escape.  We allow it,
    # but a lint tool could warn about it.
    C('\\', Id.Unknown_Backslash),
]

ECHO_E_DEF = _C_STRING_COMMON + [
    # Note: tokens above \0377 can either be truncated or be flagged as a
    # syntax error in strict mode.
    R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
    C(r'\c', Id.Char_Stop),

    # e.g. 'foo', anything that's not a backslash escape
    R(r'[^\\\0]+', Id.Lit_Chars),
]

# https://json.org/

# Note that [0-9] has to come second, because Python chooses the first match.
_JSON_INT = r'-?([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'
_JSON_EXP = r'([eE][-+]?[0-9]+)?'
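
# Composed, these accept JSON numbers (illustration with Python's re, anchored
# with $ for clarity):
#
#     >>> import re
#     >>> _JSON_NUMBER = _JSON_INT + _JSON_FRACTION + _JSON_EXP + '$'
#     >>> bool(re.match(_JSON_NUMBER, '-0.25e+3'))
#     True
#     >>> bool(re.match(_JSON_NUMBER, '007'))  # leading zero rejected
#     False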

# R5RS extended alphabetic characters
# https://groups.csail.mit.edu/mac/ftpdir/scheme-reports/r5rs-html/r5rs_4.html
#
#   ! $ % & * + - . / : < = > ? @ ^ _ ~

# Description from Guile Scheme - https://www.gnu.org/software/guile/manual/html_node/Symbol-Read-Syntax.html
#
# "The read syntax for a symbol is a sequence of letters, digits, and extended
# alphabetic characters, beginning with a character that cannot begin a
# number. In addition, the special cases of +, -, and ... are read as symbols
# even though numbers can begin with +, - or ."
#
# (They should have used regular languages!)

# We take out $ and @ for our splicing syntax, i.e. $unquote and
# @unquote-splicing.  And : for now because we use it for name:value.

# Also note Scheme allows |a b| for symbols with funny chars, and Guile scheme
# allows #{a b}#.  We could use `a b` or (symbol "a b").

J8_SYMBOL_CHARS = r'!%&*+./<=>?^_~-'  # - is last for regex char class

# yapf: disable
J8_SYMBOL_RE = (
    r'[a-zA-Z' + J8_SYMBOL_CHARS + ']' +
    r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
# yapf: enable

_J8_LEFT = [
    C('"', Id.Left_DoubleQuote),  # JSON string
    C('j"', Id.Left_JDoubleQuote),  # JSON string with explicit J8 prefix
    # Three left quotes that are J8 only
    C("u'", Id.Left_USingleQuote),  # unicode string
    C("'", Id.Left_USingleQuote),  # '' is alias for u'' in data, not in code
    C("b'", Id.Left_BSingleQuote),  # byte string
]

J8_DEF = _J8_LEFT + [
    C('[', Id.J8_LBracket),
    C(']', Id.J8_RBracket),
    C('{', Id.J8_LBrace),
    C('}', Id.J8_RBrace),
    C('(', Id.J8_LParen),  # NIL8 only
    C(')', Id.J8_RParen),  # NIL8 only
    C(',', Id.J8_Comma),
    C(':', Id.J8_Colon),
    C('null', Id.J8_Null),
    C('true', Id.J8_Bool),
    C('false', Id.J8_Bool),
    R(_JSON_INT, Id.J8_Int),
    R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),

    # Identifier names come AFTER null true false.
    # - Happens to be the same as shell identifier names.
    # - Note that JS allows $ as an identifier, but we don't.
    # - Used for dict keys / NIL8 field names.
    R(VAR_NAME_RE, Id.J8_Identifier),

    # Symbol is a SUPERSET of Identifier.  The first word in NIL8 can be
    # either Symbol or plain Identifier, but field names can only be
    # Identifier.  JSON8 only has Identifier.
    #R(J8_SYMBOL_RE, Id.J8_Symbol),  # NIL8 only
    R(r'[~!@$%^&*+=|;./<>?-]+', Id.J8_Operator),  # NIL8 only
    R(r'[ \r\t]+', Id.Ignored_Space),
    # A separate token, to count lines for error messages
    C('\n', Id.Ignored_Newline),
    # comment is # until end of line
    # // comments are JavaScript style, but right now we might want them as
    # symbols?
    R(r'#[^\n\0]*', Id.Ignored_Comment),  # J8 only (JSON8, NIL8)

    # This will reject ASCII control chars
    R(r'[^\0]', Id.Unknown_Tok),
]

# Exclude control characters 0x00-0x1f (aka 0-31) in J8 data only (not YSH code)
_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)

J8_LINES_DEF = _J8_LEFT + [
    # not sure if we want \r here - same with lex_mode_e.Expr
    R(r'[ \r\t]+', Id.WS_Space),
    R(r'[\n]', Id.J8_Newline),

    # doesn't match \t, which means tabs are allowed in the middle of unquoted
    # lines
    _ASCII_CONTROL,

    # not space or ' or " or ASCII control or EOF
    R(r'''[^ \t\r\n'"\x00-\x1F]+''', Id.Lit_Chars),
]

# https://json.org list of chars, plus '
_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)

# b'' u'' strings - what's common between code and data.
_J8_STR_COMMON = [
    C("'", Id.Right_SingleQuote),  # end for J8
    _JSON_ONE_CHAR,
    C("\\'", Id.Char_OneChar),  # since ' ends, allow \'
    R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex),  # \yff - J8 only
    _U_BRACED_CHAR,  # \u{123456} - J8 only

    # osh/word_parse.py relies on this.  It has to be consistent with $''
    # lexing, which uses _C_STRING_COMMON
    C('\\', Id.Unknown_Backslash),
]

# Lexer for J8 strings in CODE.
LEXER_DEF[lex_mode_e.J8_Str] = _J8_STR_COMMON + [
    # Don't produce Char_AsciiControl tokens - that's only for data

    # will match invalid UTF-8 - we have a separate validation step
    R(r"[^\\'\0]+", Id.Lit_Chars),
]

# Lexer for J8 string data.
# ASCII control characters are disallowed in DATA, but not CODE!
J8_STR_DEF = _J8_STR_COMMON + [
    _ASCII_CONTROL,
    # will match invalid UTF-8 - we have a separate validation step
    R(r"[^\\'\x00-\x1F]+", Id.Lit_Chars),
]

# Lexer for JSON string data - e.g. "json \" \u1234"
JSON_STR_DEF = [
    C('"', Id.Right_DoubleQuote),  # end for JSON
    _JSON_ONE_CHAR,
    _U4_CHAR_STRICT,  # \u1234 - JSON only

    # High surrogate [\uD800, \uDC00)
    # Low surrogate  [\uDC00, \uE000)
    # This pattern makes it easier to decode.  Unpaired surrogates become
    # Id.Char_Unicode4.
    R(
        r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
        Id.Char_SurrogatePair),
    C('\\', Id.Unknown_Backslash),  # e.g. the \ before bad \z
    _ASCII_CONTROL,

    # Note: This will match INVALID UTF-8.  UTF-8 validation is another step.
    R(r'[^\\"\x00-\x1F]+', Id.Lit_Chars),
]
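

# Sketch of the standard UTF-16 surrogate-pair combination that a consumer of
# Id.Char_SurrogatePair would perform (hypothetical helper, not the real
# decoder), assuming the two \uXXXX payloads are already parsed as integers:
def _CombineSurrogates(hi, lo):
    # type: (int, int) -> int
    assert 0xD800 <= hi < 0xDC00 and 0xDC00 <= lo < 0xE000
    return 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00)


# e.g. _CombineSurrogates(0xD83D, 0xDE00) == 0x1F600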

OCTAL3_RE = r'\\[0-7]{1,3}'

# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-Prompt.html#Controlling-the-Prompt
PS1_DEF = [
    R(OCTAL3_RE, Id.PS_Octal3),
    R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
    # \D{%H:%M} strftime format
    R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
    C(r'\[', Id.PS_LBrace),  # non-printing
    C(r'\]', Id.PS_RBrace),
    R(r'[^\\\0]+', Id.PS_Literals),
    # e.g. \x is not a valid escape.
    C('\\', Id.PS_BadBackslash),
]

# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of it is that it supports other backslash escapes like \n!  It just
# becomes a regular backslash.
LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
    # Weird special case matching bash: backslash that ends a line.  We emit
    # this token literally in OSH, but disable it in YSH.
    C('\\\n', Id.Unknown_Backslash),

    # Silly difference!  In echo -e, the syntax is \0377, but here it's $'\377',
    # with no leading 0.
    R(OCTAL3_RE, Id.Char_Octal3),

    # ' and " are escaped in $'' mode, but not echo -e.
    C(r"\'", Id.Char_OneChar),
    C(r'\"', Id.Char_OneChar),

    # e.g. 'foo', anything that's not a backslash escape or '
    R(r"[^\\'\0]+", Id.Lit_Chars),
    C("'", Id.Right_SingleQuote),
]
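
# Worked example of the "silly difference" above, in plain Python: both
# spellings name the same byte.
#
#     >>> hex(int('377', 8))  # echo -e '\0377' and $'\377' are both 0xff
#     '0xff'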

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
    R(OCTAL3_RE, Id.Char_Octal3),
    R(r"[^%\\\0]+", Id.Lit_Chars),
    C('%%', Id.Format_EscapedPercent),
    C('%', Id.Format_Percent),
]

# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support dsq.  The others we parse to display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]

LEXER_DEF[lex_mode_e.VSub_1] = [
    R(VAR_NAME_RE, Id.VSub_Name),
    #  ${11} is valid, compared to $11 which is $1 and then literal 1.
    R(r'[0-9]+', Id.VSub_Number),
    C('!', Id.VSub_Bang),
    C('@', Id.VSub_At),
    C('#', Id.VSub_Pound),
    C('$', Id.VSub_Dollar),
    C('*', Id.VSub_Star),
    C('-', Id.VSub_Hyphen),
    C('?', Id.VSub_QMark),
    C('.', Id.VSub_Dot),  # ${.myproc builtin sub}
    C('}', Id.Right_DollarBrace),
    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

LEXER_DEF[lex_mode_e.VSub_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp0) + \
    ID_SPEC.LexerPairs(Kind.VOpYsh) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + \
    ID_SPEC.LexerPairs(Kind.VOp3) + [
    C('}', Id.Right_DollarBrace),

    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

_EXPR_ARITH_SHARED = [
    C('\\\n', Id.Ignored_LineCont),
    R(r'[^\0]', Id.Unknown_Tok)  # any char.  This should be a syntax error.
]

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.Arith] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [

    # Arithmetic expressions can cross newlines.
    R(r'[ \t\r\n]+', Id.Ignored_Space),

    # Examples of arith constants:
    #   64#azAZ
    #   0xabc 0xABC
    #   0123
    # A separate digits token makes this easier to parse STATICALLY.  But this
    # doesn't help with DYNAMIC parsing.
    R(VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
    R(r'[0-9]+', Id.Lit_Digits),
    C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
    C('#', Id.Lit_Pound),  # for 64#a

    # TODO: 64#@ interferes with VS_AT.  Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED

# A lexer for the parser that converts globs to extended regexes.  Since we're
# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
# don't need lexer modes here.
GLOB_DEF = [
    # These could be operators in the glob, or just literals in a char class,
    # e.g.  touch '?'; echo [?].
    C('*', Id.Glob_Star),
    C('?', Id.Glob_QMark),

    # For negation.  Treated as operators inside [], but literals outside.
    C('!', Id.Glob_Bang),
    C('^', Id.Glob_Caret),

    # Character classes.
    C('[', Id.Glob_LBracket),
    C(']', Id.Glob_RBracket),

    # There is no whitelist of characters; backslashes are unconditionally
    # removed.  With libc.fnmatch(), the pattern r'\f' matches 'f' but not '\\f'.
    # See libc_test.py.
    R(r'\\[^\0]', Id.Glob_EscapedChar),
    C('\\', Id.Glob_BadBackslash),  # Trailing single backslash

    # For efficiency, combine other characters into a single token,  e.g. 'py' in
    # '*.py' or 'alpha' in '[[:alpha:]]'.
    R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals),  # no regex escaping
    R(r'[^\0]', Id.Glob_OtherLiteral),  # anything else -- examine the char
]

# History expansion.  We're doing this as "pre-lexing" since that's what bash
# and zsh seem to do.  Example:
#
# $ foo=x
# $ echo $
# $ !!foo   # expands to echo $foo and prints x
#
# We can also reuse this in the RootCompleter to expand history interactively.
#
# bash note: handled in lib/readline/histexpand.c.  Quite messy and handles
# quotes AGAIN.
#
# Note: \! gets expanded to literal \! for the real lexer, but no history
# expansion occurs.

HISTORY_DEF = [
    # Common operators.
    R(r'![!*^$]', Id.History_Op),

    # By command number.
    R(r'!-?[0-9]+', Id.History_Num),

    # Search by prefix or substring (optional '?').
    # NOTE: there are no numbers allowed here!  Bash doesn't seem to support it.
    # No hyphen since it conflicts with $-1 too.
    #
    # Required trailing whitespace is there to avoid conflict with [!charclass]
    # and ${!indirect}.  This is a simpler hack than the one bash has.  See
    # frontend/lex_test.py.
    R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),

    # Comment is until end of line
    R(r"#[^\0]*", Id.History_Other),

    # Single quoted, e.g. 'a' or $'\n'.  Terminated by another single quote or
    # end of string.
    R(r"'[^'\0]*'?", Id.History_Other),

    # Runs of chars that are definitely not special
    R(r"[^!\\'#\0]+", Id.History_Other),

    # Escaped characters.  \! disables history
    R(r'\\[^\0]', Id.History_Other),
    # Other single chars, like a trailing \ or !
    R(r'[^\0]', Id.History_Other),
]

BRACE_RANGE_DEF = [
    R(r'-?[0-9]+', Id.Range_Int),
    R(r'[a-zA-Z]', Id.Range_Char),  # just a single character
    R(r'\.\.', Id.Range_Dots),
    R(r'[^\0]', Id.Range_Other),  # invalid
]

#
# YSH lexing
#

# Valid in lex_mode_e.{Expr,DQ}
# Used by ysh/grammar_gen.py
YSH_LEFT_SUBS = [
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$[', Id.Left_DollarBracket),  # TODO: Implement $[x]
]

# Valid in lex_mode_e.Expr, but not valid in DQ
# Used by ysh/grammar_gen.py

YSH_LEFT_UNQUOTED = [
    # Double quoted
    C('"', Id.Left_DoubleQuote),
    C('$"', Id.Left_DollarDoubleQuote),  # $"" is synonym for ""
    C('j"', Id.Left_JDoubleQuote),  # for printing ERROR
    # Single quoted
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),  # legacy
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    C('$"""', Id.Left_DollarTDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays.
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals
    # t = %{
    #    name:Str  age:Int
    #    'andy c'  10
    # }
    # Significant newlines.  No unquoted [], {}

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]

# Used by ysh/grammar_gen.py
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    # NOTE: type expressions are expressions, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]

# Newline is significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    # Like lex_mode_e.Arith, \r is whitespace even without \n
    R(r'[ \t\r]+', Id.Ignored_Space),
]

_WHITESPACE = r'[ \t\r\n]*'  # ASCII whitespace doesn't have legacy \f \v

# Python allows 0 to be written 00 or 0_0_0, which is weird.  But let's be
# consistent, and avoid '00' turning into a float!
_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Used for YSH comparison operators > >= < <=
LOOKS_LIKE_INTEGER = _WHITESPACE + '-?' + _DECIMAL_INT_RE + _WHITESPACE

_FLOAT_RE = (
    _DECIMAL_INT_RE +
    # Unlike Python, exponent can't be like 42e5_000.  There's no use because
    # 1e309 is already inf.  Let's keep our code simple.
    r'(\.' + _DECIMAL_INT_RE + ')?([eE][+\-]?[0-9]+)?')

# Ditto, used for comparison operators
# Added optional -?
# Example: -3_000_000.000_001e12
LOOKS_LIKE_FLOAT = _WHITESPACE + '-?' + _FLOAT_RE + _WHITESPACE
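
# For example (illustration with Python's re; anchored with $ for clarity):
#
#     >>> import re
#     >>> bool(re.match(LOOKS_LIKE_INTEGER + '$', '  -1_000 '))
#     True
#     >>> bool(re.match(LOOKS_LIKE_FLOAT + '$', '-3_000_000.000_001e12'))
#     True
#     >>> bool(re.match(LOOKS_LIKE_FLOAT + '$', '1e5_000'))  # no _ in exponent
#     False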

# Python 3 float literals:

# digitpart     ::=  digit (["_"] digit)*
# fraction      ::=  "." digitpart
# exponent      ::=  ("e" | "E") ["+" | "-"] digitpart
# pointfloat    ::=  [digitpart] fraction | digitpart "."
# exponentfloat ::=  (digitpart | pointfloat) exponent
# floatnumber   ::=  pointfloat | exponentfloat

# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::=  decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::=  nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::=  "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::=  "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::=  "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::=  "1"..."9"
    # digit        ::=  "0"..."9"
    # bindigit     ::=  "0" | "1"
    # octdigit     ::=  "0"..."7"
    # hexdigit     ::=  digit | "a"..."f" | "A"..."F"

    R(_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you can enforce that
    # they have space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),   # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),   # regex postfix

    C('+', Id.Arith_Plus),    # arith infix, regex postfix
    C('-', Id.Arith_Minus),   # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),   # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    # ++ --       -- needed for loops, awk?
    # ! && ||     -- needed for find dialect
    # = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    C('^=', Id.Arith_CaretEqual),  # xor, like Arith_Caret above

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),     # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),      # d.key is alias for d['key']
    C('..', Id.Expr_DDot),    # range 1..5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),   # legacy regex end: /d+ $/ (better written /d+ >/)

    # Reserved for now.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffice.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow), # for df => filter(age > 10)
                            # and match (x) { 1 => "one" }
                            # note: other languages use |>
                            # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED

LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok)
]