| 1 | """
 | 
| 2 | lex.py -- Shell lexer.
 | 
| 3 | 
 | 
| 4 | It consists of a series of lexer modes, each with a regex -> Id mapping.
 | 
| 5 | 
 | 
| 6 | NOTE: If this changes, the lexer may need to be recompiled with
 | 
| 7 | build/codegen.sh lexer.
 | 
| 8 | 
 | 
| 9 | Input Handling
 | 
| 10 | --------------
 | 
| 11 | 
 | 
Note that our style of input handling affects the regular expressions in the
 | 
| 13 | lexer.
 | 
| 14 | 
 | 
| 15 | We pass one line at a time to the Lexer, via LineLexer.  We must be able to
 | 
| 16 | parse one line at a time because of interactive parsing (e.g. using the output
 | 
| 17 | of GNU readline.)
 | 
| 18 | 
 | 
| 19 | There are two ways we could handle input:
 | 
| 20 | 
 | 
| 21 |   1. Every line is NUL terminated:
 | 
| 22 |      'one\n\0' 'last line\0'
 | 
| 23 |   2. Every line is terminated by NUL, except the last:
 | 
| 24 |      'one\n' 'last line\0'
 | 
| 25 | 
 | 
| 26 | The advantage of #2 is that in the common case of reading files, we don't have
 | 
| 27 | to do it one line at a time.  We could slurp the whole file in, or mmap() it,
 | 
| 28 | etc.
 | 
| 29 | 
 | 
| 30 | The second option makes the regular expressions more complicated, so I'm
 | 
| 31 | punting on it for now.  We assume the first.
 | 
| 32 | 
 | 
| 33 | That means:
 | 
| 34 | 
 | 
| 35 |   - No regexes below should match \0.  They are added by
 | 
| 36 |     core/lexer_gen.py for re2c.
 | 
| 37 | 
 | 
| 38 | For example, [^']+ is not valid.  [^'\0]+ is correct.  Otherwise we would read
 | 
| 39 | uninitialized memory past the sentinel.
 | 
| 40 | 
 | 
| 41 | Python's regex engine knows where the end of the input string is, so it
 | 
doesn't need a sentinel like \0.
 | 
| 43 | 
 | 
| 44 | Note that re2c is not able to work in a mode with a strict length limit.  It
 | 
| 45 | would cause too many extra checks?  The language is then no longer regular!
 | 
| 46 | 
 | 
| 47 | http://re2c.org/examples/example_03.html
 | 
| 48 | 
 | 
| 49 | UPDATE: Two More Options
 | 
| 50 | ------------------------
 | 
| 51 | 
 | 
| 52 | 3. Change the \n at the end of every line to \0.  \0 becomes Id.Op_Newline, at
 | 
| 53 | least in lex_mode.OUTER.
 | 
| 54 | 
 | 
| 55 | Advantage: This makes the regular expressions easier to generate, but allows
 | 
| 56 | you to read in the whole file at once instead of allocating lines.
 | 
| 57 | 
 | 
| 58 | Disadvantages:
 | 
| 59 | - You can't mmap() the file because the data is mutated.  Or it will have to be
 | 
| 60 |   copy-on-write.
 | 
| 61 | - You can't get rid of comment lines if you read the whole file.
 | 
| 62 | 
 | 
| 63 | 4. Read a line at a time.  Throw away the lines, unless you're parsing a
 | 
| 64 | function, which should be obvious.
 | 
| 65 | 
 | 
| 66 | After you parse the function, you can COPY all the tokens to another location.
 | 
| 67 | Very few tokens need their actual text data.  Most of them can just be
 | 
| 68 | identified by ID.
 | 
| 69 | 
 | 
| 70 | Contents are relevant:
 | 
| 71 | 
 | 
| 72 | - Lit_Chars, Lit_Other, Lit_EscapedChar, Lit_Digits
 | 
| 73 | - Id.Lit_VarLike -- for the name, and for = vs +=
 | 
| 74 | - Id.Lit_ArithVarLike
 | 
| 75 | - VSub_Name, VSub_Number
 | 
| 76 | - Id.Redir_* for the LHS file descriptor.  Although this is one or two bytes
 | 
| 77 |   that could be copied.
 | 
| 78 | 
 | 
| 79 | You can also take this opportunity to enter the strings in an intern table.
 | 
| 80 | How much memory would that save?
 | 
| 81 | 
 | 
| 82 | Remaining constructs
 | 
| 83 | --------------------
 | 
| 84 | 
 | 
| 85 | Case terminators:
 | 
| 86 |   ;;&                  Op_DSemiAmp  for case
 | 
| 87 |   ;&                   Op_Semi
 | 
| 88 | 
 | 
| 89 | Left Index:
 | 
| 90 | 
 | 
| 91 |   _VAR_NAME_RE + '\['  Lit_LeftIndexLikeOpen
 | 
| 92 |   ]=                   Lit_LeftIndexLikeClose
 | 
| 93 | 
 | 
| 94 | Indexed array and Associative array literals:
 | 
| 95 |   declare -A a=([key]=value [key2]=value2)
 | 
| 96 |   declare -a a=([1 + 2]=value [3 + 4]=value2)  # parsed!
 | 
| 97 | 
 | 
| 98 |   Lit_LBracket Lit_RBracketEqual
 | 
| 99 |   Left_Bracket, Right_BracketEqual?
 | 
| 100 |   Op_LBracket Op_RBracketEqual
 | 
| 101 | """
 | 
| 102 | 
 | 
| 103 | import re
 | 
| 104 | 
 | 
| 105 | from osh.meta import Id, Kind, ID_SPEC
 | 
| 106 | from core.lexer import C, R
 | 
| 107 | 
 | 
| 108 | from osh.meta import types
 | 
| 109 | 
 | 
| 110 | lex_mode_e = types.lex_mode_e
 | 
| 111 | 
 | 
| 112 | 
 | 
| 113 | # In oil, I hope to have these lexer modes:
 | 
| 114 | # COMMAND
 | 
| 115 | # EXPRESSION (takes place of ARITH, VS_UNQ_ARG, VS_DQ_ARG)
 | 
| 116 | # SQ  RAW_SQ  DQ  RAW_DQ
 | 
| 117 | # VS    -- a single state here?  Or switches into expression state, because }
 | 
| 118 | #          is an operator
 | 
| 119 | # Problem: DICT_KEY might be a different state, to accept either a bare word
 | 
| 120 | # foo, or an expression (X=a+2), which is allowed in shell.  Python doesn't
 | 
# allow unquoted words, but we want to.
 | 
| 122 | 
 | 
| 123 | # TODO: There are 4 shared groups here.  I think you should test if that
 | 
| 124 | # structure should be preserved through re2c.  Do a benchmark.
 | 
| 125 | #
 | 
| 126 | # If a group has no matches, then return Id.Unknown_Tok?  And then you can
 | 
| 127 | # chain the groups in order.  It might make sense to experiment with the order
 | 
| 128 | # too.
 | 
| 129 | 
 | 
| 130 | _BACKSLASH = [
 | 
| 131 |   R(r'\\[^\n\0]', Id.Lit_EscapedChar),
 | 
| 132 |   C('\\\n', Id.Ignored_LineCont),
 | 
| 133 | ]
 | 
| 134 | 
 | 
| 135 | _VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
 | 
| 136 | 
 | 
| 137 | # Used by osh/cmd_parse.py to validate for loop name.  Note it must be
 | 
| 138 | # anchored on the right.
 | 
| 139 | VAR_NAME_RE = re.compile(_VAR_NAME_RE + '$')
 | 
| 140 | 
 | 
| 141 | # All Kind.VSub
 | 
| 142 | _VARS = [
 | 
| 143 |   # Unbraced variables
 | 
| 144 |   R(r'\$' + _VAR_NAME_RE, Id.VSub_Name),
 | 
| 145 |   R(r'\$[0-9]', Id.VSub_Number),
 | 
| 146 |   C(r'$!', Id.VSub_Bang),
 | 
| 147 |   C(r'$@', Id.VSub_At),
 | 
| 148 |   C(r'$#', Id.VSub_Pound),
 | 
| 149 |   C(r'$$', Id.VSub_Dollar),
 | 
| 150 |   C(r'$*', Id.VSub_Star),
 | 
| 151 |   C(r'$-', Id.VSub_Hyphen),
 | 
| 152 |   C(r'$?', Id.VSub_QMark),
 | 
| 153 | ]
 | 
| 154 | 
 | 
| 155 | # Kind.Left that are valid in double-quoted modes.
 | 
| 156 | _LEFT_SUBS = [
 | 
| 157 |   C('`', Id.Left_Backtick),
 | 
| 158 |   C('$(', Id.Left_CommandSub),
 | 
| 159 |   C('${', Id.Left_VarSub),
 | 
| 160 |   C('$((', Id.Left_ArithSub),
 | 
| 161 |   C('$[', Id.Left_ArithSub2),
 | 
| 162 | ]
 | 
| 163 | 
 | 
| 164 | # Additional Kind.Left that are valid in unquoted modes.
 | 
| 165 | _LEFT_UNQUOTED = [
 | 
| 166 |   C('"', Id.Left_DoubleQuote),
 | 
| 167 |   C("'", Id.Left_SingleQuote),
 | 
| 168 |   C('$"', Id.Left_DollarDoubleQuote),
 | 
| 169 |   C("$'", Id.Left_DollarSingleQuote),
 | 
| 170 | 
 | 
| 171 |   C('<(', Id.Left_ProcSubIn),
 | 
| 172 |   C('>(', Id.Left_ProcSubOut),
 | 
| 173 | ]
 | 
| 174 | 
 | 
| 175 | # Constructs used:
 | 
| 176 | # Character classes [] with simple ranges and negation, +, *, \n, \0
 | 
| 177 | # It would be nice to express this as CRE ... ?  And then compile to re2c
 | 
| 178 | # syntax.  And Python syntax.
 | 
| 179 | 
 | 
| 180 | # NOTE: Should remain compatible with re2c syntax, for code gen.
 | 
| 181 | # http://re2c.org/manual/syntax/syntax.html
 | 
| 182 | 
 | 
| 183 | # PROBLEM: \0 in Python re vs \000 in re2?  Can this be unified?
 | 
| 184 | # Yes, Python allows \000 octal escapes.
 | 
| 185 | #
 | 
| 186 | # https://docs.python.org/2/library/re.html
 | 
| 187 | 
 | 
| 188 | LEXER_DEF = {}  # TODO: Should be a list so we enforce order.
 | 
| 189 | 
 | 
| 190 | # Anything until the end of the line is a comment.  Does not match the newline
 | 
| 191 | # itself.  We want to switch modes and possibly process Op_Newline for here
 | 
| 192 | # docs, etc.
 | 
| 193 | LEXER_DEF[lex_mode_e.COMMENT] = [
 | 
| 194 |   R(r'[^\n\0]*', Id.Ignored_Comment)
 | 
| 195 | ]
 | 
| 196 | 
 | 
| 197 | _UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
 | 
| 198 |   # NOTE: We could add anything 128 and above to this character class?  So
 | 
| 199 |   # utf-8 characters don't get split?
 | 
| 200 |   R(r'[a-zA-Z0-9_/.-]+', Id.Lit_Chars),
 | 
| 201 |   # e.g. beginning of NAME=val, which will always be longer than the above
 | 
| 202 |   # Id.Lit_Chars.
 | 
| 203 |   R(r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', Id.Lit_VarLike),
 | 
| 204 | 
 | 
| 205 |   C('#', Id.Lit_Pound),  # For comments
 | 
| 206 | 
 | 
| 207 |   # Needs to be LONGER than any other
 | 
| 208 |   #(_VAR_NAME_RE + r'\[', Id.Lit_Maybe_LHS_ARRAY),
 | 
| 209 |   # Id.Lit_Maybe_LHS_ARRAY2
 | 
| 210 |   #(r'\]\+?=', Id.Lit_Maybe_ARRAY_ASSIGN_RIGHT),
 | 
| 211 | 
 | 
| 212 |   # For brace expansion {a,b}
 | 
| 213 |   C('{', Id.Lit_LBrace),
 | 
| 214 |   C('}', Id.Lit_RBrace),  # Also for var sub ${a}
 | 
| 215 |   C(',', Id.Lit_Comma),
 | 
| 216 |   C('~', Id.Lit_Tilde),  # For tilde expansion
 | 
| 217 | 
 | 
| 218 |   R(r'[ \t\r]+', Id.WS_Space),
 | 
| 219 | 
 | 
| 220 |   C('\n', Id.Op_Newline),
 | 
| 221 | 
 | 
| 222 |   C('&', Id.Op_Amp),
 | 
| 223 |   C('|', Id.Op_Pipe),
 | 
| 224 |   C('|&', Id.Op_PipeAmp),
 | 
| 225 |   C('&&', Id.Op_DAmp),
 | 
| 226 |   C('||', Id.Op_DPipe),
 | 
| 227 |   C(';', Id.Op_Semi),
 | 
| 228 |   C(';;', Id.Op_DSemi),
 | 
| 229 | 
 | 
| 230 |   C('(', Id.Op_LParen),
 | 
| 231 |   C(')', Id.Op_RParen),
 | 
| 232 | 
 | 
| 233 |   R(r'[0-9]*<', Id.Redir_Less),
 | 
| 234 |   R(r'[0-9]*>', Id.Redir_Great),
 | 
| 235 |   R(r'[0-9]*<<', Id.Redir_DLess),
 | 
| 236 |   R(r'[0-9]*<<<', Id.Redir_TLess),
 | 
| 237 |   R(r'[0-9]*>>', Id.Redir_DGreat),
 | 
| 238 |   R(r'[0-9]*<<-', Id.Redir_DLessDash),
 | 
| 239 |   R(r'[0-9]*>&', Id.Redir_GreatAnd),
 | 
| 240 |   R(r'[0-9]*<&', Id.Redir_LessAnd),
 | 
| 241 |   R(r'[0-9]*<>', Id.Redir_LessGreat),
 | 
| 242 |   R(r'[0-9]*>\|', Id.Redir_Clobber),
 | 
| 243 | 
 | 
| 244 |   R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
 | 
| 245 | ]
 | 
| 246 | 
 | 
| 247 | # In OUTER and DBRACKET states.
 | 
| 248 | _EXTGLOB_BEGIN = [
 | 
| 249 |   C('@(', Id.ExtGlob_At),
 | 
| 250 |   C('*(', Id.ExtGlob_Star),
 | 
| 251 |   C('+(', Id.ExtGlob_Plus),
 | 
| 252 |   C('?(', Id.ExtGlob_QMark),
 | 
| 253 |   C('!(', Id.ExtGlob_Bang),
 | 
| 254 | ]
 | 
| 255 | 
 | 
| 256 | _KEYWORDS = [
 | 
| 257 |   # NOTE: { is matched elsewhere
 | 
| 258 |   C('[[',       Id.KW_DLeftBracket),
 | 
| 259 |   C('!',        Id.KW_Bang),
 | 
| 260 |   C('for',      Id.KW_For),
 | 
| 261 |   C('while',    Id.KW_While),
 | 
| 262 |   C('until',    Id.KW_Until),
 | 
| 263 |   C('do',       Id.KW_Do),
 | 
| 264 |   C('done',     Id.KW_Done),
 | 
| 265 |   C('in',       Id.KW_In),
 | 
| 266 |   C('case',     Id.KW_Case),
 | 
| 267 |   C('esac',     Id.KW_Esac),
 | 
| 268 |   C('if',       Id.KW_If),
 | 
| 269 |   C('fi',       Id.KW_Fi),
 | 
| 270 |   C('then',     Id.KW_Then),
 | 
| 271 |   C('else',     Id.KW_Else),
 | 
| 272 |   C('elif',     Id.KW_Elif),
 | 
| 273 |   C('function', Id.KW_Function),
 | 
| 274 |   C('time',     Id.KW_Time),
 | 
| 275 | ]
 | 
| 276 | 
 | 
| 277 | # These are treated like builtins in bash, but keywords in OSH.  However, we
 | 
| 278 | # main compatibility with bash for the 'type' builtin.
 | 
| 279 | _MORE_KEYWORDS = [
 | 
| 280 |   C('declare',  Id.Assign_Declare),
 | 
| 281 |   C('typeset',  Id.Assign_Typeset),
 | 
| 282 |   C('local',    Id.Assign_Local),
 | 
| 283 |   C('readonly', Id.Assign_Readonly),
 | 
| 284 | 
 | 
| 285 |   C('break',    Id.ControlFlow_Break),
 | 
| 286 |   C('continue', Id.ControlFlow_Continue),
 | 
| 287 |   C('return',   Id.ControlFlow_Return),
 | 
| 288 |   C('exit',     Id.ControlFlow_Exit),
 | 
| 289 | ]
 | 
| 290 | 
 | 
| 291 | 
 | 
| 292 | _TYPE_KEYWORDS = set(name for _, name, _ in _KEYWORDS)
 | 
| 293 | _TYPE_KEYWORDS.add('{')  # not in our lexer list
 | 
| 294 | _TYPE_BUILTINS = set(name for _, name, _ in _MORE_KEYWORDS)
 | 
| 295 | 
 | 
| 296 | 
 | 
| 297 | def IsOtherBuiltin(name):
 | 
| 298 |   return name in _TYPE_BUILTINS
 | 
| 299 | 
 | 
| 300 | 
 | 
| 301 | def IsKeyword(name):
 | 
| 302 |   return name in _TYPE_KEYWORDS
 | 
| 303 | 
 | 
| 304 | 
 | 
| 305 | # These two can must be recognized in the OUTER state, but can't nested within
 | 
| 306 | # [[.
 | 
| 307 | # Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
 | 
| 308 | # of <Lit_Chars "if">.
 | 
| 309 | LEXER_DEF[lex_mode_e.OUTER] = [
 | 
| 310 |   C('((', Id.Op_DLeftParen),  # not allowed within [[
 | 
| 311 | ] + _KEYWORDS + _MORE_KEYWORDS + _UNQUOTED + _EXTGLOB_BEGIN
 | 
| 312 | 
 | 
| 313 | # DBRACKET: can be like OUTER, except:
 | 
| 314 | # - Don't really need redirects either... Redir_Less could be Op_Less
 | 
| 315 | # - Id.Op_DLeftParen can't be nested inside.
 | 
| 316 | LEXER_DEF[lex_mode_e.DBRACKET] = [
 | 
| 317 |   C(']]', Id.Lit_DRightBracket),
 | 
| 318 |   C('!', Id.KW_Bang),
 | 
| 319 | ] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
 | 
| 320 |     ID_SPEC.LexerPairs(Kind.BoolBinary) + \
 | 
| 321 |     _UNQUOTED + _EXTGLOB_BEGIN
 | 
| 322 | 
 | 
| 323 | # Inside an extended glob, most characters are literals, including spaces and
 | 
| 324 | # punctuation.  We also accept \, $var, ${var}, "", etc.  They can also be
 | 
| 325 | # nested, so _EXTGLOB_BEGIN appears here.
 | 
| 326 | #
 | 
| 327 | # Example: echo @(<> <>|&&|'foo'|$bar)
 | 
| 328 | LEXER_DEF[lex_mode_e.EXTGLOB] = \
 | 
| 329 |     _BACKSLASH + _LEFT_SUBS + _VARS + _EXTGLOB_BEGIN + [
 | 
| 330 |   R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
 | 
| 331 |   C('|', Id.Op_Pipe),
 | 
| 332 |   C(')', Id.Op_RParen),  # maybe be translated to Id.ExtGlob_RParen
 | 
| 333 |   R(r'[^\0]', Id.Lit_Other),  # everything else is literal
 | 
| 334 | ]
 | 
| 335 | 
 | 
| 336 | 
 | 
| 337 | LEXER_DEF[lex_mode_e.BASH_REGEX] = [
 | 
| 338 |   # Match these literals first, and then the rest of the OUTER state I guess.
 | 
| 339 |   # That's how bash works.
 | 
| 340 |   #
 | 
| 341 |   # At a minimum, you do need $ and ~ expansions to happen.  <>;& could have
 | 
| 342 |   # been allowed unescaped too, but that's not what bash does.  The criteria
 | 
| 343 |   # was whether they were "special" in both languages, which seems dubious.
 | 
| 344 |   C('(', Id.Lit_Chars),
 | 
| 345 |   C(')', Id.Lit_Chars),
 | 
| 346 |   C('|', Id.Lit_Chars),
 | 
| 347 | ] + [
 | 
| 348 |   # Avoid "unreachable rule error"
 | 
| 349 |   (is_regex, pat, re_list) for 
 | 
| 350 |   (is_regex, pat, re_list) in _UNQUOTED
 | 
| 351 |   if not (is_regex == False and pat in ('(', ')', '|'))
 | 
| 352 | ]
 | 
| 353 | 
 | 
| 354 | 
 | 
| 355 | LEXER_DEF[lex_mode_e.DQ] = [
 | 
| 356 |   # Only 4 characters are backslash escaped inside "".
 | 
| 357 |   # https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
 | 
| 358 |   R(r'\\[$`"\\]', Id.Lit_EscapedChar),
 | 
| 359 |   C('\\\n', Id.Ignored_LineCont),
 | 
| 360 | ] + _LEFT_SUBS + _VARS + [
 | 
| 361 |   R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
 | 
| 362 |   # NOTE: When parsing here doc line, this token doesn't end it.
 | 
| 363 |   C('"', Id.Right_DoubleQuote),
 | 
| 364 |   R(r'[^\0]', Id.Lit_Other),  # e.g. "$"
 | 
| 365 | ]
 | 
| 366 | 
 | 
| 367 | _VS_ARG_COMMON = _BACKSLASH + [
 | 
| 368 |   C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
 | 
| 369 |   C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
 | 
| 370 |   C('%', Id.Lit_Percent),  # for patsdub suffix (not Id.VOp1_Percent)
 | 
| 371 |   C('}', Id.Right_VarSub),  # For var sub "${a}"
 | 
| 372 | ]
 | 
| 373 | 
 | 
| 374 | # Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
 | 
| 375 | LEXER_DEF[lex_mode_e.VS_ARG_UNQ] = \
 | 
| 376 |   _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
 | 
| 377 |   # NOTE: added < and > so it doesn't eat <()
 | 
| 378 |   R(r'[^$`/}"\'\0\\#%<>]+', Id.Lit_Chars),
 | 
| 379 |   R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
 | 
| 380 | ]
 | 
| 381 | 
 | 
| 382 | # Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
 | 
| 383 | LEXER_DEF[lex_mode_e.VS_ARG_DQ] = _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
 | 
| 384 |   R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most
 | 
| 385 |   # Weird wart: even in double quoted state, double quotes are allowed
 | 
| 386 |   C('"', Id.Left_DoubleQuote),
 | 
| 387 |   R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
 | 
| 388 | ]
 | 
| 389 | 
 | 
| 390 | # NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
 | 
| 391 | # state.
 | 
| 392 | LEXER_DEF[lex_mode_e.SQ] = [
 | 
| 393 |   R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
 | 
| 394 |   C("'", Id.Right_SingleQuote),
 | 
| 395 | ]
 | 
| 396 | 
 | 
| 397 | # Shared between echo -e and $''.
 | 
| 398 | _C_STRING_COMMON = [
 | 
| 399 | 
 | 
| 400 |   # \x6 is valid in bash
 | 
| 401 |   R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex),
 | 
| 402 |   R(r'\\u[0-9]{1,4}', Id.Char_Unicode4),
 | 
| 403 |   R(r'\\U[0-9]{1,8}', Id.Char_Unicode8),
 | 
| 404 | 
 | 
| 405 |   R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),
 | 
| 406 | 
 | 
| 407 |   # Backslash that ends a line.  Note '.' doesn't match a newline character.
 | 
| 408 |   C('\\\n', Id.Char_Literals),
 | 
| 409 | 
 | 
| 410 |   # e.g. \A is not an escape, and \x doesn't match a hex escape.  We allow it,
 | 
| 411 |   # but a lint tool could warn about it.
 | 
| 412 |   C('\\', Id.Char_BadBackslash),
 | 
| 413 | ]
 | 
| 414 | 
 | 
| 415 | # Used by ECHO_LEXER in core/builtin.py.
 | 
| 416 | ECHO_E_DEF = _C_STRING_COMMON + [
 | 
| 417 |   # Note: tokens above \0377 can either be truncated or be flagged a syntax
 | 
| 418 |   # error in strict mode.
 | 
| 419 |   R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
 | 
| 420 | 
 | 
| 421 |   C(r'\c', Id.Char_Stop),
 | 
| 422 | 
 | 
| 423 |   # Bad Backslash should not end the string.  We allow it, but a lint tool
 | 
| 424 |   # should warn about it.
 | 
| 425 |   R(r'\\$', Id.Char_BadBackslash),
 | 
| 426 | 
 | 
| 427 |   # e.g. 'foo', anything that's not a backslash escape
 | 
| 428 |   R(r'[^\\]+', Id.Char_Literals),
 | 
| 429 | ]
 | 
| 430 | 
 | 
| 431 | # NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
 | 
| 432 | # point of it is that supports other backslash escapes like \n!  It just
 | 
| 433 | # becomes a regular backslash.
 | 
| 434 | LEXER_DEF[lex_mode_e.DOLLAR_SQ] = _C_STRING_COMMON + [
 | 
| 435 |   # Silly difference!  In echo -e, the syntax is \0377, but here it's $'\377',
 | 
| 436 |   # with no leading 0.
 | 
| 437 |   R(r'\\[0-7]{1,3}', Id.Char_Octal3),
 | 
| 438 | 
 | 
| 439 |   # ' is escaped in $'' mode, but not echo -e.  Ditto fr ", not sure why.
 | 
| 440 |   C(r"\'", Id.Char_OneChar),
 | 
| 441 |   C(r'\"', Id.Char_OneChar),
 | 
| 442 | 
 | 
| 443 |   # e.g. 'foo', anything that's not a backslash escape.  Need to exclude ' as
 | 
| 444 |   # well.
 | 
| 445 |   R(r"[^\\'\0]+", Id.Char_Literals),
 | 
| 446 | 
 | 
| 447 |   C("'", Id.Right_SingleQuote),
 | 
| 448 | 
 | 
| 449 |   # Backslash that ends the file!  Caught by re2c exhaustiveness check.  Parser
 | 
| 450 |   # will assert; should give a better syntax error.
 | 
| 451 |   C('\\\0', Id.Unknown_Tok),
 | 
| 452 | ]
 | 
| 453 | 
 | 
| 454 | LEXER_DEF[lex_mode_e.VS_1] = [
 | 
| 455 |   R(_VAR_NAME_RE, Id.VSub_Name),
 | 
| 456 |   #  ${11} is valid, compared to $11 which is $1 and then literal 1.
 | 
| 457 |   R(r'[0-9]+', Id.VSub_Number),
 | 
| 458 |   C('!', Id.VSub_Bang),
 | 
| 459 |   C('@', Id.VSub_At),
 | 
| 460 |   C('#', Id.VSub_Pound),
 | 
| 461 |   C('$', Id.VSub_Dollar),
 | 
| 462 |   C('*', Id.VSub_Star),
 | 
| 463 |   C('-', Id.VSub_Hyphen),
 | 
| 464 |   C('?', Id.VSub_QMark),
 | 
| 465 | 
 | 
| 466 |   C('}', Id.Right_VarSub),
 | 
| 467 | 
 | 
| 468 |   C('\\\n', Id.Ignored_LineCont),
 | 
| 469 | 
 | 
| 470 |   C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
 | 
| 471 |   R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
 | 
| 472 | ]
 | 
| 473 | 
 | 
| 474 | LEXER_DEF[lex_mode_e.VS_2] = \
 | 
| 475 |     ID_SPEC.LexerPairs(Kind.VTest) + \
 | 
| 476 |     ID_SPEC.LexerPairs(Kind.VOp1) + \
 | 
| 477 |     ID_SPEC.LexerPairs(Kind.VOp2) + [
 | 
| 478 |   C('}', Id.Right_VarSub),
 | 
| 479 | 
 | 
| 480 |   C('\\\n', Id.Ignored_LineCont),
 | 
| 481 |   C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
 | 
| 482 |   R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
 | 
| 483 | ]
 | 
| 484 | 
 | 
| 485 | # https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
 | 
| 486 | LEXER_DEF[lex_mode_e.ARITH] = \
 | 
| 487 |     _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
 | 
| 488 |   # newline is ignored space, unlike in OUTER
 | 
| 489 |   R(r'[ \t\r\n]+', Id.Ignored_Space),
 | 
| 490 | 
 | 
| 491 |   # Examples of arith constants:
 | 
| 492 |   #   64#azAZ
 | 
| 493 |   #   0xabc 0xABC
 | 
| 494 |   #   0123
 | 
| 495 |   # A separate digits token makes this easier to parse STATICALLY.  But this
 | 
| 496 |   # doesn't help with DYNAMIC parsing.
 | 
| 497 |   R(_VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
 | 
| 498 |   R(r'[0-9]+', Id.Lit_Digits),
 | 
| 499 |   C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
 | 
| 500 |   C('#', Id.Lit_Pound),  # for 64#a
 | 
| 501 | 
 | 
| 502 | # TODO: 64#@ interferes with VS_AT.  Hm.
 | 
| 503 | ] + ID_SPEC.LexerPairs(Kind.Arith) + [
 | 
| 504 |   C('\\\n', Id.Ignored_LineCont),
 | 
| 505 |   R(r'[^\0]', Id.Unknown_Tok)  # any char.  This should be a syntax error.
 | 
| 506 | ]
 | 
| 507 | 
 | 
| 508 | # Notes on BASH_REGEX states
 | 
| 509 | #
 | 
| 510 | # From bash manual:
 | 
| 511 | #
 | 
| 512 | # - Any part of the pattern may be quoted to force the quoted portion to be
 | 
| 513 | # matched as a string.
 | 
| 514 | # - Bracket expressions in regular expressions must be treated carefully, since
 | 
| 515 | # normal quoting characters lose their meanings between brackets.
 | 
| 516 | # - If the pattern is stored in a shell variable, quoting the variable
 | 
| 517 | # expansion forces the entire pattern to be matched as a string.
 | 
| 518 | #
 | 
| 519 | # Is there a re.escape function?  It's just like EscapeGlob and UnescapeGlob.
 | 
| 520 | #
 | 
| 521 | # TODO: For testing, write a script to extract and save regexes... and compile
 | 
| 522 | # them with regcomp.  I've only seen constant regexes.
 | 
| 523 | #
 | 
| 524 | # From code: ( | ) are treated special.
 |