1 | """
|
2 | lex.py -- Shell lexer.
|
3 |
|
4 | It consists of a series of lexer modes, each with a regex -> Id mapping.
|
5 |
|
6 | NOTE: If this changes, the lexer may need to be recompiled with
|
7 | build/codegen.sh lexer.
|
8 |
|
9 | Input Handling
|
10 | --------------
|
11 |
|
12 | Note that our style of input Handling affects the regular expressions in the
|
13 | lexer.
|
14 |
|
15 | We pass one line at a time to the Lexer, via LineLexer. We must be able to
|
16 | parse one line at a time because of interactive parsing (e.g. using the output
|
17 | of GNU readline.)
|
18 |
|
19 | There are two ways we could handle input:
|
20 |
|
21 | 1. Every line is NUL terminated:
|
22 | 'one\n\0' 'last line\0'
|
23 | 2. Every line is terminated by NUL, except the last:
|
24 | 'one\n' 'last line\0'
|
25 |
|
26 | The advantage of #2 is that in the common case of reading files, we don't have
|
27 | to do it one line at a time. We could slurp the whole file in, or mmap() it,
|
28 | etc.
|
29 |
|
30 | The second option makes the regular expressions more complicated, so I'm
|
31 | punting on it for now. We assume the first.
|
32 |
|
33 | That means:
|
34 |
|
35 | - No regexes below should match \0. They are added by
|
36 | core/lexer_gen.py for re2c.
|
37 |
|
38 | For example, [^']+ is not valid. [^'\0]+ is correct. Otherwise we would read
|
39 | uninitialized memory past the sentinel.
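
For instance, a quick illustration with Python's re (not part of this
module):

    >>> import re
    >>> re.match(r"[^'\0]+", "abc\0garbage").group(0)
    'abc'

The \0 sentinel stops the scan, so the garbage past it is never read.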

Python's regex engine knows where the end of the input string is, so it
doesn't need a sentinel like \0.

Note that re2c is not able to work in a mode with a strict length limit.
Presumably it would require too many extra checks, and the language would
then no longer be regular!

http://re2c.org/examples/example_03.html

UPDATE: Two More Options
------------------------

3. Change the \n at the end of every line to \0. \0 becomes Id.Op_Newline, at
least in lex_mode.OUTER.

Advantage: This makes the regular expressions easier to generate, and allows
you to read in the whole file at once instead of allocating lines.

Disadvantages:
- You can't mmap() the file because the data is mutated. Or it will have to be
  copy-on-write.
- You can't get rid of comment lines if you read the whole file.

4. Read a line at a time. Throw away the lines, unless you're parsing a
function, which should be obvious.

After you parse the function, you can COPY all the tokens to another location.
Very few tokens need their actual text data. Most of them can just be
identified by ID.

Tokens whose contents are relevant:

- Lit_Chars, Lit_Other, Lit_EscapedChar, Lit_Digits
- Id.Lit_VarLike -- for the name, and for = vs +=
- Id.Lit_ArithVarLike
- VSub_Name, VSub_Number
- Id.Redir_* for the LHS file descriptor. Although this is one or two bytes
  that could be copied.

You can also take this opportunity to enter the strings in an intern table.
How much memory would that save?
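
A minimal sketch of such an intern table (hypothetical; this module doesn't
implement it):

    _INTERNED = {}

    def Intern(s):
        # Return the canonical copy of s, storing it on first sight.
        return _INTERNED.setdefault(s, s)

Equal token strings would then share a single object.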

Remaining constructs
--------------------

Case terminators:
  ;;&  Op_DSemiAmp for case
  ;&   Op_SemiAmp

Left Index:

  _VAR_NAME_RE + '\['  Lit_LeftIndexLikeOpen
  ]=                   Lit_LeftIndexLikeClose

Indexed array and Associative array literals:
  declare -A a=([key]=value [key2]=value2)
  declare -a a=([1 + 2]=value [3 + 4]=value2)  # parsed!

  Lit_LBracket   Lit_RBracketEqual
  Left_Bracket,  Right_BracketEqual?
  Op_LBracket    Op_RBracketEqual
"""

import re

from osh.meta import Id, Kind, ID_SPEC
from core.lexer import C, R

from osh.meta import types

lex_mode_e = types.lex_mode_e


# In oil, I hope to have these lexer modes:
# COMMAND
# EXPRESSION (takes the place of ARITH, VS_UNQ_ARG, VS_DQ_ARG)
# SQ RAW_SQ DQ RAW_DQ
# VS -- a single state here? Or switches into expression state, because }
# is an operator
# Problem: DICT_KEY might be a different state, to accept either a bare word
# foo, or an expression (X=a+2), which is allowed in shell. Python doesn't
# allow unquoted words, but we want to.

# TODO: There are 4 shared groups here. I think you should test whether that
# structure should be preserved through re2c. Do a benchmark.
#
# If a group has no matches, then return Id.Unknown_Tok? And then you can
# chain the groups in order. It might make sense to experiment with the order
# too.

_BACKSLASH = [
  R(r'\\[^\n\0]', Id.Lit_EscapedChar),
  C('\\\n', Id.Ignored_LineCont),
]

_VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# Used by osh/cmd_parse.py to validate the for loop name. Note it must be
# anchored on the right.
VAR_NAME_RE = re.compile(_VAR_NAME_RE + '$')
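
# A quick illustration of the anchoring (hypothetical inputs; not used by the
# parser itself):
#
#   VAR_NAME_RE.match('foo_1')    # matches: a valid for loop name
#   VAR_NAME_RE.match('foo-bar')  # None: the '$' rejects the trailing '-bar'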

# All Kind.VSub
_VARS = [
  # Unbraced variables
  R(r'\$' + _VAR_NAME_RE, Id.VSub_Name),
  R(r'\$[0-9]', Id.VSub_Number),
  C(r'$!', Id.VSub_Bang),
  C(r'$@', Id.VSub_At),
  C(r'$#', Id.VSub_Pound),
  C(r'$$', Id.VSub_Dollar),
  C(r'$*', Id.VSub_Star),
  C(r'$-', Id.VSub_Hyphen),
  C(r'$?', Id.VSub_QMark),
]

# Kind.Left that are valid in double-quoted modes.
_LEFT_SUBS = [
  C('`', Id.Left_Backtick),
  C('$(', Id.Left_CommandSub),
  C('${', Id.Left_VarSub),
  C('$((', Id.Left_ArithSub),
  C('$[', Id.Left_ArithSub2),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
  C('"', Id.Left_DoubleQuote),
  C("'", Id.Left_SingleQuote),
  C('$"', Id.Left_DollarDoubleQuote),
  C("$'", Id.Left_DollarSingleQuote),

  C('<(', Id.Left_ProcSubIn),
  C('>(', Id.Left_ProcSubOut),
]

# Constructs used:
# Character classes [] with simple ranges and negation, +, *, \n, \0
# It would be nice to express this as CRE ... ? And then compile to re2c
# syntax. And Python syntax.

# NOTE: Should remain compatible with re2c syntax, for code gen.
# http://re2c.org/manual/syntax/syntax.html

# PROBLEM: \0 in Python re vs \000 in re2c? Can this be unified?
# Yes, Python allows \000 octal escapes.
#
# https://docs.python.org/2/library/re.html

LEXER_DEF = {}  # TODO: Should be a list so we enforce order.

# Anything until the end of the line is a comment. Does not match the newline
# itself. We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.COMMENT] = [
  R(r'[^\n\0]*', Id.Ignored_Comment)
]

_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
  # NOTE: We could add anything 128 and above to this character class? So
  # UTF-8 characters don't get split?
  R(r'[a-zA-Z0-9_/.-]+', Id.Lit_Chars),
  # e.g. the beginning of NAME=val, which will always be longer than the
  # above Id.Lit_Chars match (see the tokenization example after this list).
  R(r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', Id.Lit_VarLike),

  C('#', Id.Lit_Pound),  # For comments

  # Needs to be LONGER than any other match.
  #(_VAR_NAME_RE + r'\[', Id.Lit_Maybe_LHS_ARRAY),
  # Id.Lit_Maybe_LHS_ARRAY2
  #(r'\]\+?=', Id.Lit_Maybe_ARRAY_ASSIGN_RIGHT),

  # For brace expansion {a,b}
  C('{', Id.Lit_LBrace),
  C('}', Id.Lit_RBrace),  # Also for var sub ${a}
  C(',', Id.Lit_Comma),
  C('~', Id.Lit_Tilde),  # For tilde expansion

  R(r'[ \t\r]+', Id.WS_Space),

  C('\n', Id.Op_Newline),

  C('&', Id.Op_Amp),
  C('|', Id.Op_Pipe),
  C('|&', Id.Op_PipeAmp),
  C('&&', Id.Op_DAmp),
  C('||', Id.Op_DPipe),
  C(';', Id.Op_Semi),
  C(';;', Id.Op_DSemi),

  C('(', Id.Op_LParen),
  C(')', Id.Op_RParen),

  R(r'[0-9]*<', Id.Redir_Less),
  R(r'[0-9]*>', Id.Redir_Great),
  R(r'[0-9]*<<', Id.Redir_DLess),
  R(r'[0-9]*<<<', Id.Redir_TLess),
  R(r'[0-9]*>>', Id.Redir_DGreat),
  R(r'[0-9]*<<-', Id.Redir_DLessDash),
  R(r'[0-9]*>&', Id.Redir_GreatAnd),
  R(r'[0-9]*<&', Id.Redir_LessAnd),
  R(r'[0-9]*<>', Id.Redir_LessGreat),
  R(r'[0-9]*>\|', Id.Redir_Clobber),

  R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
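
# A rough sketch of how these rules tokenize a line, assuming the
# longest-match semantics the comments above rely on (ties broken by rule
# order):
#
#   echo FOO=bar >out.txt
#
#   'echo'     Lit_Chars
#   ' '        WS_Space
#   'FOO='     Lit_VarLike  (longer than the Lit_Chars match 'FOO')
#   'bar'      Lit_Chars
#   ' '        WS_Space
#   '>'        Redir_Great
#   'out.txt'  Lit_Chars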

# In OUTER and DBRACKET states.
_EXTGLOB_BEGIN = [
  C('@(', Id.ExtGlob_At),
  C('*(', Id.ExtGlob_Star),
  C('+(', Id.ExtGlob_Plus),
  C('?(', Id.ExtGlob_QMark),
  C('!(', Id.ExtGlob_Bang),
]

_KEYWORDS = [
  # NOTE: { is matched elsewhere
  C('[[', Id.KW_DLeftBracket),
  C('!', Id.KW_Bang),
  C('for', Id.KW_For),
  C('while', Id.KW_While),
  C('until', Id.KW_Until),
  C('do', Id.KW_Do),
  C('done', Id.KW_Done),
  C('in', Id.KW_In),
  C('case', Id.KW_Case),
  C('esac', Id.KW_Esac),
  C('if', Id.KW_If),
  C('fi', Id.KW_Fi),
  C('then', Id.KW_Then),
  C('else', Id.KW_Else),
  C('elif', Id.KW_Elif),
  C('function', Id.KW_Function),
  C('time', Id.KW_Time),
]

# These are treated like builtins in bash, but keywords in OSH. However, we
# maintain compatibility with bash for the 'type' builtin.
_MORE_KEYWORDS = [
  C('declare', Id.Assign_Declare),
  C('typeset', Id.Assign_Typeset),
  C('local', Id.Assign_Local),
  C('readonly', Id.Assign_Readonly),

  C('break', Id.ControlFlow_Break),
  C('continue', Id.ControlFlow_Continue),
  C('return', Id.ControlFlow_Return),
  C('exit', Id.ControlFlow_Exit),
]


_TYPE_KEYWORDS = set(name for _, name, _ in _KEYWORDS)
_TYPE_KEYWORDS.add('{')  # not in our lexer list
_TYPE_BUILTINS = set(name for _, name, _ in _MORE_KEYWORDS)


def IsOtherBuiltin(name):
  return name in _TYPE_BUILTINS


def IsKeyword(name):
  return name in _TYPE_KEYWORDS
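

# Each C() pair above is an (is_regex, pattern, id) tuple, so 'name' in the
# set comprehensions is the literal keyword string. A sketch of how the
# 'type' builtin might use these helpers (hypothetical call site):
#
#   if IsKeyword(name):          # e.g. 'if', 'while', '{'
#     print('%s is a shell keyword' % name)
#   elif IsOtherBuiltin(name):   # e.g. 'declare', 'exit'
#     print('%s is a shell builtin' % name)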


# These must be recognized in the OUTER state, but can't be nested within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.OUTER] = [
  C('((', Id.Op_DLeftParen),  # not allowed within [[
] + _KEYWORDS + _MORE_KEYWORDS + _UNQUOTED + _EXTGLOB_BEGIN
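
# The ordering relies on ties going to the earlier rule; longer matches still
# win, e.g. (assuming longest-match semantics):
#
#   'if'    ->  <KW_If 'if'>        keyword rule listed first
#   'iffy'  ->  <Lit_Chars 'iffy'>  longer match beats the 'if' keyword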

# DBRACKET: can be like OUTER, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBRACKET] = [
  C(']]', Id.Lit_DRightBracket),
  C('!', Id.KW_Bang),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces and
# punctuation. We also accept \, $var, ${var}, "", etc. They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.EXTGLOB] = \
    _BACKSLASH + _LEFT_SUBS + _VARS + _EXTGLOB_BEGIN + [
  R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
  C('|', Id.Op_Pipe),
  C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
  R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]
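
# A rough sketch of how EXTGLOB tokenizes @(foo|*.py), after the outer mode
# has already emitted ExtGlob_At for '@(':
#
#   'foo'  Lit_Chars
#   '|'    Op_Pipe
#   '*'    Lit_Other  (only special when followed by '(')
#   '.py'  Lit_Chars
#   ')'    Op_RParen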


LEXER_DEF[lex_mode_e.BASH_REGEX] = [
  # Match these literals first, and then the rest of the OUTER state I guess.
  # That's how bash works.
  #
  # At a minimum, you do need $ and ~ expansions to happen. <>;& could have
  # been allowed unescaped too, but that's not what bash does. The criterion
  # was whether they were "special" in both languages, which seems dubious.
  C('(', Id.Lit_Chars),
  C(')', Id.Lit_Chars),
  C('|', Id.Lit_Chars),
] + [
  # Avoid the "unreachable rule" error
  (is_regex, pat, re_list) for (is_regex, pat, re_list) in _UNQUOTED
  if is_regex or pat not in ('(', ')', '|')
]
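
# A rough sketch: in [[ $x =~ (a|b) ]], the regex part lexes as
#
#   '('  Lit_Chars  (constant rule above, not Op_LParen)
#   'a'  Lit_Chars
#   '|'  Lit_Chars  (not Op_Pipe)
#   'b'  Lit_Chars
#   ')'  Lit_Chars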


LEXER_DEF[lex_mode_e.DQ] = [
  # Only 4 characters are backslash escaped inside "".
  # https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
  R(r'\\[$`"\\]', Id.Lit_EscapedChar),
  C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
  R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
  # NOTE: When parsing a here doc line, this token doesn't end it.
  C('"', Id.Right_DoubleQuote),
  R(r'[^\0]', Id.Lit_Other),  # e.g. "$"
]
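
# A rough sketch of how DQ tokenizes "hi $name!", after the outer mode has
# already emitted Left_DoubleQuote for the opening quote:
#
#   'hi '    Lit_Chars
#   '$name'  VSub_Name
#   '!'      Lit_Chars
#   '"'      Right_DoubleQuote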

_VS_ARG_COMMON = _BACKSLASH + [
  C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
  C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
  C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
  C('}', Id.Right_VarSub),  # For var sub "${a}"
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VS_ARG_UNQ] = \
    _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
  # NOTE: added < and > so it doesn't eat <()
  R(r'[^$`/}"\'\0\\#%<>]+', Id.Lit_Chars),
  R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VS_ARG_DQ] = _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
  R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most
  # Weird wart: even in double quoted state, double quotes are allowed
  C('"', Id.Left_DoubleQuote),
  R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ] = [
  R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
  C("'", Id.Right_SingleQuote),
]

# Shared between echo -e and $''.
_C_STRING_COMMON = [

  # \x6 is valid in bash
  R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex),
  R(r'\\u[0-9]{1,4}', Id.Char_Unicode4),
  R(r'\\U[0-9]{1,8}', Id.Char_Unicode8),

  R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

  # Backslash that ends a line. Note '.' doesn't match a newline character.
  C('\\\n', Id.Char_Literals),

  # e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
  # but a lint tool could warn about it.
  C('\\', Id.Char_BadBackslash),
]

# Used by ECHO_LEXER in core/builtin.py.
ECHO_E_DEF = _C_STRING_COMMON + [
  # Note: tokens above \0377 can either be truncated or flagged as a syntax
  # error in strict mode.
  R(r'\\0[0-7]{1,3}', Id.Char_Octal4),

  C(r'\c', Id.Char_Stop),

  # A bad backslash shouldn't end the string. We allow it, but a lint tool
  # should warn about it.
  R(r'\\$', Id.Char_BadBackslash),

  # e.g. 'foo', anything that's not a backslash escape
  R(r'[^\\]+', Id.Char_Literals),
]
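
# For example, in bash, echo -e 'foo\cbar' prints 'foo' with no newline:
# \c lexes as Char_Stop, which tells echo to suppress all further output.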

# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of $'' is that it supports other backslash escapes like \n! It just
# becomes a regular backslash.
LEXER_DEF[lex_mode_e.DOLLAR_SQ] = _C_STRING_COMMON + [
  # Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
  # with no leading 0.
  R(r'\\[0-7]{1,3}', Id.Char_Octal3),

  # ' is escaped in $'' mode, but not in echo -e. Ditto for ", not sure why.
  C(r"\'", Id.Char_OneChar),
  C(r'\"', Id.Char_OneChar),

  # e.g. 'foo', anything that's not a backslash escape. Need to exclude ' as
  # well.
  R(r"[^\\'\0]+", Id.Char_Literals),

  C("'", Id.Right_SingleQuote),

  # Backslash that ends the file! Caught by the re2c exhaustiveness check.
  # The parser will assert; it should give a better syntax error.
  C('\\\0', Id.Unknown_Tok),
]
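
# Concretely, both of these produce 'A' (octal 101):
#
#   echo -e '\0101'   # Char_Octal4 matches '\0101'
#   echo $'\101'      # Char_Octal3 matches '\101'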

LEXER_DEF[lex_mode_e.VS_1] = [
  R(_VAR_NAME_RE, Id.VSub_Name),
  # ${11} is valid, compared to $11, which is $1 and then a literal 1 (see
  # the example after this list).
  R(r'[0-9]+', Id.VSub_Number),
  C('!', Id.VSub_Bang),
  C('@', Id.VSub_At),
  C('#', Id.VSub_Pound),
  C('$', Id.VSub_Dollar),
  C('*', Id.VSub_Star),
  C('-', Id.VSub_Hyphen),
  C('?', Id.VSub_QMark),

  C('}', Id.Right_VarSub),

  C('\\\n', Id.Ignored_LineCont),

  C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
  R(r'[^\0]', Id.Unknown_Tok),  # any other char (newline is caught above)
]
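
# So inside ${...}, '11' lexes as a single VSub_Number token:
#
#   ${11}  ->  Left_VarSub '${', VSub_Number '11', Right_VarSub '}'
#
# while $11 in OUTER mode is VSub_Number '$1' followed by Lit_Chars '1'.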

LEXER_DEF[lex_mode_e.VS_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + [
  C('}', Id.Right_VarSub),

  C('\\\n', Id.Ignored_LineCont),
  C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
  R(r'[^\0]', Id.Unknown_Tok),  # any other char (newline is caught above)
]

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.ARITH] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
  # newline is ignored space, unlike in OUTER
  R(r'[ \t\r\n]+', Id.Ignored_Space),

  # Examples of arith constants:
  #   64#azAZ
  #   0xabc 0xABC
  #   0123
  # A separate digits token makes this easier to parse STATICALLY. But this
  # doesn't help with DYNAMIC parsing. (See the example after this list.)
  R(_VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
  R(r'[0-9]+', Id.Lit_Digits),
  C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
  C('#', Id.Lit_Pound),  # for 64#a

  # TODO: 64#@ interferes with VS_AT. Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + [
  C('\\\n', Id.Ignored_LineCont),
  R(r'[^\0]', Id.Unknown_Tok)  # any char. This should be a syntax error.
]
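
# A rough sketch: the base-64 constant 64#azAZ lexes as three tokens,
#
#   '64'    Lit_Digits
#   '#'     Lit_Pound
#   'azAZ'  Lit_ArithVarLike
#
# which the arith parser can then reassemble into one constant.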

# Notes on BASH_REGEX states
#
# From the bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
#   matched as a string.
# - Bracket expressions in regular expressions must be treated carefully,
#   since normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
#   expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From the code: ( | ) are treated specially.
|