1 | #!/usr/bin/env python2
|
2 | """Lex_gen.py."""
|
3 | from __future__ import print_function
|
4 |
|
5 | from _devbuild.gen.id_kind import (TEST_UNARY_LOOKUP, TEST_BINARY_LOOKUP,
|
6 | TEST_OTHER_LOOKUP)
|
7 | from _devbuild.gen.id_kind_asdl import Id_str
|
8 | from _devbuild.gen.types_asdl import lex_mode_str
|
9 |
|
10 | import cStringIO
|
11 | import sys
|
12 | import sre_parse
|
13 | import sre_constants
|
14 |
|
15 | from frontend import lexer_def
|
16 |
|
17 |
|
18 | def PrintTree(re_tree, depth=2):
|
19 | """
|
20 | re_tree: List of children
|
21 | """
|
22 | for child in re_tree:
|
23 | name, arg = child
|
24 | sys.stdout.write(depth * '\t')
|
25 | sys.stdout.write(name)
|
26 | sys.stdout.write(' ')
|
27 | if name == 'in': # character class
|
28 | print('{')
|
29 | PrintTree(arg, depth=depth + 1)
|
30 | sys.stdout.write(depth * '\t')
|
31 | print('}')
|
32 | elif name == 'max_repeat': # repetition
|
33 | min_, max_, children = arg
|
34 | # min = 0 means *, min = 1 means +
|
35 | assert min_ in (0, 1), min_
|
36 | print(min_, max_, '{')
|
37 | PrintTree(children, depth=depth + 1)
|
38 | sys.stdout.write(depth * '\t')
|
39 | print()
|
40 | elif name == 'negate': # Oh this is a ^. It doesn't form a node.
|
41 | assert arg is None
|
42 | print()
|
43 | elif name == 'literal': # Quote \ and " in re2c syntax
|
44 | print(repr(chr(arg)))
|
45 | elif name == 'not_literal': # ditto
|
46 | print(repr(chr(arg)))
|
47 | elif name == 'range': # ascii range
|
48 | begin, end = arg
|
49 | print(repr(chr(begin)), repr(chr(end)))
|
50 | elif name == 'any': # This is the '.' character
|
51 | assert arg is None
|
52 | print()
|
53 | else:
|
54 | raise AssertionError(name)
|
55 |
|
56 | # NOTE: negate and not_literal are sort of duplicated
|
57 |
|
58 |
|
59 | def PrintRegex(pat):
|
60 | re_tree = sre_parse.parse(pat)
|
61 | print('\t\t[')
|
62 | PrintTree(re_tree)
|
63 | print('\t\t]')
|
64 |
|
65 |
|
66 | # re2c literals are inside double quotes, so we need to escape \ and "
|
67 | # But we don't need to do anything with the quoted literal ^
|
68 | LITERAL_META = ['\\', '"']
|
69 | LITERAL_META_CODES = [ord(c) for c in LITERAL_META]
|
70 |
|
71 |
|
72 | def _Literal(arg, char_escapes=LITERAL_META_CODES):
|
73 | if arg == ord('\n'):
|
74 | s = r'\n'
|
75 | elif arg == ord('\r'):
|
76 | s = r'\r'
|
77 | elif arg == ord('\t'):
|
78 | s = r'\t'
|
79 | elif arg >= 0x80 or arg < 0x20: # 0x1f and below
|
80 | s = '\\x%02x' % arg # for \x80-\xff
|
81 | elif arg in char_escapes:
|
82 | s = '\\' + chr(arg)
|
83 | else:
|
84 | s = chr(arg)
|
85 | return s
|
86 |
|
87 |
|
88 | # - means range. Note that re2c gives an error we uselessly escape \^.
|
89 | CHAR_CLASS_META = ['\\', '-', ']']
|
90 | CHAR_CLASS_META_CODES = [ord(c) for c in CHAR_CLASS_META]
|
91 |
|
92 |
|
93 | def _CharClassLiteral(arg):
|
94 | return _Literal(arg, char_escapes=CHAR_CLASS_META_CODES)
|
95 |
|
96 |
|
97 | def TranslateConstant(pat):
|
98 | return '"' + ''.join(_Literal(ord(c)) for c in pat) + '"'
|
99 |
|
100 |
|
101 | def TranslateTree(re_tree, f, in_char_class=False):
|
102 | """
|
103 | re_tree: List of children
|
104 | """
|
105 | for child in re_tree:
|
106 | name, arg = child
|
107 | if name == 'in': # character class
|
108 | f.write('[')
|
109 | TranslateTree(arg, f,
|
110 | in_char_class=True) # list of literals/ranges
|
111 | f.write(']')
|
112 |
|
113 | elif name == 'branch': # |
|
114 | _, branches = arg
|
115 | for i, branch in enumerate(branches):
|
116 | if i != 0:
|
117 | f.write(' | ')
|
118 | TranslateTree(branch, f)
|
119 |
|
120 | elif name == 'max_repeat': # repetition
|
121 | min_, max_, children = arg
|
122 | # min = 0 means *, min = 1 means +
|
123 | #assert min_ in (0, 1), min_
|
124 | TranslateTree(children, f)
|
125 |
|
126 | if min_ == 0 and max_ == 1:
|
127 | f.write('? ')
|
128 |
|
129 | elif max_ == sre_constants.MAXREPEAT:
|
130 | if min_ == 0:
|
131 | f.write('* ')
|
132 | elif min_ == 1:
|
133 | f.write('+ ')
|
134 | else:
|
135 | assert 0, min_
|
136 |
|
137 | else: # re2c also supports [0-7]{1,2} syntax
|
138 | # note: might generated {2,2}
|
139 | f.write('{%d,%d} ' % (min_, max_))
|
140 |
|
141 | elif name == 'negate': # ^ in [^a-z]
|
142 | assert arg is None
|
143 | f.write('^')
|
144 |
|
145 | elif name == 'literal': # Quote \ and " in re2c syntax
|
146 | # TODO: it matters if we're inside a character class
|
147 | #print("literal ARG %r" % arg)
|
148 |
|
149 | if in_char_class:
|
150 | s = _CharClassLiteral(arg)
|
151 | else:
|
152 | s = '"%s" ' % _Literal(arg)
|
153 | f.write(s)
|
154 |
|
155 | elif name == 'not_literal': # ditto
|
156 | assert not in_char_class
|
157 | f.write('[^%s]' % _CharClassLiteral(arg))
|
158 |
|
159 | elif name == 'range': # ascii range
|
160 | begin, end = arg
|
161 | f.write('%s-%s' %
|
162 | (_CharClassLiteral(begin), _CharClassLiteral(end)))
|
163 |
|
164 | elif name == 'any': # This is the '.' character
|
165 | assert arg is None
|
166 | f.write('.')
|
167 |
|
168 | elif name == 'subpattern':
|
169 | _, children = arg # Not sure what the _ is, but it works
|
170 | f.write('(')
|
171 | TranslateTree(children, f)
|
172 | f.write(')')
|
173 |
|
174 | else:
|
175 | if 0:
|
176 | from mycpp.mylib import log
|
177 | log('child %s', child)
|
178 | raise RuntimeError("I don't understand regex construct: %r" % name)
|
179 |
|
180 | # NOTE: negate and not_literal are sort of duplicated
|
181 |
|
182 |
|
183 | def TranslateRegex(pat):
|
184 | re_tree = sre_parse.parse(pat)
|
185 | # For debugging
|
186 | #import pprint
|
187 | #print(pprint.pformat(re_tree), file=sys.stderr)
|
188 | f = cStringIO.StringIO()
|
189 | try:
|
190 | TranslateTree(re_tree, f)
|
191 | except RuntimeError:
|
192 | print('Error translating %r' % pat, file=sys.stderr)
|
193 | raise
|
194 | return f.getvalue()
|
195 |
|
196 |
|
197 | # This explains the sentinel method, which we will use.
|
198 | # http://re2c.org/examples/example_01.html
|
199 | #
|
200 | # TODO: Change ParseTuple to use 's' rather than '#s' ?
|
201 |
|
202 | # I don't think we need this YYFILL mechanism, because we lex a line at a
|
203 | # time.
|
204 | # http://re2c.org/examples/example_03.html
|
205 |
|
206 |
|
207 | def TranslateSimpleLexer(func_name, lexer_def):
|
208 | print(r"""
|
209 | static inline void %s(const unsigned char* line, int line_len,
|
210 | int start_pos, int* id, int* end_pos) {
|
211 | assert(start_pos <= line_len); /* caller should have checked */
|
212 |
|
213 | const unsigned char* p = line + start_pos; /* modified by re2c */
|
214 |
|
215 | /* Echo and History lexer apparently need this, but others don't */
|
216 | __attribute__((unused)) const unsigned char* YYMARKER;
|
217 |
|
218 | for (;;) {
|
219 | /*!re2c
|
220 | """ % func_name)
|
221 |
|
222 | for is_regex, pat, id_ in lexer_def:
|
223 | if is_regex:
|
224 | re2c_pat = TranslateRegex(pat)
|
225 | else:
|
226 | re2c_pat = TranslateConstant(pat)
|
227 | id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
|
228 | print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
|
229 |
|
230 | # EARLY RETURN: Do NOT advance past the NUL terminator.
|
231 | print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
|
232 | r'"\x00"')
|
233 |
|
234 | print("""
|
235 | */
|
236 | }
|
237 | *end_pos = p - line; /* relative */
|
238 | }
|
239 | """)
|
240 |
|
241 |
|
242 | def TranslateBracket(func_name, token_dict):
|
243 | print(r"""
|
244 | static inline int %s(const unsigned char* s, int len) {
|
245 | const unsigned char* p = s; /* modified by re2c */
|
246 | const unsigned char* end = s + len;
|
247 |
|
248 | __attribute__((unused)) const unsigned char* YYMARKER;
|
249 | int id;
|
250 |
|
251 | for (;;) {
|
252 | /*!re2c
|
253 | """ % func_name)
|
254 |
|
255 | for pat in sorted(token_dict):
|
256 | id_ = token_dict[pat]
|
257 | re2c_pat = TranslateConstant(pat)
|
258 | id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
|
259 | print(' %-30s { id = id__%s; break; }' % (re2c_pat, id_name))
|
260 |
|
261 | # EARLY RETURN: Do NOT advance past other chars, including the NUL
|
262 | # terminator.
|
263 | print(' %-30s { return id__Undefined_Tok; }' % '*')
|
264 |
|
265 | print("""
|
266 | */
|
267 | }
|
268 | // must be an exact match
|
269 | return (p == end) ? id : id__Undefined_Tok;
|
270 | }
|
271 | """)
|
272 |
|
273 |
|
274 | def StringToInt(func_name, name_def):
|
275 | print(r"""
|
276 | static inline void %s(const unsigned char* s, int len, int* id) {
|
277 | const unsigned char* p = s; /* modified by re2c */
|
278 | const unsigned char* end = s + len;
|
279 |
|
280 | //fprintf(stderr, "*** s = %%s\n", s);
|
281 |
|
282 | for (;;) {
|
283 | /*!re2c
|
284 | """ % func_name)
|
285 |
|
286 | for name, enum in name_def:
|
287 | re2c_pat = TranslateConstant(name)
|
288 | print(' %-30s { *id = %s; break; }' % (re2c_pat, enum))
|
289 |
|
290 | # Not found. * matches anything else.
|
291 | print(' %-30s { *id = 0; return; }' % \
|
292 | r'*')
|
293 |
|
294 | print(r"""
|
295 | */
|
296 | }
|
297 | if (p != end) {
|
298 | //fprintf(stderr, "EXTRA CHARS\n", s);
|
299 | *id = 0; // Not an exact match
|
300 | }
|
301 | }
|
302 | """)
|
303 |
|
304 | # TODO: Check that we're at the END OF THE STRING
|
305 |
|
306 |
|
307 | def TranslateOshLexer(lexer_def):
|
308 | # https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
|
309 | # Has to be 'static inline' rather than 'inline', otherwise the
|
310 | # _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
|
311 | # Since we reference this function in exactly one translation unit --
|
312 | # fastlex.c, the difference is moot, and we just satisfy the compiler.
|
313 |
|
314 | print(r"""
|
315 | /* Common stuff */
|
316 |
|
317 | /*!re2c
|
318 | re2c:define:YYCTYPE = "unsigned char";
|
319 | re2c:define:YYCURSOR = p;
|
320 | re2c:yyfill:enable = 0; // generated code doesn't ask for more input
|
321 | */
|
322 |
|
323 | static inline void MatchOshToken(int lex_mode, const unsigned char* line, int line_len,
|
324 | int start_pos, int* id, int* end_pos) {
|
325 | assert(start_pos <= line_len); /* caller should have checked */
|
326 |
|
327 | const unsigned char* p = line + start_pos; /* modified by re2c */
|
328 | //printf("p: %p q: %p\n", p, q);
|
329 |
|
330 | __attribute__((unused)) const unsigned char* YYMARKER; /* why do we need this? */
|
331 | switch (lex_mode) {
|
332 | """)
|
333 |
|
334 | # TODO: Should be ordered by most common? Or will profile-directed feedback
|
335 | # help?
|
336 | for state, pat_list in lexer_def.iteritems():
|
337 | # e.g. lex_mode.DQ => lex_mode__DQ
|
338 | print(' case %s:' % lex_mode_str(state).replace('.', '__'))
|
339 | print(' for (;;) {')
|
340 | print(' /*!re2c')
|
341 |
|
342 | for is_regex, pat, id_ in pat_list:
|
343 | if is_regex:
|
344 | re2c_pat = TranslateRegex(pat)
|
345 | else:
|
346 | re2c_pat = TranslateConstant(pat)
|
347 | id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
|
348 | print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
|
349 |
|
350 | # EARLY RETURN: Do NOT advance past the NUL terminator.
|
351 | print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
|
352 | r'"\x00"')
|
353 |
|
354 | print(' */')
|
355 | print(' }')
|
356 | print(' break;')
|
357 | print()
|
358 |
|
359 | # This is literal code without generation:
|
360 | """
|
361 | case lex_mode__OUTER:
|
362 | for (;;) {
|
363 | /*!re2c
|
364 | literal_chunk = [a-zA-Z0-9_/.-]+;
|
365 | var_like = [a-zA-Z_][a-zA-Z0-9_]* "="; // might be NAME=val
|
366 | comment = [ \t\r]* "#" [^\000\r\n]*;
|
367 | space = [ \t\r]+;
|
368 | nul = "\000";
|
369 |
|
370 | literal_chunk { *id = id__Lit_Chars; break; }
|
371 | var_like { *id = id__Lit_VarLike; break; }
|
372 |
|
373 | [ \t\r]* "\n" { *id = id__Op_Newline; break; }
|
374 | space { *id = id__WS_Space; break; }
|
375 |
|
376 | nul { *id = id__Eof_Real; break; }
|
377 |
|
378 | // anything else
|
379 | * { *id = id__Lit_Other; break; }
|
380 |
|
381 | */
|
382 | }
|
383 |
|
384 | *end_pos = p - line;
|
385 | break;
|
386 |
|
387 | case lex_mode__COMMENT:
|
388 | *id = id__Lit_Other;
|
389 | *end_pos = 6;
|
390 | break;
|
391 | """
|
392 |
|
393 | print("""\
|
394 | default:
|
395 | assert(0);
|
396 |
|
397 | }
|
398 | *end_pos = p - line; /* relative */
|
399 | }
|
400 | """)
|
401 |
|
402 |
|
403 | def TranslateRegexToPredicate(py_regex, func_name):
|
404 | re2c_pat = TranslateRegex(py_regex)
|
405 | print(r"""
|
406 | static inline int %s(const unsigned char* s, int len) {
|
407 | const unsigned char* p = s; /* modified by re2c */
|
408 | const unsigned char* end = s + len;
|
409 |
|
410 | /* MatchBraceRangeToken needs this, but others don't */
|
411 | __attribute__((unused)) const unsigned char* YYMARKER;
|
412 |
|
413 | /*!re2c
|
414 | re2c:define:YYCTYPE = "unsigned char";
|
415 | re2c:define:YYCURSOR = p;
|
416 | %-30s { return p == end; } // Match must be anchored right, like $
|
417 | * { return 0; }
|
418 | */
|
419 | }
|
420 | """ % (func_name, re2c_pat))
|
421 |
|
422 |
|
423 | # note: use YYCURSOR and YYLIMIT
|
424 | # limit should be the end of string
|
425 | # line + line_len
|
426 | def main(argv):
|
427 | action = argv[1]
|
428 | if action == 'c':
|
429 | # Code is printed to stdout
|
430 |
|
431 | TranslateOshLexer(lexer_def.LEXER_DEF)
|
432 |
|
433 | TranslateSimpleLexer('MatchEchoToken', lexer_def.ECHO_E_DEF)
|
434 | TranslateSimpleLexer('MatchGlobToken', lexer_def.GLOB_DEF)
|
435 | TranslateSimpleLexer('MatchPS1Token', lexer_def.PS1_DEF)
|
436 | TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
|
437 | TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
|
438 | TranslateSimpleLexer('MatchJ8Token', lexer_def.J8_DEF)
|
439 | TranslateSimpleLexer('MatchJ8LinesToken', lexer_def.J8_LINES_DEF)
|
440 | TranslateSimpleLexer('MatchJ8StrToken', lexer_def.J8_STR_DEF)
|
441 | TranslateSimpleLexer('MatchJsonStrToken', lexer_def.JSON_STR_DEF)
|
442 |
|
443 | TranslateRegexToPredicate(lexer_def.VAR_NAME_RE, 'IsValidVarName')
|
444 | TranslateRegexToPredicate(lexer_def.SHOULD_HIJACK_RE, 'ShouldHijack')
|
445 | TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_INTEGER,
|
446 | 'LooksLikeInteger')
|
447 | TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_FLOAT, 'LooksLikeFloat')
|
448 |
|
449 | TranslateBracket('BracketUnary', TEST_UNARY_LOOKUP)
|
450 | TranslateBracket('BracketBinary', TEST_BINARY_LOOKUP)
|
451 | TranslateBracket('BracketOther', TEST_OTHER_LOOKUP)
|
452 |
|
453 | elif action == 'print-all':
|
454 | # Top level is a switch statement.
|
455 | for state, pat_list in lexer_def.LEXER_DEF.iteritems():
|
456 | print(state)
|
457 | # This level is re2c patterns.
|
458 | for is_regex, pat, token_id in pat_list:
|
459 | print('\t%r -> %r' % (pat, token_id))
|
460 | if is_regex:
|
461 | #print re_tree
|
462 | _ = TranslateRegex(pat)
|
463 | #print out_pat
|
464 |
|
465 | print()
|
466 |
|
467 | elif action == 'print-regex':
|
468 | unique = set()
|
469 |
|
470 | num_regexes = 0
|
471 | for state, pat_list in lexer_def.LEXER_DEF.iteritems():
|
472 | print(state)
|
473 | # This level is re2c patterns.
|
474 | for is_regex, pat, token_id in pat_list:
|
475 | #print '\t%r -> %r' % (pat, token_id)
|
476 | if is_regex:
|
477 | print('\t' + pat)
|
478 | print('\t' + TranslateRegex(pat))
|
479 | print()
|
480 | #PrintRegex(pat)
|
481 | num_regexes += 1
|
482 | unique.add(pat)
|
483 | else:
|
484 | print('\t' + TranslateConstant(pat))
|
485 |
|
486 | print()
|
487 |
|
488 | print('Printed %d regexes (%d unique)' % (num_regexes, len(unique)))
|
489 |
|
490 |
|
491 | if __name__ == '__main__':
|
492 | try:
|
493 | main(sys.argv)
|
494 | except RuntimeError as e:
|
495 | print('FATAL: %s' % e, file=sys.stderr)
|
496 | sys.exit(1)
|