OILS / doctools / micro_syntax.re2c.h View on Github | oilshell.org

688 lines, 516 significant
1#ifndef MICRO_SYNTAX_H
2#define MICRO_SYNTAX_H
3
4#include <assert.h>
5#include <string.h> // strlen()
6
7#include <vector>
8
// Token kinds stored in Token::id by the Matcher<T>::Match() specializations.
// Kinds named "Maybe*" are provisional and are resolved in a fix-up pass.
enum class Id {
  // Common to nearly all languages
  Comm,
  MaybeComment,  // for shell, resolved in a fix-up pass

  WS,

  Name,  // Keyword or Identifier
  Str,   // "" and Python r""
         // '' and Python r''
         // ''' """
         // body of here docs

  Other,    // any other text
  Unknown,  // what a default-constructed Token holds; also the fallback in
            // several Outer modes, e.g. for an unclosed quote like "foo

  // C++
  DelimStrBegin,  // for C++ R"zzz(hello)zzz"
  DelimStrEnd,    // candidate closing )zzz" -- the caller checks the delimiter
  Re2c,           // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // Shell
  HereBegin,
  HereEnd,

  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  // StartLine,

  // These are special zero-width tokens for Python
  // Indent,
  // Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
53
54struct Token {
55 Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
56 }
57 Token(Id id, int end_col)
58 : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
59 }
60
61 Id id;
62 int end_col; // offset from char* line
63 int submatch_start; // ditto
64 int submatch_end; // ditto
65};
66
67// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
68
// Scanning state for one line at a time.  T is a language's mode enum and
// must provide an `Outer` enumerator.
template <typename T>
class Lexer {
 public:
  // Starts in T::Outer, positioned at the first character of `line`.
  Lexer(char* line) : line_{line}, p_current{line}, line_mode{T::Outer} {
  }

  // Points the lexer at a new line.  line_mode is not reset here, so a mode
  // entered on one line is still in effect on the next.
  void SetLine(char* line) {
    p_current = line_ = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
84
// Tokenizer for one language.  Match() is only declared here; this file
// defines it through explicit specializations, one per mode enum
// (text_mode_e, asdl_mode_e, py_mode_e, ...).
template <typename T>
class Matcher {
 public:
  // Returns whether EOL was hit.  Mutates lexer state, and fills in tok out
  // param.
  //
  // When the NUL terminator is reached the specializations return true
  // without writing to tok; otherwise they set tok->id and tok->end_col,
  // advance lexer->p_current past the token, and return false.
  bool Match(Lexer<T>* lexer, Token* tok);
};
92
// Macros for semantic actions
//
// They are expanded inside re2c rule actions, which live in a
// `while (true) { ... }` loop within each Matcher<T>::Match().  They expect
// `tok` (Token*) and `lexer` (Lexer<T>*) to be in scope.
//
// The trailing `break` leaves that enclosing loop after one token, so these
// macros must NOT be wrapped in do { } while (0): the break would then only
// exit the wrapper.
//
// Macro arguments are parenthesized so that any expression can be passed.

// Set the token kind and finish the match.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Set the token kind, switch the lexer to mode `m`, and finish the match.
#define TOK_MODE(k, m)     \
  tok->id = (k);           \
  lexer->line_mode = (m);  \
  break;

// Record a submatch as column offsets from the start of the line; `s` and `e`
// are pointers into the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                        \
  tok->submatch_start = static_cast<int>((s) - lexer->line_); \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
108
109// Regex definitions shared between languages
110
/*!re2c
  // No buffer refilling: every rule set below has a `nul` rule that stops at
  // the NUL terminator.
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  // Each Match() function declares a local `p` that serves as the cursor.
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  // Note: because of the '*', this definition also matches the empty string.
  whitespace = [ \t\r\n]*;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // Body of a quoted string: anything except NUL, the quote and backslash,
  // or a backslash followed by any non-NUL char.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
141
// Plain text has a single mode.
enum class text_mode_e {
  Outer,  // default
};

// Matcher for plain text: a line is split only into WS and Other tokens.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col, advances lexer->p_current, and returns false.
template <>
bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  // Not a real loop: every action either returns or, through TOK(), breaks
  // out after one match.
  while (true) {
    /*!re2c
      nul                    { return true; }

      // whitespace at start of line
      whitespace             { TOK(Id::WS); }

      // This rule consumes trailing whitespace, but
      // it's OK.  We're counting significant lines, not
      // highlighting.
      [^\x00]+               { TOK(Id::Other); }

      *                      { TOK(Id::Other); }

    */
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
172
// ASDL has a single mode.
enum class asdl_mode_e {
  Outer,
};

// Matcher for ASDL: whitespace, identifiers, # comments, and other text.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col, advances lexer->p_current, and returns false.
template <>
bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  switch (lexer->line_mode) {
  case asdl_mode_e::Outer:
    // Not a real loop: every action either returns or, through TOK(), breaks
    // out after one match.
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        identifier             { TOK(Id::Name); }

        pound_comment          { TOK(Id::Comm); }

        // Not the start of a comment, identifier
        [^\x00#_a-zA-Z]+       { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
209
// Python modes.  MultiSQ and MultiDQ are entered at an opening triple quote
// and left at the matching closing triple quote.
enum class py_mode_e {
  Outer,    // default
  MultiSQ,  // inside '''
  MultiDQ,  // inside """
};

// Matcher for Python.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col, possibly changes lexer->line_mode, advances
// lexer->p_current, and returns false.
template <>
bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  // In each case the `while (true)` is not a real loop: every action either
  // returns or, through TOK()/TOK_MODE(), breaks out after one match.
  switch (lexer->line_mode) {
  case py_mode_e::Outer:
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        identifier             { TOK(Id::Name); }

        [r]? sq_string         { TOK(Id::Str); }
        [r]? dq_string         { TOK(Id::Str); }

        // optional raw prefix
        [r]? triple_sq         { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
        [r]? triple_dq         { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }

        pound_comment          { TOK(Id::Comm); }

        // Not the start of a string, comment, identifier
        [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case py_mode_e::MultiSQ:
    // Everything is Str until the closing '''
    while (true) {
      /*!re2c
        nul                    { return true; }

        triple_sq              { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00']*              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;

  case py_mode_e::MultiDQ:
    // Everything is Str until the closing """
    while (true) {
      /*!re2c
        nul                    { return true; }

        triple_dq              { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00"]*              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
286
// C++ modes.
enum class cpp_mode_e {
  Outer,     // default
  Comm,      // inside /* */ comment
  DelimStr,  // R"zz(string literal)zz"
  Re2c,      // /* !re2c
};

// Matcher for C++.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col (and the submatch columns for DelimStrBegin and
// DelimStrEnd), possibly changes lexer->line_mode, advances
// lexer->p_current, and returns false.
template <>
bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c
  const char *s, *e;                 // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  // In each case the `while (true)` is not a real loop: every action either
  // returns or, through TOK()/TOK_MODE(), breaks out after one match.
  switch (lexer->line_mode) {
  case cpp_mode_e::Outer:

    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        "{"                    { TOK(Id::LBrace); }
        "}"                    { TOK(Id::RBrace); }

        identifier             { TOK(Id::Name); }

        // approximation for C++ char literals
        sq_string              { TOK(Id::Str); }
        dq_string              { TOK(Id::Str); }

        // Not the start of a string, comment, identifier
        [^\x00"'/_a-zA-Z{}]+   { TOK(Id::Other); }

        "//" not_nul*          { TOK(Id::Comm); }

        // Treat re2c as preprocessor block
        "/" "*!re2c"           { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }

        "/" "*"                { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }

        // Not sure what the rules are for R"zz(hello)zz".  Make it similar to
        // here docs.
        cpp_delim_str = [_a-zA-Z]*;

        "R" ["] @s cpp_delim_str @e "(" {
          SUBMATCH(s, e);
          TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
        }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case cpp_mode_e::Comm:
    // Search until next */
    while (true) {
      /*!re2c
        nul                    { return true; }

        "*" "/"                { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }

        [^\x00*]*              { TOK(Id::Comm); }

        *                      { TOK(Id::Comm); }

      */
    }
    break;

  case cpp_mode_e::Re2c:
    // Search until next */
    while (true) {
      /*!re2c
        nul                    { return true; }

        "*" "/"                { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }

        [^\x00*]*              { TOK(Id::Re2c); }

        *                      { TOK(Id::Re2c); }

      */
    }
    break;

  case cpp_mode_e::DelimStr:
    // Search until the next candidate terminator: ) delimiter "
    // The mode is NOT changed here; see the comment in the rule below.
    while (true) {
      /*!re2c
        nul                    { return true; }

        ")" @s cpp_delim_str @e ["] {
          SUBMATCH(s, e);
          TOK(Id::DelimStrEnd);

          // Caller is responsible for checking the extracted delimiter, and
          // setting mode back to Cpp::Outer!
        }

        [^\x00)]*              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
407
408class Hook {
409 public:
410 // Return true if this is a preprocessor line, and fill in tokens
411 // Caller should check last token for whether there is a continuation line.
412 virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
413 ;
414 }
415 virtual ~Hook() {
416 }
417};
418
// The preprocessor-line lexer has a single mode.
enum class pp_mode_e {
  Outer,
};

// Matcher for C preprocessor lines.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col, advances lexer->p_current, and returns false.
template <>
bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case pp_mode_e::Outer:
    // Not a real loop: every action either returns or, through TOK(), breaks
    // out after one match.
    while (true) {
      /*!re2c
        nul                    { return true; }

        // Resolved in fix-up pass
        // #include #define etc. only valid at the
        // beginning
        [ \t]* "#" [a-z]+      { TOK(Id::MaybePreproc); }

        // C-style comments can end these lines
        "//" not_nul*          { TOK(Id::Comm); }

        [\\] [\n]              { TOK(Id::LineCont); }

        // A line could be all whitespace, then \ at the
        // end.  And it's not significant
        whitespace             { TOK(Id::WS); }

        // Not the start of a command, comment, or line
        // continuation
        [^\x00#/\\]+           { TOK(Id::PreprocOther); }

        *                      { TOK(Id::PreprocOther); }

      */
    }
    break;
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
464
465class CppHook : public Hook {
466 public:
467 virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
468};
469
// R modes.  SQ and DQ are entered at an opening quote and left at the next
// unescaped closing quote.
enum class R_mode_e {
  Outer,  // default

  SQ,  // inside multi-line ''
  DQ,  // inside multi-line ""
};

// Matcher for R.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col, possibly changes lexer->line_mode, advances
// lexer->p_current, and returns false.
template <>
bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  // In each case the `while (true)` is not a real loop: every action either
  // returns or, through TOK()/TOK_MODE(), breaks out after one match.
  switch (lexer->line_mode) {
  case R_mode_e::Outer:
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        pound_comment          { TOK(Id::Comm); }

        identifier             { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier
        [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }

        // An opening quote switches mode; the string body is lexed there.
        [']                    { TOK_MODE(Id::Str, R_mode_e::SQ); }
        ["]                    { TOK_MODE(Id::Str, R_mode_e::DQ); }

        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case R_mode_e::SQ:
    // Search until the next ' that's not escaped with a backslash
    while (true) {
      /*!re2c
        nul                    { return true; }

        [']                    { TOK_MODE(Id::Str, R_mode_e::Outer); }

        sq_middle              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;

  case R_mode_e::DQ:
    // Search until the next " that's not escaped with a backslash
    while (true) {
      /*!re2c
        nul                    { return true; }

        ["]                    { TOK_MODE(Id::Str, R_mode_e::Outer); }

        dq_middle              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
542
543// Problem with shell: nested double quotes!!!
544// We probably discourage this in YSH
545
// Shell modes.  Only Outer, SQ, DQ and DollarSQ are handled by Match() below;
// the Ysh* modes hit assert(0).
enum class sh_mode_e {
  Outer,  // default

  SQ,        // inside multi-line ''
  DollarSQ,  // inside multi-line $''
  DQ,        // inside multi-line ""

  // We could have a separate thing for this
  YshSQ,  // inside '''
  YshDQ,  // inside """
  YshJ,   // inside j"""
};

// Matcher for shell.
//
// Returns whether EOL was hit
//
// On NUL this returns true and leaves tok untouched.  Otherwise it sets
// tok->id and tok->end_col (and the submatch columns for HereBegin), possibly
// changes lexer->line_mode, advances lexer->p_current, and returns false.

// Submatch docs:
// https://re2c.org/manual/manual_c.html#submatch-extraction

template <>
bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c
  const char *s, *e;                 // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  // In each case the `while (true)` is not a real loop: every action either
  // returns or, through TOK()/TOK_MODE(), breaks out after one match.
  switch (lexer->line_mode) {
  case sh_mode_e::Outer:
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        // Resolved in fix-up pass
        pound_comment          { TOK(Id::MaybeComment); }

        // not that relevant for shell
        identifier             { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier, here doc
        [^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }

        // A backslash followed by one character is treated like a string
        "\\" .                 { TOK(Id::Str); }

        [']                    { TOK_MODE(Id::Str, sh_mode_e::SQ); }
        ["]                    { TOK_MODE(Id::Str, sh_mode_e::DQ); }
        "$'"                   { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }

        // <<- is another syntax
        here_op = "<<" [-]? [ \t]*;
        h_delim = [_a-zA-Z][_a-zA-Z0-9]*;

        // unquoted or quoted
        // In every form the submatch covers only the delimiter name.
        here_op          @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op [']      @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op ["]      @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op "\\"     @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }

        // NOT Unknown, as in Python
        *                      { TOK(Id::Other); }

      */
    }
    break;

  case sh_mode_e::SQ:
    // Search until next ' unconditionally
    while (true) {
      /*!re2c
        nul                    { return true; }

        [']                    { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        [^\x00']*              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;

  case sh_mode_e::DQ:
    // Search until next " that's not preceded by a backslash
    while (true) {
      /*!re2c
        nul                    { return true; }

        ["]                    { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        dq_middle              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;

  case sh_mode_e::DollarSQ:
    // Search until next ' that's not preceded by a backslash
    while (true) {
      /*!re2c
        nul                    { return true; }

        [']                    { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        sq_middle              { TOK(Id::Str); }

        *                      { TOK(Id::Str); }

      */
    }
    break;
  case sh_mode_e::YshSQ:
  case sh_mode_e::YshDQ:
  case sh_mode_e::YshJ:
    // No rule in this function switches into these modes.
    assert(0);
  }

  // p is now one past the matched token.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
671
672// TODO:
673// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
674// - same as C++ raw string, I think
675// - similar to here docs, but less complex
676//
677// Inherent problems with "micro segmentation":
678//
679// - Nested double quotes in shell. echo "hi ${name:-"default"}"
680// - This means that lexing is **dependent on** parsing: does the second
681// double quote **close** the first one, or does it start a nested string?
682// - lexing is non-recursive, parsing is recursive
683
// Shell comments depend on operator chars
//   echo one # comment
//   echo $(( 16#ff ))    <- here '#' is part of a number, not a comment
687
688#endif // MICRO_SYNTAX_H