doctools/micro_syntax.re2c.h

OILS / doctools / micro_syntax.re2c.h View on Github | oilshell.org

688 lines, 516 significant

1	#ifndef MICRO_SYNTAX_H
2	#define MICRO_SYNTAX_H
3
4	#include <assert.h>
5	#include <string.h> // strlen()
6
7	#include <vector>
8
// Token categories emitted by the micro-syntax lexers.
//
// Enumerators are grouped by the language that produces them.  Do not reorder
// them casually: the position of each enumerator fixes its underlying value.
enum class Id {
  // ---- Common to nearly all languages ----
  Comm,          // comment
  MaybeComment,  // shell only; resolved in a fix-up pass

  WS,  // whitespace

  Name,  // keyword or identifier
  Str,   // string text, which covers:
         //   ""  and Python r""
         //   ''  and Python r''
         //   ''' and """
         //   the body of here docs

  Other,    // any other text
  Unknown,  // what a default-constructed Token carries; also produced by
            // fallback lexer rules (e.g. an unclosed quote like "foo)

  // ---- C++ ----
  DelimStrBegin,  // opening of a raw string, e.g. R"zzz(hello)zzz"
  DelimStrEnd,    // closing of a raw string
  Re2c,           // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in a second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // ---- Shell ----
  HereBegin,  // start of a here doc
  HereEnd,    // end of a here doc

  // Ideas that are not implemented (kept as notes):
  //
  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  //   StartLine,
  //
  // Special zero-width tokens for Python
  //   Indent,
  //   Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
53
54	struct Token {
55	Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
56	}
57	Token(Id id, int end_col)
58	: id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
59	}
60
61	Id id;
62	int end_col; // offset from char* line
63	int submatch_start; // ditto
64	int submatch_end; // ditto
65	};
66
67	// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
68
// Cursor state for lexing one line at a time.
//
// T is a mode enum (py_mode_e, cpp_mode_e, ...) and must have an `Outer`
// enumerator, which is the mode a new Lexer starts in.  The Lexer only keeps
// pointers into the line it is given; it does not copy it.
template <typename T>
class Lexer {
 public:
  // Begins at the start of `line` in mode T::Outer.
  //
  // The parameter is const char* because the line is only ever stored in the
  // const char* members below; callers holding a mutable char* still work.
  Lexer(const char* line) : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Switches to a new line and rewinds the cursor to its first character.
  // line_mode is not touched, so a mode entered on one line (e.g. inside a
  // multi-line string) carries over to the next.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
84
85	template <typename T>
86	class Matcher {
87	public:
88	// Returns whether EOL was hit. Mutates lexer state, and fills in tok out
89	// param.
90	bool Match(Lexer<T>* lexer, Token* tok);
91	};
92
93	// Macros for semantic actions
94
// These macros are used inside re2c semantic actions.  They expect variables
// named `tok` (Token*) and `lexer` (Lexer<T>*) to be in scope, and TOK*() must
// be expanded inside a loop: the trailing `break` is how an action leaves the
// enclosing `while (true)`.  For that reason they cannot be wrapped in
// do { } while (0).
//
// Arguments are parenthesized so that compound expressions expand safely.

// Set the token kind and leave the matching loop.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Set the token kind, switch the lexer to mode m, and leave the matching loop.
#define TOK_MODE(k, m)     \
  tok->id = (k);           \
  lexer->line_mode = (m);  \
  break;

// Record the range [s, e) as offsets from the start of the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                        \
  tok->submatch_start = static_cast<int>((s) - lexer->line_); \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
108
109	// Regex definitions shared between languages
110
/*!re2c
  // Global re2c configuration: input is a NUL-terminated buffer (no YYFILL),
  // and the cursor variable in every Match() function is called `p`.
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  whitespace = [ \t\r\n]*;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // String body: any char other than NUL, the quote, or a backslash; or a
  // backslash followed by any non-NUL char.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
141
142	enum class text_mode_e {
143	Outer, // default
144	};
145
146	// Returns whether EOL was hit
147	template <>
148	bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
149	const char* p = lexer->p_current; // mutated by re2c
150
151	while (true) {
152	/*!re2c
153	nul { return true; }
154
155	// whitespace at start of line
156	whitespace { TOK(Id::WS); }
157
158	// This rule consumes trailing whitespace, but
159	// it's OK. We're counting significant lines, not
160	// highlighting.
161	[^\x00]+ { TOK(Id::Other); }
162
163	* { TOK(Id::Other); }
164
165	*/
166	}
167
168	tok->end_col = p - lexer->line_;
169	lexer->p_current = p;
170	return false;
171	}
172
173	enum class asdl_mode_e {
174	Outer,
175	};
176
177	// Returns whether EOL was hit
178	template <>
179	bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
180	const char* p = lexer->p_current; // mutated by re2c
181
182	switch (lexer->line_mode) {
183	case asdl_mode_e::Outer:
184	while (true) {
185	/*!re2c
186	nul { return true; }
187
188	whitespace { TOK(Id::WS); }
189
190	identifier { TOK(Id::Name); }
191
192	pound_comment { TOK(Id::Comm); }
193
194	// Not the start of a comment, identifier
195	[^\x00#_a-zA-Z]+ { TOK(Id::Other); }
196
197	// e.g. unclosed quote like "foo
198	* { TOK(Id::Unknown); }
199
200	*/
201	}
202	break;
203	}
204
205	tok->end_col = p - lexer->line_;
206	lexer->p_current = p;
207	return false;
208	}
209
210	enum class py_mode_e {
211	Outer, // default
212	MultiSQ, // inside '''
213	MultiDQ, // inside """
214	};
215
216	// Returns whether EOL was hit
217	template <>
218	bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
219	const char* p = lexer->p_current; // mutated by re2c
220	const char* YYMARKER = p;
221
222	switch (lexer->line_mode) {
223	case py_mode_e::Outer:
224	while (true) {
225	/*!re2c
226	nul { return true; }
227
228	whitespace { TOK(Id::WS); }
229
230	identifier { TOK(Id::Name); }
231
232	[r]? sq_string { TOK(Id::Str); }
233	[r]? dq_string { TOK(Id::Str); }
234
235	// optional raw prefix
236	[r]? triple_sq { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
237	[r]? triple_dq { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }
238
239	pound_comment { TOK(Id::Comm); }
240
241	// Not the start of a string, comment, identifier
242	[^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
243
244	// e.g. unclosed quote like "foo
245	* { TOK(Id::Unknown); }
246
247	*/
248	}
249	break;
250
251	case py_mode_e::MultiSQ:
252	while (true) {
253	/*!re2c
254	nul { return true; }
255
256	triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }
257
258	[^\x00']* { TOK(Id::Str); }
259
260	* { TOK(Id::Str); }
261
262	*/
263	}
264	break;
265
266	case py_mode_e::MultiDQ:
267	while (true) {
268	/*!re2c
269	nul { return true; }
270
271	triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }
272
273	[^\x00"]* { TOK(Id::Str); }
274
275	* { TOK(Id::Str); }
276
277	*/
278	}
279	break;
280	}
281
282	tok->end_col = p - lexer->line_;
283	lexer->p_current = p;
284	return false;
285	}
286
287	enum class cpp_mode_e {
288	Outer, // default
289	Comm, // inside /* */ comment
290	DelimStr, // R"zz(string literal)zz"
291	Re2c, // /* !re2c
292	};
293
294	// Returns whether EOL was hit
295	template <>
296	bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
297	const char* p = lexer->p_current; // mutated by re2c
298	const char* YYMARKER = p;
299	const char s, e; // submatch extraction
300
301	// Autogenerated tag variables used by the lexer to track tag values.
302	/!stags:re2c format = 'const char @@;\n'; */
303
304	switch (lexer->line_mode) {
305	case cpp_mode_e::Outer:
306
307	while (true) {
308	/*!re2c
309	nul { return true; }
310
311	whitespace { TOK(Id::WS); }
312
313	"{" { TOK(Id::LBrace); }
314	"}" { TOK(Id::RBrace); }
315
316	identifier { TOK(Id::Name); }
317
318	// approximation for C++ char literals
319	sq_string { TOK(Id::Str); }
320	dq_string { TOK(Id::Str); }
321
322	// Not the start of a string, comment, identifier
323	[^\x00"'/_a-zA-Z{}]+ { TOK(Id::Other); }
324
325	"//" not_nul* { TOK(Id::Comm); }
326
327	// Treat re2c as preprocessor block
328	"/" "*!re2c" { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }
329
330	"/" "*" { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }
331
332	// Not sure what the rules are for R"zz(hello)zz". Make it similar to
333	// here docs.
334	cpp_delim_str = [_a-zA-Z]*;
335
336	"R" ["] @s cpp_delim_str @e "(" {
337	SUBMATCH(s, e);
338	TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
339	}
340
341	// e.g. unclosed quote like "foo
342	* { TOK(Id::Unknown); }
343
344	*/
345	}
346	break;
347
348	case cpp_mode_e::Comm:
349	// Search until next */
350	while (true) {
351	/*!re2c
352	nul { return true; }
353
354	"*" "/" { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }
355
356	[^\x00] { TOK(Id::Comm); }
357
358	* { TOK(Id::Comm); }
359
360	*/
361	}
362	break;
363
364	case cpp_mode_e::Re2c:
365	// Search until next */
366	while (true) {
367	/*!re2c
368	nul { return true; }
369
370	"*" "/" { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }
371
372	[^\x00] { TOK(Id::Re2c); }
373
374	* { TOK(Id::Re2c); }
375
376	*/
377	}
378	break;
379
380	case cpp_mode_e::DelimStr:
381	// Search until next */
382	while (true) {
383	/*!re2c
384	nul { return true; }
385
386	")" @s cpp_delim_str @e ["] {
387	SUBMATCH(s, e);
388	TOK(Id::DelimStrEnd);
389
390	// Caller is responsible for checking the extracted delimiter, and
391	// setting mode back to Cpp::Outer!
392	}
393
394	[^\x00)]* { TOK(Id::Str); }
395
396	* { TOK(Id::Str); }
397
398	*/
399	}
400	break;
401	}
402
403	tok->end_col = p - lexer->line_;
404	lexer->p_current = p;
405	return false;
406	}
407
408	class Hook {
409	public:
410	// Return true if this is a preprocessor line, and fill in tokens
411	// Caller should check last token for whether there is a continuation line.
412	virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
413	;
414	}
415	virtual ~Hook() {
416	}
417	};
418
419	enum class pp_mode_e {
420	Outer,
421	};
422
423	// Returns whether EOL was hit
424	template <>
425	bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
426	const char* p = lexer->p_current; // mutated by re2c
427	const char* YYMARKER = p;
428
429	switch (lexer->line_mode) {
430	case pp_mode_e::Outer:
431	while (true) {
432	/*!re2c
433	nul { return true; }
434
435	// Resolved in fix-up pass
436	// #include #define etc. only valid at the
437	// beginning
438	[ \t]* "#" [a-z]+ { TOK(Id::MaybePreproc); }
439
440	// C-style comments can end these lines
441	"//" not_nul* { TOK(Id::Comm); }
442
443	[\\] [\n] { TOK(Id::LineCont); }
444
445	// A line could be all whitespace, then \ at the
446	// end. And it's not significant
447	whitespace { TOK(Id::WS); }
448
449	// Not the start of a command, comment, or line
450	// continuation
451	[^\x00#/\\]+ { TOK(Id::PreprocOther); }
452
453	* { TOK(Id::PreprocOther); }
454
455	*/
456	}
457	break;
458	}
459
460	tok->end_col = p - lexer->line_;
461	lexer->p_current = p;
462	return false;
463	}
464
465	class CppHook : public Hook {
466	public:
467	virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
468	};
469
470	enum class R_mode_e {
471	Outer, // default
472
473	SQ, // inside multi-line ''
474	DQ, // inside multi-line ""
475	};
476
477	// Returns whether EOL was hit
478	template <>
479	bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
480	const char* p = lexer->p_current; // mutated by re2c
481	const char* YYMARKER = p;
482
483	switch (lexer->line_mode) {
484	case R_mode_e::Outer:
485	while (true) {
486	/*!re2c
487	nul { return true; }
488
489	whitespace { TOK(Id::WS); }
490
491	pound_comment { TOK(Id::Comm); }
492
493	identifier { TOK(Id::Name); }
494
495	// Not the start of a string, escaped, comment, identifier
496	[^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
497
498	['] { TOK_MODE(Id::Str, R_mode_e::SQ); }
499	["] { TOK_MODE(Id::Str, R_mode_e::DQ); }
500
501	* { TOK(Id::Unknown); }
502
503	*/
504	}
505	break;
506
507	case R_mode_e::SQ:
508	while (true) {
509	/*!re2c
510	nul { return true; }
511
512	['] { TOK_MODE(Id::Str, R_mode_e::Outer); }
513
514	sq_middle { TOK(Id::Str); }
515
516	* { TOK(Id::Str); }
517
518	*/
519	}
520	break;
521
522	case R_mode_e::DQ:
523	while (true) {
524	/*!re2c
525	nul { return true; }
526
527	["] { TOK_MODE(Id::Str, R_mode_e::Outer); }
528
529	dq_middle { TOK(Id::Str); }
530
531	* { TOK(Id::Str); }
532
533	*/
534	}
535	break;
536	}
537
538	tok->end_col = p - lexer->line_;
539	lexer->p_current = p;
540	return false;
541	}
542
543	// Problem with shell: nested double quotes!!!
544	// We probably discourage this in YSH
545
546	enum class sh_mode_e {
547	Outer, // default
548
549	SQ, // inside multi-line ''
550	DollarSQ, // inside multi-line $''
551	DQ, // inside multi-line ""
552
553	// We could have a separate thing for this
554	YshSQ, // inside '''
555	YshDQ, // inside """
556	YshJ, // inside j"""
557	};
558
559	// Returns whether EOL was hit
560
561	// Submatch docs:
562	// https://re2c.org/manual/manual_c.html#submatch-extraction
563
564	template <>
565	bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
566	const char* p = lexer->p_current; // mutated by re2c
567	const char* YYMARKER = p;
568	const char s, e; // submatch extraction
569
570	// Autogenerated tag variables used by the lexer to track tag values.
571	/!stags:re2c format = 'const char @@;\n'; */
572
573	switch (lexer->line_mode) {
574	case sh_mode_e::Outer:
575	while (true) {
576	/*!re2c
577	nul { return true; }
578
579	whitespace { TOK(Id::WS); }
580
581	// Resolved in fix-up pass
582	pound_comment { TOK(Id::MaybeComment); }
583
584	// not that relevant for shell
585	identifier { TOK(Id::Name); }
586
587	// Not the start of a string, escaped, comment, identifier, here doc
588	[^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }
589
590	// echo is like a string
591	"\\" . { TOK(Id::Str); }
592
593	['] { TOK_MODE(Id::Str, sh_mode_e::SQ); }
594	["] { TOK_MODE(Id::Str, sh_mode_e::DQ); }
595	"$'" { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
596
597	// <<- is another syntax
598	here_op = "<<" [-]? [ \t]*;
599	h_delim = [_a-zA-Z][_a-zA-Z0-9]*;
600
601	// unquoted or quoted
602	here_op @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
603	here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
604	here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
605	here_op "\\" @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
606
607	// NOT Unknown, as in Python
608	* { TOK(Id::Other); }
609
610	*/
611	}
612	break;
613
614	case sh_mode_e::SQ:
615	// Search until next ' unconditionally
616	while (true) {
617	/*!re2c
618	nul { return true; }
619
620	['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
621
622	[^\x00']* { TOK(Id::Str); }
623
624	* { TOK(Id::Str); }
625
626	*/
627	}
628	break;
629
630	case sh_mode_e::DQ:
631	// Search until next " that's not preceded by "
632	while (true) {
633	/*!re2c
634	nul { return true; }
635
636	["] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
637
638	dq_middle { TOK(Id::Str); }
639
640	* { TOK(Id::Str); }
641
642	*/
643	}
644	break;
645
646	case sh_mode_e::DollarSQ:
647	// Search until next ' that's not preceded by "
648	while (true) {
649	/*!re2c
650	nul { return true; }
651
652	['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
653
654	sq_middle { TOK(Id::Str); }
655
656	* { TOK(Id::Str); }
657
658	*/
659	}
660	break;
661	case sh_mode_e::YshSQ:
662	case sh_mode_e::YshDQ:
663	case sh_mode_e::YshJ:
664	assert(0);
665	}
666
667	tok->end_col = p - lexer->line_;
668	lexer->p_current = p;
669	return false;
670	}
671
672	// TODO:
673	// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
674	// - same as C++ raw string, I think
675	// - similar to here docs, but less complex
676	//
677	// Inherent problems with "micro segmentation":
678	//
679	// - Nested double quotes in shell. echo "hi ${name:-"default"}"
680	// - This means that lexing is dependent on parsing: does the second
681	// double quote close the first one, or does it start a nested string?
682	// - lexing is non-recursive, parsing is recursive
683
684	// Shell Comments depend on operator chars
685	// echo one # comment
686	// echo $(( 16#ff ))'
687
688	#endif // MICRO_SYNTAX_H