doctools/micro_syntax.cc

OILS / doctools / micro_syntax.cc View on Github | oilshell.org

1122 lines, 713 significant

1	// Micro Syntax
2	//
3	// See doctools/micro-syntax.md
4
5	#include "micro_syntax.h" // requires -I $BASE_DIR
6
7	#include <assert.h>
8	#include <errno.h>
9	#include <getopt.h>
10	#include <stdarg.h> // va_list, etc.
11	#include <stdbool.h>
12	#include <stdio.h>
13	#include <stdlib.h> // free
14	#include <string.h>
15
16	#include <string>
17	#include <vector>
18
// ANSI terminal escape sequences.

// Text attributes
const char* RESET = "\x1b[0;0m";
const char* BOLD = "\x1b[1m";
const char* UNDERLINE = "\x1b[4m";
const char* REVERSE = "\x1b[7m";  // reverse video

// Standard foreground colors (codes 30-37)
const char* BLACK = "\x1b[30m";
const char* RED = "\x1b[31m";
const char* GREEN = "\x1b[32m";
const char* YELLOW = "\x1b[33m";
const char* BLUE = "\x1b[34m";
const char* PURPLE = "\x1b[35m";
const char* CYAN = "\x1b[36m";
const char* WHITE = "\x1b[37m";

// Bright foreground colors (codes 90-97)
const char* BLACK2 = "\x1b[90m";
const char* RED2 = "\x1b[91m";
const char* BLUE2 = "\x1b[94m";
36
// Writes a printf-style formatted message to stderr, followed by a newline.
void Log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);

  fputc('\n', stderr);
}
44
// Languages the scanner can be asked to recognize.
enum class lang_e {
  PlainText,  // the default

  Cpp,    // including C
  Py,     //
  Shell,  //
  Ysh,    // ''' etc.
  Asdl,   //
  R,      // uses # comments

  // JS,  // uses // comments
};
57
// Reads a stream one line at a time with getline().
//
// Lines are exposed as NUL-terminated C strings, so this interface cannot
// represent a line with an internal NUL byte.
//
// The Reader does not close the FILE*; it only owns the line buffer.
class Reader {
 public:
  // f:        stream to read from
  // filename: for error messages only, nullptr for stdin
  Reader(FILE* f, const char* filename)
      : f_(f),
        filename_(filename),
        line_(nullptr),
        allocated_size_(0),
        err_num_(0) {
  }

  // The line buffer is owned by this object, so copying would double-free it.
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;

  ~Reader() {
    free(line_);  // free(nullptr) is a no-op
  }

  const char* Filename() {  // for error messages only, nullptr for stdin
    return filename_;
  }

  // Advances to the next line.
  //
  // Returns true on success AND on clean end-of-file; the caller tells the two
  // apart with Current(), which is nullptr at EOF.
  // Returns false on error, and sets err_num_ to a non-zero errno value.
  bool NextLine() {
    // getline() doesn't clear errno, so a value left behind by an unrelated
    // earlier failure must not be mistaken for a read error.
    errno = 0;

    // getline() reuses (and grows) the buffer in line_, so the previous line
    // doesn't have to be freed here.
    ssize_t len = getline(&line_, &allocated_size_, f_);
    if (len >= 0) {
      return true;
    }

    // -1 means EOF or error.  Save errno before free() can disturb it.
    const int saved_errno = errno;

    // man page says the buffer should be freed if getline() fails
    free(line_);
    line_ = nullptr;  // tell the caller not to continue
    allocated_size_ = 0;

    // Clean EOF sets the end-of-file indicator and not the error indicator.
    // Anything else is a read error or an allocation failure.
    if (ferror(f_) || !feof(f_)) {
      err_num_ = (saved_errno != 0) ? saved_errno : EIO;
      return false;
    }
    return true;
  }

  // Returns the current line, or nullptr on EOF or error.
  char* Current() {
    return line_;
  }

  FILE* f_;
  const char* filename_;

  char* line_;  // valid until the next NextLine() call, nullptr on EOF or error
  size_t allocated_size_;  // buffer capacity, maintained by getline()
  int err_num_;            // errno value, set when NextLine() returns false
};
103
104	class Printer {
105	public:
106	virtual void PrintLineNumber(int line_num) = 0;
107	virtual void PrintLineEnd() {
108	}
109	virtual void PrintToken(const char* line, int line_num, int start_col,
110	Token token) = 0;
111	virtual void Swap(std::string* s) {
112	assert(0);
113	}
114	virtual ~Printer() {
115	}
116	};
117
118	class HtmlPrinter : public Printer {
119	public:
120	HtmlPrinter() : Printer(), out_() {
121	}
122
123	virtual void Swap(std::string* s) {
124	// assert(s != nullptr);
125	out_.swap(*s);
126	}
127
128	virtual void PrintLineNumber(int line_num) {
129	char buf[16];
130	snprintf(buf, 16, "%d", line_num);
131
132	out_.append("<tr><td class=num>"); // <tr> closed by PrintLineEnd()
133	out_.append(buf);
134	out_.append("</td><td id=L"); // jump to line with foo.html#L32
135	out_.append(buf);
136	out_.append(" class=line>"); // <td> closed by PrintLineEnd()
137	}
138
139	virtual void PrintLineEnd() {
140	out_.append("</td></tr>");
141	}
142
143	virtual void PrintToken(const char* line, int line_num, int start_col,
144	Token tok) {
145	const char* p_start = line + start_col;
146	int num_bytes = tok.end_col - start_col;
147
148	switch (tok.id) {
149	case Id::Comm:
150	PrintSpan("comm", p_start, num_bytes);
151	break;
152
153	case Id::Name:
154	PrintEscaped(p_start, num_bytes);
155	break;
156
157	case Id::PreprocCommand:
158	case Id::LineCont:
159	PrintSpan("preproc", p_start, num_bytes);
160	break;
161
162	case Id::Re2c:
163	PrintSpan("re2c", p_start, num_bytes);
164	break;
165
166	case Id::Other:
167	// PrintSpan("other", p_start, num_bytes);
168	PrintEscaped(p_start, num_bytes);
169	break;
170
171	// for now these are strings
172	case Id::HereBegin:
173	case Id::HereEnd:
174	case Id::Str:
175	PrintSpan("str", p_start, num_bytes);
176	break;
177
178	case Id::LBrace:
179	case Id::RBrace:
180	PrintSpan("brace", p_start, num_bytes);
181	break;
182
183	case Id::Unknown:
184	PrintSpan("x", p_start, num_bytes);
185	break;
186
187	default:
188	PrintEscaped(p_start, num_bytes);
189	break;
190	}
191	}
192
193	private:
194	void PrintEscaped(const char* s, int len) {
195	// HTML escape the code string
196	for (int i = 0; i < len; ++i) {
197	char c = s[i];
198
199	switch (c) {
200	case '<':
201	out_.append("<");
202	break;
203	case '>':
204	out_.append(">");
205	break;
206	case '&':
207	out_.append("&");
208	break;
209	default:
210	// Is this inefficient? Fill 1 char
211	out_.append(1, s[i]);
212	break;
213	}
214	}
215	}
216
217	void PrintSpan(const char* css_class, const char* s, int len) {
218	out_.append("<span class=");
219	out_.append(css_class);
220	out_.append(">");
221
222	PrintEscaped(s, len);
223
224	out_.append("</span>");
225	}
226
227	std::string out_;
228	};
229
230	struct Flags {
231	lang_e lang;
232	bool tsv;
233	bool web;
234	bool more_color;
235	bool comments_only;
236
237	int argc;
238	char** argv;
239	};
240
241	class AnsiPrinter : public Printer {
242	public:
243	AnsiPrinter(const Flags& flag) : Printer(), flag_(flag) {
244	}
245
246	virtual void PrintLineNumber(int line_num) {
247	if (flag_.comments_only) {
248	return;
249	}
250	printf("%s%5d%s ", BLACK2, line_num, RESET);
251	}
252
253	virtual void PrintToken(const char* line, int line_num, int start_col,
254	Token tok) {
255	const char* p_start = line + start_col;
256	int num_bytes = tok.end_col - start_col;
257	switch (tok.id) {
258	case Id::Comm:
259	if (flag_.comments_only) {
260	PrintAlways(p_start, num_bytes);
261	} else {
262	PrintColor(BLUE, p_start, num_bytes);
263	}
264	break;
265
266	case Id::Name:
267	PrintText(p_start, num_bytes);
268	break;
269
270	case Id::PreprocCommand:
271	case Id::LineCont:
272	PrintColor(PURPLE, p_start, num_bytes);
273	break;
274
275	case Id::Re2c:
276	PrintColor(PURPLE, p_start, num_bytes);
277	break;
278
279	case Id::Other:
280	if (flag_.more_color) {
281	PrintColor(PURPLE, p_start, num_bytes);
282	} else {
283	PrintText(p_start, num_bytes);
284	}
285	break;
286
287	case Id::WS:
288	if (flag_.more_color) {
289	fputs(REVERSE, stdout);
290	PrintColor(WHITE, p_start, num_bytes);
291	} else {
292	PrintText(p_start, num_bytes);
293	}
294	break;
295
296	case Id::Str:
297	PrintColor(RED, p_start, num_bytes);
298	break;
299
300	case Id::HereBegin:
301	case Id::HereEnd: {
302	PrintColor(RED2, p_start, num_bytes);
303
304	// Debug submatch extraction
305	#if 0
306	fputs(RED, stdout);
307	int n = tok.submatch_len;
308	fwrite(tok.submatch_start, 1, n, stdout);
309	fputs(RESET, stdout);
310	#endif
311	} break;
312
313	case Id::DelimStrBegin:
314	case Id::DelimStrEnd: {
315	PrintColor(RED2, p_start, num_bytes);
316
317	// Debug submatch extraction
318	#if 0
319	fputs(RED, stdout);
320	int n = tok.submatch_len;
321	fwrite(tok.submatch_start, 1, n, stdout);
322	fputs(RESET, stdout);
323	#endif
324	} break;
325
326	case Id::LBrace:
327	case Id::RBrace:
328	PrintColor(GREEN, p_start, num_bytes);
329	break;
330
331	case Id::Unknown:
332	// Make errors red
333	fputs(REVERSE, stdout);
334	PrintColor(RED, p_start, num_bytes);
335	break;
336
337	default:
338	PrintText(p_start, num_bytes);
339	break;
340	}
341	}
342
343	private:
344	void PrintColor(const char* color, const char* s, int n) {
345	fputs(color, stdout);
346	PrintText(s, n);
347	fputs(RESET, stdout);
348	}
349
350	void PrintText(const char* s, int n) {
351	if (flag_.comments_only) {
352	for (int i = 0; i < n; ++i) {
353	// Replace everything but newline with space
354	// TODO: I think we always want a newline token, including in comments.
355	// That will simplify this.
356	char c = (s[i] == '\n') ? '\n' : ' ';
357	fwrite(&c, 1, 1, stdout);
358	}
359	} else {
360	fwrite(s, 1, n, stdout);
361	}
362	}
363
364	void PrintAlways(const char* s, int n) {
365	fwrite(s, 1, n, stdout);
366	}
367
368	const Flags& flag_;
369	};
370
371	const char* Id_str(Id id) {
372	switch (id) {
373	case Id::Comm:
374	return "Comm";
375	case Id::MaybeComment: // fix-up doesn't guarantee this is gone
376	return "MaybeComment";
377	case Id::WS:
378	return "WS";
379	case Id::Re2c:
380	return "Re2c";
381
382	case Id::MaybePreproc: // fix-up doesn't guarantee this is gone
383	return "MaybePreproc";
384	case Id::PreprocCommand:
385	return "PreprocCommand";
386	case Id::PreprocOther:
387	return "PreprocOther";
388	case Id::LineCont:
389	return "LineCont";
390
391	case Id::Name:
392	return "Name";
393	case Id::Other:
394	return "Other";
395
396	case Id::Str:
397	return "Str";
398
399	case Id::HereBegin:
400	return "HereBegin";
401	case Id::HereEnd:
402	return "HereEnd";
403	case Id::DelimStrBegin:
404	return "DelimStrBegin";
405	case Id::DelimStrEnd:
406	return "DelimStrEnd";
407
408	case Id::LBrace:
409	return "LBrace";
410	case Id::RBrace:
411	return "RBrace";
412
413	case Id::Unknown:
414	return "Unknown";
415	default:
416	assert(0);
417	}
418	}
419
420	class TsvPrinter : public Printer {
421	public:
422	virtual void PrintLineNumber(int line_num) {
423	;
424	}
425
426	virtual void Swap(std::string* s) {
427	// out_.swap(*s);
428	}
429
430	virtual void PrintToken(const char* line, int line_num, int start_col,
431	Token tok) {
432	printf("%d\t%s\t%d\t%d\n", line_num, Id_str(tok.id), start_col,
433	tok.end_col);
434	// printf(" -> mode %d\n", lexer.line_mode);
435	}
436	virtual ~TsvPrinter() {
437	}
438	};
439
440	bool TokenIsSignificant(Id id) {
441	switch (id) {
442	case Id::Name:
443	case Id::Other:
444	case Id::PreprocCommand:
445	case Id::PreprocOther:
446	case Id::Re2c:
447	return true;
448
449	// Comments, whitespace, and string literals aren't significant
450	// TODO: can abort on Id::Unknown?
451	default:
452	break;
453	}
454	return false;
455	}
456
457	class OutputStream {
458	// stdout contains either
459	// - netstrings of HTML, or TSV Token structs
460	// - ANSI text
461
462	public:
463	OutputStream(Printer* pr) : pr_(pr) {
464	}
465	virtual void PathBegin(const char* path) = 0;
466	virtual void Line(int line_num, const char* line,
467	const std::vector<Token>& tokens) = 0;
468	virtual void PathEnd(int num_lines, int num_sig_lines) = 0;
469	virtual ~OutputStream() {
470	}
471
472	protected:
473	Printer* pr_; // how to print each file
474	};
475
476	class NetStringOutput : public OutputStream {
477	public:
478	NetStringOutput(Printer* pr) : OutputStream(pr) {
479	}
480
481	virtual void PathBegin(const char* path) {
482	if (path == nullptr) {
483	path = "<stdin>";
484	}
485	PrintNetString(path, strlen(path));
486	}
487
488	virtual void Line(int line_num, const char* line,
489	const std::vector<Token>& tokens) {
490	pr_->PrintLineNumber(line_num);
491
492	int start_col = 0;
493	for (auto tok : tokens) {
494	pr_->PrintToken(line, line_num, start_col, tok);
495	start_col = tok.end_col;
496	}
497
498	pr_->PrintLineEnd();
499	}
500
501	virtual void PathEnd(int num_lines, int num_sig_lines) {
502	std::string string_for_file;
503	pr_->Swap(&string_for_file);
504
505	PrintNetString(string_for_file.c_str(), string_for_file.size());
506
507	// Output summary in JSON
508	// TODO: change this to a 4th column
509	char buf[64];
510	int n = snprintf(buf, 64, "{\"num_lines\": %d, \"num_sig_lines\": %d}",
511	num_lines, num_sig_lines);
512	PrintNetString(buf, n);
513	}
514
515	private:
516	void PrintNetString(const char* s, int len) {
517	fprintf(stdout, "%d:%*s,", len, len, s);
518	}
519	};
520
521	class AnsiOutput : public OutputStream {
522	public:
523	AnsiOutput(Printer* pr) : OutputStream(pr) {
524	}
525
526	// TODO: Can respect --comments-only
527	virtual void PathBegin(const char* path) {
528	if (path == nullptr) {
529	path = "<stdin>";
530	}
531	// diff uses +++ ---
532	printf("\n");
533	printf("=== %s%s%s%s ===\n", BOLD, PURPLE, path, RESET);
534	printf("\n");
535	}
536
537	virtual void Line(int line_num, const char* line,
538	const std::vector<Token>& tokens) {
539	pr_->PrintLineNumber(line_num);
540
541	int start_col = 0;
542	for (auto tok : tokens) {
543	pr_->PrintToken(line, line_num, start_col, tok);
544	start_col = tok.end_col;
545	}
546
547	pr_->PrintLineEnd();
548	};
549
550	// TODO: Can respect --comments-only
551	virtual void PathEnd(int num_lines, int num_sig_lines) {
552	fprintf(stdout, "%s%d lines, %d significant%s\n", GREEN, num_lines,
553	num_sig_lines, RESET);
554	};
555	};
556
557	void PrintTokens(std::vector<Token>& toks) {
558	int start_col = 0;
559	int i = 0;
560	Log("===");
561	for (auto tok : toks) {
562	Log("%2d %10s %2d %2d", i, Id_str(tok.id), start_col, tok.end_col);
563	start_col = tok.end_col;
564	++i;
565	}
566	Log("===");
567	}
568
569	// BUGGY, needs unit tests
570
571	// Fiddly function, reduces the size of the output a bit
572	// "hi" becomes 1 Id::DQ token instead of 3 separate Id::DQ tokens
573	void Optimize(std::vector<Token>* tokens) {
574	std::vector<Token>& toks = *tokens; // alias
575
576	// PrintTokens(toks);
577
578	int n = toks.size();
579	if (n < 1) { // nothing to de-duplicate
580	return;
581	}
582
583	int left = 0;
584	int right = 1;
585	while (right < n) {
586	Log("right ID = %s, end %d", Id_str(toks[right].id), toks[right].end_col);
587
588	if (toks[left].id == toks[right].id) {
589	// Join the tokens together
590	toks[left].end_col = toks[right].end_col;
591	} else {
592	toks[left] = toks[right];
593	left++;
594	Log(" not eq, left = %d", left);
595	}
596	right++;
597	}
598	Log("left = %d, right = %d", left, right);
599
600	// Fiddly condition: one more iteration. Need some unit tests for this.
601	toks[left] = toks[right - 1];
602	left++;
603	assert(left <= n);
604
605	// Erase the remaining ones
606	toks.resize(left);
607
608	// PrintTokens(toks);
609	}
610
611	// Version of the above that's not in-place, led to a bug fix
612	void Optimize2(std::vector<Token>* tokens) {
613	std::vector<Token> optimized;
614
615	int n = tokens->size();
616	if (n < 1) {
617	return;
618	}
619
620	optimized.reserve(n);
621
622	int left = 0;
623	int right = 1;
624	while (right < n) {
625	optimized.push_back((*tokens)[left]);
626	left++;
627	right++;
628	}
629	optimized.push_back((*tokens)[left]);
630	left++;
631
632	tokens->swap(optimized);
633	}
634
// Returns whether `line` terminates a here doc with delimiter `here_delim`.
//
// The line matches if, after one optional leading tab, it consists of the
// delimiter followed only by spaces, tabs, \r or \n.  So for delimiter EOF,
// these all match:
//
//   EOF\n   EOF\t\n   \tEOF\r\n   EOF
//
// The last form is the final line of a file that lacks a trailing newline.
bool LineEqualsHereDelim(const char* line, std::string& here_delim) {
  // Hack: skip leading tab unconditionally, even though that's only allowed in
  // <<- Really we should capture the operator and the delim?
  if (*line == '\t') {
    line++;
  }

  size_t n = strlen(line);
  size_t h = here_delim.size();

  // The line must at least hold the delimiter itself
  if (n < h) {
    return false;
  }

  if (memcmp(line, here_delim.data(), h) != 0) {
    return false;
  }

  // Only whitespace may follow the delimiter
  for (size_t i = h; i < n; ++i) {
    switch (line[i]) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      break;
    default:
      return false;
    }
  }

  return true;
}
679
680	void CppHook::TryPreprocess(char* line, std::vector<Token>* tokens) {
681	// Fills tokens, which can be checked for beginning and end tokens
682
683	Lexer<pp_mode_e> lexer(line);
684	Matcher<pp_mode_e> matcher;
685
686	while (true) { // tokens on each line
687	Token tok;
688	// Log("Match %d", lexer.p_current - lexer.line_);
689	bool eol = matcher.Match(&lexer, &tok);
690	// Log("EOL %d", eol);
691	if (eol) {
692	break;
693	}
694	// Log("TOK %s %d", Id_str(tok.id), tok.end_col);
695	tokens->push_back(tok); // make a copy
696	}
697	}
698
699	void FixShellComments(std::vector<Token>& tokens) {
700	int n = tokens.size();
701	for (int i = 0; i < n; ++i) {
702	// # comment at start of line
703	if (tokens[i].id == Id::MaybeComment) {
704	if (i == 0) {
705	tokens[i].id = Id::Comm;
706	}
707	if (i != 0 and tokens[i - 1].id == Id::WS) {
708	tokens[i].id = Id::Comm;
709	}
710	}
711	}
712	}
713
714	// This templated method causes some code expansion, but not too much. The
715	// binary went from 38 KB to 42 KB, after being stripped.
716	// We get a little type safety with py_mode_e vs cpp_mode_e.
717
718	template <typename T>
719	int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
720	Lexer<T> lexer(nullptr);
721	Matcher<T> matcher;
722
723	int line_num = 1;
724	int num_sig = 0;
725
726	std::vector<std::string> here_list; // delimiters to pop
727	std::vector<int> here_start_num;
728
729	// For multi-line strings. This has 0 or 1 entries, and the 1 entry can be
730	// the empty string.
731	std::vector<std::string> delim_begin;
732
733	while (true) { // read each line, handling errors
734	if (!reader->NextLine()) {
735	const char* name = reader->Filename() ?: "<stdin>";
736	Log("micro-syntax: getline() error on %s: %s", name,
737	strerror(reader->err_num_));
738	return 1;
739	}
740	char* line = reader->Current();
741	if (line == nullptr) {
742	break; // EOF
743	}
744
745	std::vector<Token> pre_tokens;
746
747	hook->TryPreprocess(line, &pre_tokens);
748
749	// e.g #define at beginning of line
750	if (pre_tokens.size() && pre_tokens[0].id == Id::MaybePreproc) {
751	pre_tokens[0].id = Id::PreprocCommand;
752
753	out->Line(line_num, line, pre_tokens);
754
755	line_num += 1;
756	num_sig += 1;
757
758	Token last = pre_tokens.back();
759	while (last.id == Id::LineCont) {
760	const char* blame = reader->Filename() ?: "<stdin>";
761	if (!reader->NextLine()) {
762	Log("micro-syntax: getline() error on %s: %s", blame,
763	strerror(reader->err_num_));
764	return 1;
765	}
766	char* line = reader->Current();
767	if (line == nullptr) {
768	Log("Unexpected end-of-file in preprocessor in %s", blame);
769	return 1;
770	}
771
772	pre_tokens.clear();
773	hook->TryPreprocess(line, &pre_tokens);
774
775	out->Line(line_num, line, pre_tokens);
776
777	line_num += 1;
778	num_sig += 1;
779
780	last = pre_tokens.back();
781	}
782	continue; // Skip the rest of the loop
783	}
784
785	//
786	// Main Loop for "normal" lines (not preprocessor or here doc)
787	//
788
789	std::vector<Token> tokens;
790	lexer.SetLine(line);
791
792	bool line_is_sig = false;
793	while (true) { // tokens on each line
794	Token tok;
795	bool eol = matcher.Match(&lexer, &tok);
796	if (eol) {
797	break;
798	}
799
800	switch (tok.id) {
801	case Id::HereBegin: {
802	// Put a copy on the stack
803	int n = tok.submatch_end - tok.submatch_start;
804	here_list.emplace_back(line + tok.submatch_start, n);
805	here_start_num.push_back(line_num);
806	} break;
807
808	case Id::DelimStrBegin: {
809	if (delim_begin.empty()) {
810	int n = tok.submatch_end - tok.submatch_start;
811	delim_begin.emplace_back(line + tok.submatch_start, n);
812	} else {
813	// We have entered cpp_mode_e::DelimStr, which means we should never
814	// return another DelimStrBegin
815	assert(0);
816	}
817	} break;
818
819	case Id::DelimStrEnd: {
820	if (delim_begin.empty()) {
821	// We should never get this unless we got a DelimStrBegin first
822	assert(0);
823	} else {
824	size_t n = tok.submatch_end - tok.submatch_start;
825	std::string end_delim(line + tok.submatch_start, n);
826
827	if (end_delim == delim_begin.back()) {
828	lexer.line_mode = T::Outer; // the string is ended
829	delim_begin.pop_back();
830	} else {
831	tok.id = Id::Str; // mismatched delimiter is just a string
832	}
833	}
834	} break;
835
836	default:
837	break;
838	}
839
840	tokens.push_back(tok); // make a copy
841
842	if (TokenIsSignificant(tok.id)) {
843	line_is_sig = true;
844	}
845	}
846
847	#if 0
848	PrintTokens(tokens);
849	Log("%d tokens before", tokens.size());
850	Optimize(&tokens);
851	Log("%d tokens after", tokens.size());
852	PrintTokens(tokens);
853	#endif
854
855	FixShellComments(tokens);
856
857	out->Line(line_num, line, tokens);
858	tokens.clear();
859
860	// Potentially multiple here docs for this line
861	int here_index = 0;
862	for (auto here_delim : here_list) {
863	// Log("HERE %s", here_delim.c_str());
864
865	while (true) {
866	const char* blame = reader->Filename() ?: "<stdin>";
867	if (!reader->NextLine()) {
868	Log("micro-syntax: getline() error on %s: %s", blame,
869	strerror(reader->err_num_));
870	return 1;
871	}
872	char* line = reader->Current();
873	if (line == nullptr) {
874	int start_line = here_start_num[here_index];
875	Log("Unexpected end-of-file in here doc in %s, start line %d", blame,
876	start_line);
877	return 1;
878	}
879
880	line_num++;
881
882	if (LineEqualsHereDelim(line, here_delim)) {
883	int n = strlen(line);
884	Token whole_line(Id::HereEnd, n);
885	tokens.push_back(whole_line);
886	out->Line(line_num, line, tokens);
887	tokens.clear();
888	break;
889
890	} else {
891	int n = strlen(line);
892	Token whole_line(Id::Str, n);
893	tokens.push_back(whole_line);
894	out->Line(line_num, line, tokens);
895	tokens.clear();
896
897	// Log(" not equal: %s", line);
898	}
899	}
900	here_index++;
901	}
902	here_list.clear();
903	here_start_num.clear();
904
905	line_num++;
906	num_sig += line_is_sig;
907	}
908
909	out->PathEnd(line_num - 1, num_sig);
910	return 0;
911	}
912
913	int ScanFiles(const Flags& flag, std::vector<char> files, OutputStream out,
914	Hook* hook) {
915	Reader* reader = nullptr;
916
917	int status = 0;
918	for (auto path : files) {
919	FILE* f;
920	if (path == nullptr) {
921	f = stdin;
922	} else {
923	f = fopen(path, "r");
924	if (f == nullptr) {
925	Log("Error opening %s: %s", path, strerror(errno));
926	return 1;
927	}
928	}
929	out->PathBegin(path);
930
931	reader = new Reader(f, path);
932
933	switch (flag.lang) {
934	case lang_e::PlainText:
935	status = ScanOne<text_mode_e>(reader, out, hook);
936	break;
937
938	case lang_e::Py:
939	status = ScanOne<py_mode_e>(reader, out, hook);
940	break;
941
942	case lang_e::Cpp:
943	status = ScanOne<cpp_mode_e>(reader, out, hook);
944	break;
945
946	case lang_e::Shell:
947	status = ScanOne<sh_mode_e>(reader, out, hook);
948	break;
949
950	case lang_e::Asdl:
951	status = ScanOne<asdl_mode_e>(reader, out, hook);
952	break;
953
954	case lang_e::R:
955	status = ScanOne<R_mode_e>(reader, out, hook);
956	break;
957
958	default:
959	assert(0);
960	}
961
962	delete reader;
963
964	if (path == nullptr) {
965	;
966	} else {
967	fclose(f);
968	}
969
970	if (status != 0) {
971	break;
972	}
973	}
974
975	return status;
976	}
977
// Prints the usage message to stdout.
//
// Flags marked "not implemented" are planned, but aren't parsed by main().
void PrintHelp() {
  puts(R"(Usage: micro-syntax FLAGS* FILE*

Recognizes the syntax of each file, and prints it to stdout.

If there are no files, reads stdin.

Flags:
  -h          This help

  -l LANG     Language: py|cpp|shell|...
  -t          Print tokens as TSV, instead of ANSI color
  -w          Print HTML for the web

  -m          More color, useful for debugging tokens

  -o          Only print comments

Not implemented:
  -n --no-comments    Omit comments
  -e --empty-strs     Substitute string literals for empty strings
     --color          on off always more
     Long names of the flags above: --help --lang --comments-only

)");
}
1001
1002	int main(int argc, char** argv) {
1003	Flags flag = {lang_e::PlainText};
1004
1005	// http://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
1006	// + means to be strict about flag parsing.
1007	int c;
1008	while ((c = getopt(argc, argv, "+hl:motw")) != -1) {
1009	switch (c) {
1010	case 'h':
1011	PrintHelp();
1012	return 0;
1013
1014	case 'l':
1015	if (strcmp(optarg, "cpp") == 0) {
1016	flag.lang = lang_e::Cpp;
1017
1018	} else if (strcmp(optarg, "py") == 0) {
1019	flag.lang = lang_e::Py;
1020
1021	} else if (strcmp(optarg, "shell") == 0) {
1022	flag.lang = lang_e::Shell;
1023
1024	} else if (strcmp(optarg, "asdl") == 0) {
1025	flag.lang = lang_e::Asdl;
1026
1027	} else if (strcmp(optarg, "R") == 0) {
1028	flag.lang = lang_e::R;
1029
1030	// TODO: implement all of these
1031	} else if (strcmp(optarg, "js") == 0) {
1032	flag.lang = lang_e::PlainText;
1033
1034	} else if (strcmp(optarg, "css") == 0) {
1035	flag.lang = lang_e::PlainText;
1036
1037	} else if (strcmp(optarg, "md") == 0) {
1038	flag.lang = lang_e::PlainText;
1039
1040	} else if (strcmp(optarg, "yaml") == 0) {
1041	flag.lang = lang_e::PlainText;
1042
1043	} else if (strcmp(optarg, "txt") == 0) {
1044	flag.lang = lang_e::PlainText;
1045
1046	} else if (strcmp(optarg, "other") == 0) {
1047	flag.lang = lang_e::PlainText;
1048
1049	} else {
1050	Log("Expected -l LANG to be cpp\|py\|shell\|asdl\|R\|js\|css\|md\|yaml\|txt, "
1051	"got %s",
1052	optarg);
1053	return 2;
1054	}
1055	break;
1056
1057	case 'm':
1058	flag.more_color = true;
1059	break;
1060
1061	case 'o':
1062	flag.comments_only = true;
1063	break;
1064
1065	case 't':
1066	flag.tsv = true;
1067	break;
1068
1069	case 'w':
1070	flag.web = true;
1071	break;
1072
1073	case '?': // getopt library will print error
1074	return 2;
1075
1076	default:
1077	abort(); // should never happen
1078	}
1079	}
1080
1081	int a = optind; // index into argv
1082	flag.argv = argv + a;
1083	flag.argc = argc - a;
1084
1085	std::vector<char*> files; // filename, or nullptr for stdin
1086	if (flag.argc != 0) {
1087	for (int i = 0; i < flag.argc; ++i) {
1088	files.push_back(flag.argv[i]);
1089	}
1090	} else {
1091	files.push_back(nullptr); // stands for stdin
1092	}
1093
1094	Printer* pr; // for each file
1095	OutputStream* out; // the entire stream
1096
1097	if (flag.tsv) {
1098	pr = new TsvPrinter();
1099	out = new NetStringOutput(pr);
1100	} else if (flag.web) {
1101	pr = new HtmlPrinter();
1102	out = new NetStringOutput(pr);
1103	} else {
1104	pr = new AnsiPrinter(flag);
1105	out = new AnsiOutput(pr);
1106	}
1107
1108	Hook* hook = nullptr;
1109	if (flag.lang == lang_e::Cpp) {
1110	hook = new CppHook();
1111	} else {
1112	hook = new Hook(); // default hook
1113	}
1114
1115	int status = ScanFiles(flag, files, out, hook);
1116
1117	delete hook;
1118	delete pr;
1119	delete out;
1120
1121	return status;
1122	}