OILS / doctools / micro_syntax.cc View on Github | oilshell.org

1122 lines, 713 significant
1// Micro Syntax
2//
3// See doctools/micro-syntax.md
4
5#include "micro_syntax.h" // requires -I $BASE_DIR
6
7#include <assert.h>
8#include <errno.h>
9#include <getopt.h>
10#include <stdarg.h> // va_list, etc.
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h> // free
14#include <string.h>
15
16#include <string>
17#include <vector>
18
// ANSI escape sequences used by the terminal output mode.

// Attributes
const char* RESET = "\x1b[0;0m";  // undo any color / attribute set earlier
const char* BOLD = "\x1b[1m";
const char* UNDERLINE = "\x1b[4m";
const char* REVERSE = "\x1b[7m";  // reverse video

// Foreground colors, codes 30-37
const char* BLACK = "\x1b[30m";
const char* RED = "\x1b[31m";
const char* GREEN = "\x1b[32m";
const char* YELLOW = "\x1b[33m";
const char* BLUE = "\x1b[34m";
const char* PURPLE = "\x1b[35m";
const char* CYAN = "\x1b[36m";
const char* WHITE = "\x1b[37m";

// Second set of foreground colors, codes in the 90s
const char* BLACK2 = "\x1b[90m";
const char* RED2 = "\x1b[91m";
const char* BLUE2 = "\x1b[94m";
36
// Writes a printf-style message to stderr and terminates it with a newline.
void Log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);

  fputc('\n', stderr);
}
44
// Which language's syntax to recognize.  Selected with the -l flag.
enum class lang_e {
  PlainText,

  Cpp,  // C is scanned as Cpp too
  Py,
  Shell,
  Ysh,  // ''' etc.
  Asdl,
  R,  // comments start with #

  // JS,  // comments start with //
};
57
// Reads a FILE* one line at a time with getline().
//
// Lines are handed out as NUL-terminated C strings.  We don't care about
// internal NUL bytes, so this interface doesn't allow them.
class Reader {
 public:
  // f is not closed by the Reader.  filename is kept only for error messages,
  // and is nullptr for stdin.
  Reader(FILE* f, const char* filename)
      : f_(f),
        filename_(filename),
        line_(nullptr),
        allocated_size_(0),
        err_num_(0) {
  }

  // Releases the line buffer if the caller stopped reading before EOF.
  ~Reader() {
    free(line_);
  }

  // The Reader owns the getline() buffer, so copying would double-free it.
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;

  const char* Filename() {  // for error messages only, nullptr for stdin
    return filename_;
  }

  // Advances to the next line.
  //
  // Returns false if there was an I/O error, and sets err_num_.
  // Returns true otherwise, and Current() can be checked: it's nullptr on EOF.
  bool NextLine() {
    // getline() reports both EOF and errors with -1, and only sets errno in
    // the error case.  Clear it first so a stale value left by an earlier
    // library call isn't mistaken for a read error.
    errno = 0;

    // Note: getline() reuses or reallocates the previous line's buffer, so we
    // don't have to free it between calls
    ssize_t len = getline(&line_, &allocated_size_, f_);

    if (len < 0) {  // EOF is -1
      int saved_errno = errno;  // read it before free() gets a chance to run

      // man page says the buffer should be freed if getline() fails
      free(line_);
      line_ = nullptr;  // tell the caller not to continue
      allocated_size_ = 0;

      if (saved_errno != 0) {  // I/O error
        err_num_ = saved_errno;
        return false;
      }
    }
    return true;
  }

  // Returns the line read by the last NextLine(), or nullptr on EOF.
  char* Current() {
    return line_;
  }

  FILE* f_;
  const char* filename_;

  char* line_;  // valid for one NextLine() call, nullptr on EOF or error
  size_t allocated_size_;  // buffer capacity, maintained by getline()
  int err_num_;            // errno of the last failed read, 0 if none
};
103
104class Printer {
105 public:
106 virtual void PrintLineNumber(int line_num) = 0;
107 virtual void PrintLineEnd() {
108 }
109 virtual void PrintToken(const char* line, int line_num, int start_col,
110 Token token) = 0;
111 virtual void Swap(std::string* s) {
112 assert(0);
113 }
114 virtual ~Printer() {
115 }
116};
117
118class HtmlPrinter : public Printer {
119 public:
120 HtmlPrinter() : Printer(), out_() {
121 }
122
123 virtual void Swap(std::string* s) {
124 // assert(s != nullptr);
125 out_.swap(*s);
126 }
127
128 virtual void PrintLineNumber(int line_num) {
129 char buf[16];
130 snprintf(buf, 16, "%d", line_num);
131
132 out_.append("<tr><td class=num>"); // <tr> closed by PrintLineEnd()
133 out_.append(buf);
134 out_.append("</td><td id=L"); // jump to line with foo.html#L32
135 out_.append(buf);
136 out_.append(" class=line>"); // <td> closed by PrintLineEnd()
137 }
138
139 virtual void PrintLineEnd() {
140 out_.append("</td></tr>");
141 }
142
143 virtual void PrintToken(const char* line, int line_num, int start_col,
144 Token tok) {
145 const char* p_start = line + start_col;
146 int num_bytes = tok.end_col - start_col;
147
148 switch (tok.id) {
149 case Id::Comm:
150 PrintSpan("comm", p_start, num_bytes);
151 break;
152
153 case Id::Name:
154 PrintEscaped(p_start, num_bytes);
155 break;
156
157 case Id::PreprocCommand:
158 case Id::LineCont:
159 PrintSpan("preproc", p_start, num_bytes);
160 break;
161
162 case Id::Re2c:
163 PrintSpan("re2c", p_start, num_bytes);
164 break;
165
166 case Id::Other:
167 // PrintSpan("other", p_start, num_bytes);
168 PrintEscaped(p_start, num_bytes);
169 break;
170
171 // for now these are strings
172 case Id::HereBegin:
173 case Id::HereEnd:
174 case Id::Str:
175 PrintSpan("str", p_start, num_bytes);
176 break;
177
178 case Id::LBrace:
179 case Id::RBrace:
180 PrintSpan("brace", p_start, num_bytes);
181 break;
182
183 case Id::Unknown:
184 PrintSpan("x", p_start, num_bytes);
185 break;
186
187 default:
188 PrintEscaped(p_start, num_bytes);
189 break;
190 }
191 }
192
193 private:
194 void PrintEscaped(const char* s, int len) {
195 // HTML escape the code string
196 for (int i = 0; i < len; ++i) {
197 char c = s[i];
198
199 switch (c) {
200 case '<':
201 out_.append("&lt;");
202 break;
203 case '>':
204 out_.append("&gt;");
205 break;
206 case '&':
207 out_.append("&amp;");
208 break;
209 default:
210 // Is this inefficient? Fill 1 char
211 out_.append(1, s[i]);
212 break;
213 }
214 }
215 }
216
217 void PrintSpan(const char* css_class, const char* s, int len) {
218 out_.append("<span class=");
219 out_.append(css_class);
220 out_.append(">");
221
222 PrintEscaped(s, len);
223
224 out_.append("</span>");
225 }
226
227 std::string out_;
228};
229
230struct Flags {
231 lang_e lang;
232 bool tsv;
233 bool web;
234 bool more_color;
235 bool comments_only;
236
237 int argc;
238 char** argv;
239};
240
241class AnsiPrinter : public Printer {
242 public:
243 AnsiPrinter(const Flags& flag) : Printer(), flag_(flag) {
244 }
245
246 virtual void PrintLineNumber(int line_num) {
247 if (flag_.comments_only) {
248 return;
249 }
250 printf("%s%5d%s ", BLACK2, line_num, RESET);
251 }
252
253 virtual void PrintToken(const char* line, int line_num, int start_col,
254 Token tok) {
255 const char* p_start = line + start_col;
256 int num_bytes = tok.end_col - start_col;
257 switch (tok.id) {
258 case Id::Comm:
259 if (flag_.comments_only) {
260 PrintAlways(p_start, num_bytes);
261 } else {
262 PrintColor(BLUE, p_start, num_bytes);
263 }
264 break;
265
266 case Id::Name:
267 PrintText(p_start, num_bytes);
268 break;
269
270 case Id::PreprocCommand:
271 case Id::LineCont:
272 PrintColor(PURPLE, p_start, num_bytes);
273 break;
274
275 case Id::Re2c:
276 PrintColor(PURPLE, p_start, num_bytes);
277 break;
278
279 case Id::Other:
280 if (flag_.more_color) {
281 PrintColor(PURPLE, p_start, num_bytes);
282 } else {
283 PrintText(p_start, num_bytes);
284 }
285 break;
286
287 case Id::WS:
288 if (flag_.more_color) {
289 fputs(REVERSE, stdout);
290 PrintColor(WHITE, p_start, num_bytes);
291 } else {
292 PrintText(p_start, num_bytes);
293 }
294 break;
295
296 case Id::Str:
297 PrintColor(RED, p_start, num_bytes);
298 break;
299
300 case Id::HereBegin:
301 case Id::HereEnd: {
302 PrintColor(RED2, p_start, num_bytes);
303
304 // Debug submatch extraction
305#if 0
306 fputs(RED, stdout);
307 int n = tok.submatch_len;
308 fwrite(tok.submatch_start, 1, n, stdout);
309 fputs(RESET, stdout);
310#endif
311 } break;
312
313 case Id::DelimStrBegin:
314 case Id::DelimStrEnd: {
315 PrintColor(RED2, p_start, num_bytes);
316
317 // Debug submatch extraction
318#if 0
319 fputs(RED, stdout);
320 int n = tok.submatch_len;
321 fwrite(tok.submatch_start, 1, n, stdout);
322 fputs(RESET, stdout);
323#endif
324 } break;
325
326 case Id::LBrace:
327 case Id::RBrace:
328 PrintColor(GREEN, p_start, num_bytes);
329 break;
330
331 case Id::Unknown:
332 // Make errors red
333 fputs(REVERSE, stdout);
334 PrintColor(RED, p_start, num_bytes);
335 break;
336
337 default:
338 PrintText(p_start, num_bytes);
339 break;
340 }
341 }
342
343 private:
344 void PrintColor(const char* color, const char* s, int n) {
345 fputs(color, stdout);
346 PrintText(s, n);
347 fputs(RESET, stdout);
348 }
349
350 void PrintText(const char* s, int n) {
351 if (flag_.comments_only) {
352 for (int i = 0; i < n; ++i) {
353 // Replace everything but newline with space
354 // TODO: I think we always want a newline token, including in comments.
355 // That will simplify this.
356 char c = (s[i] == '\n') ? '\n' : ' ';
357 fwrite(&c, 1, 1, stdout);
358 }
359 } else {
360 fwrite(s, 1, n, stdout);
361 }
362 }
363
364 void PrintAlways(const char* s, int n) {
365 fwrite(s, 1, n, stdout);
366 }
367
368 const Flags& flag_;
369};
370
371const char* Id_str(Id id) {
372 switch (id) {
373 case Id::Comm:
374 return "Comm";
375 case Id::MaybeComment: // fix-up doesn't guarantee this is gone
376 return "MaybeComment";
377 case Id::WS:
378 return "WS";
379 case Id::Re2c:
380 return "Re2c";
381
382 case Id::MaybePreproc: // fix-up doesn't guarantee this is gone
383 return "MaybePreproc";
384 case Id::PreprocCommand:
385 return "PreprocCommand";
386 case Id::PreprocOther:
387 return "PreprocOther";
388 case Id::LineCont:
389 return "LineCont";
390
391 case Id::Name:
392 return "Name";
393 case Id::Other:
394 return "Other";
395
396 case Id::Str:
397 return "Str";
398
399 case Id::HereBegin:
400 return "HereBegin";
401 case Id::HereEnd:
402 return "HereEnd";
403 case Id::DelimStrBegin:
404 return "DelimStrBegin";
405 case Id::DelimStrEnd:
406 return "DelimStrEnd";
407
408 case Id::LBrace:
409 return "LBrace";
410 case Id::RBrace:
411 return "RBrace";
412
413 case Id::Unknown:
414 return "Unknown";
415 default:
416 assert(0);
417 }
418}
419
420class TsvPrinter : public Printer {
421 public:
422 virtual void PrintLineNumber(int line_num) {
423 ;
424 }
425
426 virtual void Swap(std::string* s) {
427 // out_.swap(*s);
428 }
429
430 virtual void PrintToken(const char* line, int line_num, int start_col,
431 Token tok) {
432 printf("%d\t%s\t%d\t%d\n", line_num, Id_str(tok.id), start_col,
433 tok.end_col);
434 // printf(" -> mode %d\n", lexer.line_mode);
435 }
436 virtual ~TsvPrinter() {
437 }
438};
439
440bool TokenIsSignificant(Id id) {
441 switch (id) {
442 case Id::Name:
443 case Id::Other:
444 case Id::PreprocCommand:
445 case Id::PreprocOther:
446 case Id::Re2c:
447 return true;
448
449 // Comments, whitespace, and string literals aren't significant
450 // TODO: can abort on Id::Unknown?
451 default:
452 break;
453 }
454 return false;
455}
456
457class OutputStream {
458 // stdout contains either
459 // - netstrings of HTML, or TSV Token structs
460 // - ANSI text
461
462 public:
463 OutputStream(Printer* pr) : pr_(pr) {
464 }
465 virtual void PathBegin(const char* path) = 0;
466 virtual void Line(int line_num, const char* line,
467 const std::vector<Token>& tokens) = 0;
468 virtual void PathEnd(int num_lines, int num_sig_lines) = 0;
469 virtual ~OutputStream() {
470 }
471
472 protected:
473 Printer* pr_; // how to print each file
474};
475
476class NetStringOutput : public OutputStream {
477 public:
478 NetStringOutput(Printer* pr) : OutputStream(pr) {
479 }
480
481 virtual void PathBegin(const char* path) {
482 if (path == nullptr) {
483 path = "<stdin>";
484 }
485 PrintNetString(path, strlen(path));
486 }
487
488 virtual void Line(int line_num, const char* line,
489 const std::vector<Token>& tokens) {
490 pr_->PrintLineNumber(line_num);
491
492 int start_col = 0;
493 for (auto tok : tokens) {
494 pr_->PrintToken(line, line_num, start_col, tok);
495 start_col = tok.end_col;
496 }
497
498 pr_->PrintLineEnd();
499 }
500
501 virtual void PathEnd(int num_lines, int num_sig_lines) {
502 std::string string_for_file;
503 pr_->Swap(&string_for_file);
504
505 PrintNetString(string_for_file.c_str(), string_for_file.size());
506
507 // Output summary in JSON
508 // TODO: change this to a 4th column
509 char buf[64];
510 int n = snprintf(buf, 64, "{\"num_lines\": %d, \"num_sig_lines\": %d}",
511 num_lines, num_sig_lines);
512 PrintNetString(buf, n);
513 }
514
515 private:
516 void PrintNetString(const char* s, int len) {
517 fprintf(stdout, "%d:%*s,", len, len, s);
518 }
519};
520
521class AnsiOutput : public OutputStream {
522 public:
523 AnsiOutput(Printer* pr) : OutputStream(pr) {
524 }
525
526 // TODO: Can respect --comments-only
527 virtual void PathBegin(const char* path) {
528 if (path == nullptr) {
529 path = "<stdin>";
530 }
531 // diff uses +++ ---
532 printf("\n");
533 printf("=== %s%s%s%s ===\n", BOLD, PURPLE, path, RESET);
534 printf("\n");
535 }
536
537 virtual void Line(int line_num, const char* line,
538 const std::vector<Token>& tokens) {
539 pr_->PrintLineNumber(line_num);
540
541 int start_col = 0;
542 for (auto tok : tokens) {
543 pr_->PrintToken(line, line_num, start_col, tok);
544 start_col = tok.end_col;
545 }
546
547 pr_->PrintLineEnd();
548 };
549
550 // TODO: Can respect --comments-only
551 virtual void PathEnd(int num_lines, int num_sig_lines) {
552 fprintf(stdout, "%s%d lines, %d significant%s\n", GREEN, num_lines,
553 num_sig_lines, RESET);
554 };
555};
556
557void PrintTokens(std::vector<Token>& toks) {
558 int start_col = 0;
559 int i = 0;
560 Log("===");
561 for (auto tok : toks) {
562 Log("%2d %10s %2d %2d", i, Id_str(tok.id), start_col, tok.end_col);
563 start_col = tok.end_col;
564 ++i;
565 }
566 Log("===");
567}
568
569// BUGGY, needs unit tests
570
571// Fiddly function, reduces the size of the output a bit
572// "hi" becomes 1 Id::DQ token instead of 3 separate Id::DQ tokens
573void Optimize(std::vector<Token>* tokens) {
574 std::vector<Token>& toks = *tokens; // alias
575
576 // PrintTokens(toks);
577
578 int n = toks.size();
579 if (n < 1) { // nothing to de-duplicate
580 return;
581 }
582
583 int left = 0;
584 int right = 1;
585 while (right < n) {
586 Log("right ID = %s, end %d", Id_str(toks[right].id), toks[right].end_col);
587
588 if (toks[left].id == toks[right].id) {
589 // Join the tokens together
590 toks[left].end_col = toks[right].end_col;
591 } else {
592 toks[left] = toks[right];
593 left++;
594 Log(" not eq, left = %d", left);
595 }
596 right++;
597 }
598 Log("left = %d, right = %d", left, right);
599
600 // Fiddly condition: one more iteration. Need some unit tests for this.
601 toks[left] = toks[right - 1];
602 left++;
603 assert(left <= n);
604
605 // Erase the remaining ones
606 toks.resize(left);
607
608 // PrintTokens(toks);
609}
610
611// Version of the above that's not in-place, led to a bug fix
612void Optimize2(std::vector<Token>* tokens) {
613 std::vector<Token> optimized;
614
615 int n = tokens->size();
616 if (n < 1) {
617 return;
618 }
619
620 optimized.reserve(n);
621
622 int left = 0;
623 int right = 1;
624 while (right < n) {
625 optimized.push_back((*tokens)[left]);
626 left++;
627 right++;
628 }
629 optimized.push_back((*tokens)[left]);
630 left++;
631
632 tokens->swap(optimized);
633}
634
// Returns true if `line` terminates a here doc whose delimiter is here_delim.
//
// The line matches when it consists of the delimiter followed by nothing but
// whitespace, e.g. for the delimiter EOF:
//
//   EOF\n      yes
//   EOF \t\n   yes
//   EOF        yes: last line of a file that has no trailing newline
//   EOFx\n     no
//   EO\n       no
//
// here_delim is not modified.
bool LineEqualsHereDelim(const char* line, std::string& here_delim) {
  // Hack: skip leading tab unconditionally, even though that's only allowed in
  // <<-  Really we should capture the operator and the delim?
  if (*line == '\t') {
    line++;
  }

  size_t h = here_delim.size();

  // The line must be at least as long as the delimiter.  This also makes the
  // memcmp() below safe.
  if (strlen(line) < h) {
    return false;
  }
  if (memcmp(line, here_delim.data(), h) != 0) {
    return false;
  }

  // Only whitespace, including the line terminator, may follow the delimiter
  const char* rest = line + h;
  return rest[strspn(rest, " \t\r\n")] == '\0';
}
679
680void CppHook::TryPreprocess(char* line, std::vector<Token>* tokens) {
681 // Fills tokens, which can be checked for beginning and end tokens
682
683 Lexer<pp_mode_e> lexer(line);
684 Matcher<pp_mode_e> matcher;
685
686 while (true) { // tokens on each line
687 Token tok;
688 // Log("Match %d", lexer.p_current - lexer.line_);
689 bool eol = matcher.Match(&lexer, &tok);
690 // Log("EOL %d", eol);
691 if (eol) {
692 break;
693 }
694 // Log("TOK %s %d", Id_str(tok.id), tok.end_col);
695 tokens->push_back(tok); // make a copy
696 }
697}
698
699void FixShellComments(std::vector<Token>& tokens) {
700 int n = tokens.size();
701 for (int i = 0; i < n; ++i) {
702 // # comment at start of line
703 if (tokens[i].id == Id::MaybeComment) {
704 if (i == 0) {
705 tokens[i].id = Id::Comm;
706 }
707 if (i != 0 and tokens[i - 1].id == Id::WS) {
708 tokens[i].id = Id::Comm;
709 }
710 }
711 }
712}
713
714// This templated method causes some code expansion, but not too much. The
715// binary went from 38 KB to 42 KB, after being stripped.
716// We get a little type safety with py_mode_e vs cpp_mode_e.
717
718template <typename T>
719int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
720 Lexer<T> lexer(nullptr);
721 Matcher<T> matcher;
722
723 int line_num = 1;
724 int num_sig = 0;
725
726 std::vector<std::string> here_list; // delimiters to pop
727 std::vector<int> here_start_num;
728
729 // For multi-line strings. This has 0 or 1 entries, and the 1 entry can be
730 // the empty string.
731 std::vector<std::string> delim_begin;
732
733 while (true) { // read each line, handling errors
734 if (!reader->NextLine()) {
735 const char* name = reader->Filename() ?: "<stdin>";
736 Log("micro-syntax: getline() error on %s: %s", name,
737 strerror(reader->err_num_));
738 return 1;
739 }
740 char* line = reader->Current();
741 if (line == nullptr) {
742 break; // EOF
743 }
744
745 std::vector<Token> pre_tokens;
746
747 hook->TryPreprocess(line, &pre_tokens);
748
749 // e.g #define at beginning of line
750 if (pre_tokens.size() && pre_tokens[0].id == Id::MaybePreproc) {
751 pre_tokens[0].id = Id::PreprocCommand;
752
753 out->Line(line_num, line, pre_tokens);
754
755 line_num += 1;
756 num_sig += 1;
757
758 Token last = pre_tokens.back();
759 while (last.id == Id::LineCont) {
760 const char* blame = reader->Filename() ?: "<stdin>";
761 if (!reader->NextLine()) {
762 Log("micro-syntax: getline() error on %s: %s", blame,
763 strerror(reader->err_num_));
764 return 1;
765 }
766 char* line = reader->Current();
767 if (line == nullptr) {
768 Log("Unexpected end-of-file in preprocessor in %s", blame);
769 return 1;
770 }
771
772 pre_tokens.clear();
773 hook->TryPreprocess(line, &pre_tokens);
774
775 out->Line(line_num, line, pre_tokens);
776
777 line_num += 1;
778 num_sig += 1;
779
780 last = pre_tokens.back();
781 }
782 continue; // Skip the rest of the loop
783 }
784
785 //
786 // Main Loop for "normal" lines (not preprocessor or here doc)
787 //
788
789 std::vector<Token> tokens;
790 lexer.SetLine(line);
791
792 bool line_is_sig = false;
793 while (true) { // tokens on each line
794 Token tok;
795 bool eol = matcher.Match(&lexer, &tok);
796 if (eol) {
797 break;
798 }
799
800 switch (tok.id) {
801 case Id::HereBegin: {
802 // Put a copy on the stack
803 int n = tok.submatch_end - tok.submatch_start;
804 here_list.emplace_back(line + tok.submatch_start, n);
805 here_start_num.push_back(line_num);
806 } break;
807
808 case Id::DelimStrBegin: {
809 if (delim_begin.empty()) {
810 int n = tok.submatch_end - tok.submatch_start;
811 delim_begin.emplace_back(line + tok.submatch_start, n);
812 } else {
813 // We have entered cpp_mode_e::DelimStr, which means we should never
814 // return another DelimStrBegin
815 assert(0);
816 }
817 } break;
818
819 case Id::DelimStrEnd: {
820 if (delim_begin.empty()) {
821 // We should never get this unless we got a DelimStrBegin first
822 assert(0);
823 } else {
824 size_t n = tok.submatch_end - tok.submatch_start;
825 std::string end_delim(line + tok.submatch_start, n);
826
827 if (end_delim == delim_begin.back()) {
828 lexer.line_mode = T::Outer; // the string is ended
829 delim_begin.pop_back();
830 } else {
831 tok.id = Id::Str; // mismatched delimiter is just a string
832 }
833 }
834 } break;
835
836 default:
837 break;
838 }
839
840 tokens.push_back(tok); // make a copy
841
842 if (TokenIsSignificant(tok.id)) {
843 line_is_sig = true;
844 }
845 }
846
847#if 0
848 PrintTokens(tokens);
849 Log("%d tokens before", tokens.size());
850 Optimize(&tokens);
851 Log("%d tokens after", tokens.size());
852 PrintTokens(tokens);
853#endif
854
855 FixShellComments(tokens);
856
857 out->Line(line_num, line, tokens);
858 tokens.clear();
859
860 // Potentially multiple here docs for this line
861 int here_index = 0;
862 for (auto here_delim : here_list) {
863 // Log("HERE %s", here_delim.c_str());
864
865 while (true) {
866 const char* blame = reader->Filename() ?: "<stdin>";
867 if (!reader->NextLine()) {
868 Log("micro-syntax: getline() error on %s: %s", blame,
869 strerror(reader->err_num_));
870 return 1;
871 }
872 char* line = reader->Current();
873 if (line == nullptr) {
874 int start_line = here_start_num[here_index];
875 Log("Unexpected end-of-file in here doc in %s, start line %d", blame,
876 start_line);
877 return 1;
878 }
879
880 line_num++;
881
882 if (LineEqualsHereDelim(line, here_delim)) {
883 int n = strlen(line);
884 Token whole_line(Id::HereEnd, n);
885 tokens.push_back(whole_line);
886 out->Line(line_num, line, tokens);
887 tokens.clear();
888 break;
889
890 } else {
891 int n = strlen(line);
892 Token whole_line(Id::Str, n);
893 tokens.push_back(whole_line);
894 out->Line(line_num, line, tokens);
895 tokens.clear();
896
897 // Log(" not equal: %s", line);
898 }
899 }
900 here_index++;
901 }
902 here_list.clear();
903 here_start_num.clear();
904
905 line_num++;
906 num_sig += line_is_sig;
907 }
908
909 out->PathEnd(line_num - 1, num_sig);
910 return 0;
911}
912
913int ScanFiles(const Flags& flag, std::vector<char*> files, OutputStream* out,
914 Hook* hook) {
915 Reader* reader = nullptr;
916
917 int status = 0;
918 for (auto path : files) {
919 FILE* f;
920 if (path == nullptr) {
921 f = stdin;
922 } else {
923 f = fopen(path, "r");
924 if (f == nullptr) {
925 Log("Error opening %s: %s", path, strerror(errno));
926 return 1;
927 }
928 }
929 out->PathBegin(path);
930
931 reader = new Reader(f, path);
932
933 switch (flag.lang) {
934 case lang_e::PlainText:
935 status = ScanOne<text_mode_e>(reader, out, hook);
936 break;
937
938 case lang_e::Py:
939 status = ScanOne<py_mode_e>(reader, out, hook);
940 break;
941
942 case lang_e::Cpp:
943 status = ScanOne<cpp_mode_e>(reader, out, hook);
944 break;
945
946 case lang_e::Shell:
947 status = ScanOne<sh_mode_e>(reader, out, hook);
948 break;
949
950 case lang_e::Asdl:
951 status = ScanOne<asdl_mode_e>(reader, out, hook);
952 break;
953
954 case lang_e::R:
955 status = ScanOne<R_mode_e>(reader, out, hook);
956 break;
957
958 default:
959 assert(0);
960 }
961
962 delete reader;
963
964 if (path == nullptr) {
965 ;
966 } else {
967 fclose(f);
968 }
969
970 if (status != 0) {
971 break;
972 }
973 }
974
975 return status;
976}
977
// Prints the usage message to stdout.
//
// NOTE(review): main() parses flags with the getopt() string "+hl:motw", so
// only -h -l -m -o -t -w are recognized.  The long options and the other
// flags listed below aren't parsed there.
void PrintHelp() {
  puts(R"(Usage: micro-syntax FLAGS* FILE*

Recognizes the syntax of each file, and prints it to stdout.

If there are no files, reads stdin.

Flags:
 -h --help This help

 -l --lang Language: py|cpp|shell|...
 -t Print tokens as TSV, instead of ANSI color
 -w Print HTML for the web

 -m More color, useful for debugging tokens

 -n --no-comments Omit comments
 -o --comments-only Only print comments
 -e --empty-strs Substitute string literals for empty strings
 --color on off always more

)");
}
1001
1002int main(int argc, char** argv) {
1003 Flags flag = {lang_e::PlainText};
1004
1005 // http://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
1006 // + means to be strict about flag parsing.
1007 int c;
1008 while ((c = getopt(argc, argv, "+hl:motw")) != -1) {
1009 switch (c) {
1010 case 'h':
1011 PrintHelp();
1012 return 0;
1013
1014 case 'l':
1015 if (strcmp(optarg, "cpp") == 0) {
1016 flag.lang = lang_e::Cpp;
1017
1018 } else if (strcmp(optarg, "py") == 0) {
1019 flag.lang = lang_e::Py;
1020
1021 } else if (strcmp(optarg, "shell") == 0) {
1022 flag.lang = lang_e::Shell;
1023
1024 } else if (strcmp(optarg, "asdl") == 0) {
1025 flag.lang = lang_e::Asdl;
1026
1027 } else if (strcmp(optarg, "R") == 0) {
1028 flag.lang = lang_e::R;
1029
1030 // TODO: implement all of these
1031 } else if (strcmp(optarg, "js") == 0) {
1032 flag.lang = lang_e::PlainText;
1033
1034 } else if (strcmp(optarg, "css") == 0) {
1035 flag.lang = lang_e::PlainText;
1036
1037 } else if (strcmp(optarg, "md") == 0) {
1038 flag.lang = lang_e::PlainText;
1039
1040 } else if (strcmp(optarg, "yaml") == 0) {
1041 flag.lang = lang_e::PlainText;
1042
1043 } else if (strcmp(optarg, "txt") == 0) {
1044 flag.lang = lang_e::PlainText;
1045
1046 } else if (strcmp(optarg, "other") == 0) {
1047 flag.lang = lang_e::PlainText;
1048
1049 } else {
1050 Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|txt, "
1051 "got %s",
1052 optarg);
1053 return 2;
1054 }
1055 break;
1056
1057 case 'm':
1058 flag.more_color = true;
1059 break;
1060
1061 case 'o':
1062 flag.comments_only = true;
1063 break;
1064
1065 case 't':
1066 flag.tsv = true;
1067 break;
1068
1069 case 'w':
1070 flag.web = true;
1071 break;
1072
1073 case '?': // getopt library will print error
1074 return 2;
1075
1076 default:
1077 abort(); // should never happen
1078 }
1079 }
1080
1081 int a = optind; // index into argv
1082 flag.argv = argv + a;
1083 flag.argc = argc - a;
1084
1085 std::vector<char*> files; // filename, or nullptr for stdin
1086 if (flag.argc != 0) {
1087 for (int i = 0; i < flag.argc; ++i) {
1088 files.push_back(flag.argv[i]);
1089 }
1090 } else {
1091 files.push_back(nullptr); // stands for stdin
1092 }
1093
1094 Printer* pr; // for each file
1095 OutputStream* out; // the entire stream
1096
1097 if (flag.tsv) {
1098 pr = new TsvPrinter();
1099 out = new NetStringOutput(pr);
1100 } else if (flag.web) {
1101 pr = new HtmlPrinter();
1102 out = new NetStringOutput(pr);
1103 } else {
1104 pr = new AnsiPrinter(flag);
1105 out = new AnsiOutput(pr);
1106 }
1107
1108 Hook* hook = nullptr;
1109 if (flag.lang == lang_e::Cpp) {
1110 hook = new CppHook();
1111 } else {
1112 hook = new Hook(); // default hook
1113 }
1114
1115 int status = ScanFiles(flag, files, out, hook);
1116
1117 delete hook;
1118 delete pr;
1119 delete out;
1120
1121 return status;
1122}