| 1 | #ifndef MICRO_SYNTAX_H
 | 
| 2 | #define MICRO_SYNTAX_H
 | 
| 3 | 
 | 
| 4 | #include <assert.h>
 | 
| 5 | #include <string.h>  // strlen()
 | 
| 6 | 
 | 
| 7 | #include <vector>
 | 
| 8 | 
 | 
// Token kinds produced by the per-language matchers in this file.
//
// The numeric values are implicit, so the order of enumerators must not be
// changed.
enum class Id {
  // Common to nearly all languages
  Comm,
  MaybeComment,  // for shell, resolved in a fix-up pass

  WS,

  Name,  // Keyword or Identifier
  Str,   // "" and Python r""
         // '' and Python r''
         // ''' """
         // body of here docs

  Other,  // any other text
  Unknown,

  // C++
  DelimStrBegin,  // for C++ R"zzz(hello)zzz"
  DelimStrEnd,
  Re2c,  // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure. Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // Shell
  HereBegin,
  HereEnd,

  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  // StartLine,

  // These are special zero-width tokens for Python
  // Indent,
  // Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
 | 
| 53 | 
 | 
| 54 | struct Token {
 | 
| 55 |   Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
 | 
| 56 |   }
 | 
| 57 |   Token(Id id, int end_col)
 | 
| 58 |       : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
 | 
| 59 |   }
 | 
| 60 | 
 | 
| 61 |   Id id;
 | 
| 62 |   int end_col;         // offset from char* line
 | 
| 63 |   int submatch_start;  // ditto
 | 
| 64 |   int submatch_end;    // ditto
 | 
| 65 | };
 | 
| 66 | 
 | 
// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...

// Per-line lexer state: the line being scanned, the current scan position,
// and the current mode.  T is a mode enum that must have an `Outer` member.
template <typename T>
class Lexer {
 public:
  // Starts scanning at the beginning of `line`, in mode T::Outer.  The lexer
  // only reads through the pointer, so a const line is accepted.
  Lexer(const char* line)
      : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Points the lexer at a new line and rewinds to its start.  line_mode is
  // deliberately left alone, so a mode entered on one line (e.g. inside a
  // multi-line string) carries over to the next.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
 | 
| 84 | 
 | 
| 85 | template <typename T>
 | 
| 86 | class Matcher {
 | 
| 87 |  public:
 | 
| 88 |   // Returns whether EOL was hit.  Mutates lexer state, and fills in tok out
 | 
| 89 |   // param.
 | 
| 90 |   bool Match(Lexer<T>* lexer, Token* tok);
 | 
| 91 | };
 | 
| 92 | 
 | 
// Macros for semantic actions
//
// They expect `tok` (token out param) and `lexer` to be in scope, and are
// meant to be expanded inside the `while (true)` loop around a re2c block.

// Sets the token kind and leaves the matching loop.  The trailing `break`
// must stay bare (not wrapped in do/while), because it has to exit the
// enclosing loop.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Like TOK(), but also switches the lexer to mode `m`.
#define TOK_MODE(k, m)    \
  tok->id = (k);          \
  lexer->line_mode = (m); \
  break;

// Records the span [s, e) as offsets from the start of the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                             \
  do {                                                             \
    tok->submatch_start = static_cast<int>((s) - lexer->line_);    \
    tok->submatch_end = static_cast<int>((e) - lexer->line_);      \
  } while (0)
 | 
| 108 | 
 | 
// Regex definitions shared between languages
//
// This block holds only re2c configuration and named definitions; the rules
// that use them are in the Match() specializations below.

/*!re2c
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  whitespace = [ \t\r\n]*;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment        = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
 | 
| 141 | 
 | 
// Lexer modes for plain text.  There is only one.
enum class text_mode_e {
  Outer,  // default
};
 | 
| 145 | 
 | 
| 146 | // Returns whether EOL was hit
 | 
| 147 | template <>
 | 
| 148 | bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
 | 
| 149 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 150 | 
 | 
| 151 |   while (true) {
 | 
| 152 |     /*!re2c
 | 
| 153 |       nul                    { return true; }
 | 
| 154 | 
 | 
| 155 |                              // whitespace at start of line
 | 
| 156 |       whitespace             { TOK(Id::WS); }
 | 
| 157 | 
 | 
| 158 |                              // This rule consumes trailing whitespace, but
 | 
| 159 |                              // it's OK.  We're counting significant lines, not
 | 
| 160 |                              // highlighting.
 | 
| 161 |       [^\x00]+               { TOK(Id::Other); }
 | 
| 162 | 
 | 
| 163 |       *                      { TOK(Id::Other); }
 | 
| 164 | 
 | 
| 165 |     */
 | 
| 166 |   }
 | 
| 167 | 
 | 
| 168 |   tok->end_col = p - lexer->line_;
 | 
| 169 |   lexer->p_current = p;
 | 
| 170 |   return false;
 | 
| 171 | }
 | 
| 172 | 
 | 
// Lexer modes for ASDL.  There is only one.
enum class asdl_mode_e {
  Outer,
};
 | 
| 176 | 
 | 
| 177 | // Returns whether EOL was hit
 | 
| 178 | template <>
 | 
| 179 | bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
 | 
| 180 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 181 | 
 | 
| 182 |   switch (lexer->line_mode) {
 | 
| 183 |   case asdl_mode_e::Outer:
 | 
| 184 |     while (true) {
 | 
| 185 |       /*!re2c
 | 
| 186 |         nul                    { return true; }
 | 
| 187 | 
 | 
| 188 |         whitespace             { TOK(Id::WS); }
 | 
| 189 | 
 | 
| 190 |         identifier             { TOK(Id::Name); }
 | 
| 191 | 
 | 
| 192 |         pound_comment          { TOK(Id::Comm); }
 | 
| 193 | 
 | 
| 194 |         // Not the start of a comment, identifier
 | 
| 195 |         [^\x00#_a-zA-Z]+       { TOK(Id::Other); }
 | 
| 196 | 
 | 
| 197 |         // e.g. unclosed quote like "foo
 | 
| 198 |         *                      { TOK(Id::Unknown); }
 | 
| 199 | 
 | 
| 200 |       */
 | 
| 201 |     }
 | 
| 202 |     break;
 | 
| 203 |   }
 | 
| 204 | 
 | 
| 205 |   tok->end_col = p - lexer->line_;
 | 
| 206 |   lexer->p_current = p;
 | 
| 207 |   return false;
 | 
| 208 | }
 | 
| 209 | 
 | 
// Lexer modes for Python.  The two Multi* modes persist across lines until
// the matching closing triple quote is seen.
enum class py_mode_e {
  Outer,    // default
  MultiSQ,  // inside '''
  MultiDQ,  // inside """
};
 | 
| 215 | 
 | 
| 216 | // Returns whether EOL was hit
 | 
| 217 | template <>
 | 
| 218 | bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
 | 
| 219 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 220 |   const char* YYMARKER = p;
 | 
| 221 | 
 | 
| 222 |   switch (lexer->line_mode) {
 | 
| 223 |   case py_mode_e::Outer:
 | 
| 224 |     while (true) {
 | 
| 225 |       /*!re2c
 | 
| 226 |         nul                    { return true; }
 | 
| 227 | 
 | 
| 228 |         whitespace             { TOK(Id::WS); }
 | 
| 229 | 
 | 
| 230 |         identifier             { TOK(Id::Name); }
 | 
| 231 | 
 | 
| 232 |         [r]? sq_string         { TOK(Id::Str); }
 | 
| 233 |         [r]? dq_string         { TOK(Id::Str); }
 | 
| 234 | 
 | 
| 235 |         // optional raw prefix
 | 
| 236 |         [r]? triple_sq         { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
 | 
| 237 |         [r]? triple_dq         { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }
 | 
| 238 | 
 | 
| 239 |         pound_comment          { TOK(Id::Comm); }
 | 
| 240 | 
 | 
| 241 |         // Not the start of a string, comment, identifier
 | 
| 242 |         [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }
 | 
| 243 | 
 | 
| 244 |         // e.g. unclosed quote like "foo
 | 
| 245 |         *                      { TOK(Id::Unknown); }
 | 
| 246 | 
 | 
| 247 |       */
 | 
| 248 |     }
 | 
| 249 |     break;
 | 
| 250 | 
 | 
| 251 |   case py_mode_e::MultiSQ:
 | 
| 252 |     while (true) {
 | 
| 253 |       /*!re2c
 | 
| 254 |         nul       { return true; }
 | 
| 255 | 
 | 
| 256 |         triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }
 | 
| 257 | 
 | 
| 258 |         [^\x00']* { TOK(Id::Str); }
 | 
| 259 | 
 | 
| 260 |         *         { TOK(Id::Str); }
 | 
| 261 | 
 | 
| 262 |       */
 | 
| 263 |     }
 | 
| 264 |     break;
 | 
| 265 | 
 | 
| 266 |   case py_mode_e::MultiDQ:
 | 
| 267 |     while (true) {
 | 
| 268 |       /*!re2c
 | 
| 269 |         nul       { return true; }
 | 
| 270 | 
 | 
| 271 |         triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }
 | 
| 272 | 
 | 
| 273 |         [^\x00"]* { TOK(Id::Str); }
 | 
| 274 | 
 | 
| 275 |         *         { TOK(Id::Str); }
 | 
| 276 | 
 | 
| 277 |       */
 | 
| 278 |     }
 | 
| 279 |     break;
 | 
| 280 |   }
 | 
| 281 | 
 | 
| 282 |   tok->end_col = p - lexer->line_;
 | 
| 283 |   lexer->p_current = p;
 | 
| 284 |   return false;
 | 
| 285 | }
 | 
| 286 | 
 | 
// Lexer modes for C++.  The non-Outer modes persist across lines until their
// terminator is seen.
enum class cpp_mode_e {
  Outer,     // default
  Comm,      // inside a C-style block comment
  DelimStr,  // R"zz(string literal)zz"
  Re2c,      // inside a re2c block comment
};
 | 
| 293 | 
 | 
| 294 | // Returns whether EOL was hit
 | 
| 295 | template <>
 | 
| 296 | bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
 | 
| 297 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 298 |   const char* YYMARKER = p;
 | 
| 299 |   const char *s, *e;  // submatch extraction
 | 
| 300 | 
 | 
| 301 |   // Autogenerated tag variables used by the lexer to track tag values.
 | 
| 302 |   /*!stags:re2c format = 'const char *@@;\n'; */
 | 
| 303 | 
 | 
| 304 |   switch (lexer->line_mode) {
 | 
| 305 |   case cpp_mode_e::Outer:
 | 
| 306 | 
 | 
| 307 |     while (true) {
 | 
| 308 |       /*!re2c
 | 
| 309 |         nul                     { return true; }
 | 
| 310 | 
 | 
| 311 |         whitespace              { TOK(Id::WS); }
 | 
| 312 | 
 | 
| 313 |         "{"                     { TOK(Id::LBrace); }
 | 
| 314 |         "}"                     { TOK(Id::RBrace); }
 | 
| 315 | 
 | 
| 316 |         identifier              { TOK(Id::Name); }
 | 
| 317 | 
 | 
| 318 |         // approximation for C++ char literals
 | 
| 319 |         sq_string               { TOK(Id::Str); }
 | 
| 320 |         dq_string               { TOK(Id::Str); }
 | 
| 321 | 
 | 
| 322 |         // Not the start of a string, comment, identifier
 | 
| 323 |         [^\x00"'/_a-zA-Z{}]+    { TOK(Id::Other); }
 | 
| 324 | 
 | 
| 325 |         "//" not_nul*           { TOK(Id::Comm); }
 | 
| 326 | 
 | 
| 327 |         // Treat re2c as preprocessor block
 | 
| 328 |         "/" "*!re2c"            { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }
 | 
| 329 | 
 | 
| 330 |         "/" "*"                 { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }
 | 
| 331 | 
 | 
| 332 |         // Not sure what the rules are for R"zz(hello)zz".  Make it similar to
 | 
| 333 |         // here docs.
 | 
| 334 |         cpp_delim_str = [_a-zA-Z]*;
 | 
| 335 | 
 | 
| 336 |         "R" ["] @s cpp_delim_str @e "(" {
 | 
| 337 |           SUBMATCH(s, e);
 | 
| 338 |           TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
 | 
| 339 |         }
 | 
| 340 | 
 | 
| 341 |         // e.g. unclosed quote like "foo
 | 
| 342 |         *                       { TOK(Id::Unknown); }
 | 
| 343 | 
 | 
| 344 |       */
 | 
| 345 |     }
 | 
| 346 |     break;
 | 
| 347 | 
 | 
| 348 |   case cpp_mode_e::Comm:
 | 
| 349 |     // Search until next */
 | 
| 350 |     while (true) {
 | 
| 351 |       /*!re2c
 | 
| 352 |         nul       { return true; }
 | 
| 353 | 
 | 
| 354 |         "*" "/"   { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }
 | 
| 355 | 
 | 
| 356 |         [^\x00*]* { TOK(Id::Comm); }
 | 
| 357 | 
 | 
| 358 |         *         { TOK(Id::Comm); }
 | 
| 359 | 
 | 
| 360 |       */
 | 
| 361 |     }
 | 
| 362 |     break;
 | 
| 363 | 
 | 
| 364 |   case cpp_mode_e::Re2c:
 | 
| 365 |     // Search until next */
 | 
| 366 |     while (true) {
 | 
| 367 |       /*!re2c
 | 
| 368 |         nul       { return true; }
 | 
| 369 | 
 | 
| 370 |         "*" "/"   { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }
 | 
| 371 | 
 | 
| 372 |         [^\x00*]* { TOK(Id::Re2c); }
 | 
| 373 | 
 | 
| 374 |         *         { TOK(Id::Re2c); }
 | 
| 375 | 
 | 
| 376 |       */
 | 
| 377 |     }
 | 
| 378 |     break;
 | 
| 379 | 
 | 
| 380 |   case cpp_mode_e::DelimStr:
 | 
| 381 |     // Search until next */
 | 
| 382 |     while (true) {
 | 
| 383 |       /*!re2c
 | 
| 384 |         nul       { return true; }
 | 
| 385 | 
 | 
| 386 |         ")" @s cpp_delim_str @e ["] {
 | 
| 387 |           SUBMATCH(s, e);
 | 
| 388 |           TOK(Id::DelimStrEnd);
 | 
| 389 | 
 | 
| 390 |           // Caller is responsible for checking the extracted delimiter, and
 | 
| 391 |           // setting mode back to Cpp::Outer!
 | 
| 392 |         }
 | 
| 393 | 
 | 
| 394 |         [^\x00)]* { TOK(Id::Str); }
 | 
| 395 | 
 | 
| 396 |         *         { TOK(Id::Str); }
 | 
| 397 | 
 | 
| 398 |       */
 | 
| 399 |     }
 | 
| 400 |     break;
 | 
| 401 |   }
 | 
| 402 | 
 | 
| 403 |   tok->end_col = p - lexer->line_;
 | 
| 404 |   lexer->p_current = p;
 | 
| 405 |   return false;
 | 
| 406 | }
 | 
| 407 | 
 | 
| 408 | class Hook {
 | 
| 409 |  public:
 | 
| 410 |   // Return true if this is a preprocessor line, and fill in tokens
 | 
| 411 |   // Caller should check last token for whether there is a continuation line.
 | 
| 412 |   virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
 | 
| 413 |     ;
 | 
| 414 |   }
 | 
| 415 |   virtual ~Hook() {
 | 
| 416 |   }
 | 
| 417 | };
 | 
| 418 | 
 | 
// Lexer modes for C preprocessor lines.  There is only one.
enum class pp_mode_e {
  Outer,
};
 | 
| 422 | 
 | 
| 423 | // Returns whether EOL was hit
 | 
| 424 | template <>
 | 
| 425 | bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
 | 
| 426 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 427 |   const char* YYMARKER = p;
 | 
| 428 | 
 | 
| 429 |   switch (lexer->line_mode) {
 | 
| 430 |   case pp_mode_e::Outer:
 | 
| 431 |     while (true) {
 | 
| 432 |       /*!re2c
 | 
| 433 |         nul                    { return true; }
 | 
| 434 | 
 | 
| 435 |                                // Resolved in fix-up pass
 | 
| 436 |                                // #include #define etc. only valid at the
 | 
| 437 |                                // beginning
 | 
| 438 |         [ \t]* "#" [a-z]+      { TOK(Id::MaybePreproc); }
 | 
| 439 | 
 | 
| 440 |                                // C-style comments can end these lines
 | 
| 441 |         "//" not_nul*          { TOK(Id::Comm); }
 | 
| 442 | 
 | 
| 443 |         [\\] [\n]              { TOK(Id::LineCont); }
 | 
| 444 | 
 | 
| 445 |                                // A line could be all whitespace, then \ at the
 | 
| 446 |                                // end.  And it's not significant
 | 
| 447 |         whitespace             { TOK(Id::WS); }
 | 
| 448 | 
 | 
| 449 |                                // Not the start of a command, comment, or line
 | 
| 450 |                                // continuation
 | 
| 451 |         [^\x00#/\\]+           { TOK(Id::PreprocOther); }
 | 
| 452 | 
 | 
| 453 |         *                      { TOK(Id::PreprocOther); }
 | 
| 454 | 
 | 
| 455 |       */
 | 
| 456 |     }
 | 
| 457 |     break;
 | 
| 458 |   }
 | 
| 459 | 
 | 
| 460 |   tok->end_col = p - lexer->line_;
 | 
| 461 |   lexer->p_current = p;
 | 
| 462 |   return false;
 | 
| 463 | }
 | 
| 464 | 
 | 
| 465 | class CppHook : public Hook {
 | 
| 466 |  public:
 | 
| 467 |   virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
 | 
| 468 | };
 | 
| 469 | 
 | 
// Lexer modes for R.  The quote modes persist across lines until the closing
// quote is seen.
enum class R_mode_e {
  Outer,  // default

  SQ,  // inside multi-line ''
  DQ,  // inside multi-line ""
};
 | 
| 476 | 
 | 
| 477 | // Returns whether EOL was hit
 | 
| 478 | template <>
 | 
| 479 | bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
 | 
| 480 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 481 |   const char* YYMARKER = p;
 | 
| 482 | 
 | 
| 483 |   switch (lexer->line_mode) {
 | 
| 484 |   case R_mode_e::Outer:
 | 
| 485 |     while (true) {
 | 
| 486 |       /*!re2c
 | 
| 487 |         nul                    { return true; }
 | 
| 488 | 
 | 
| 489 |         whitespace             { TOK(Id::WS); }
 | 
| 490 | 
 | 
| 491 |         pound_comment          { TOK(Id::Comm); }
 | 
| 492 | 
 | 
| 493 |         identifier             { TOK(Id::Name); }
 | 
| 494 | 
 | 
| 495 |         // Not the start of a string, escaped, comment, identifier
 | 
| 496 |         [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }
 | 
| 497 | 
 | 
| 498 |         [']                    { TOK_MODE(Id::Str, R_mode_e::SQ); }
 | 
| 499 |         ["]                    { TOK_MODE(Id::Str, R_mode_e::DQ); }
 | 
| 500 | 
 | 
| 501 |         *                      { TOK(Id::Unknown); }
 | 
| 502 | 
 | 
| 503 |       */
 | 
| 504 |     }
 | 
| 505 |     break;
 | 
| 506 | 
 | 
| 507 |   case R_mode_e::SQ:
 | 
| 508 |     while (true) {
 | 
| 509 |       /*!re2c
 | 
| 510 |         nul       { return true; }
 | 
| 511 | 
 | 
| 512 |         [']       { TOK_MODE(Id::Str, R_mode_e::Outer); }
 | 
| 513 | 
 | 
| 514 |         sq_middle { TOK(Id::Str); }
 | 
| 515 | 
 | 
| 516 |         *         { TOK(Id::Str); }
 | 
| 517 | 
 | 
| 518 |       */
 | 
| 519 |     }
 | 
| 520 |     break;
 | 
| 521 | 
 | 
| 522 |   case R_mode_e::DQ:
 | 
| 523 |     while (true) {
 | 
| 524 |       /*!re2c
 | 
| 525 |         nul       { return true; }
 | 
| 526 | 
 | 
| 527 |         ["]       { TOK_MODE(Id::Str, R_mode_e::Outer); }
 | 
| 528 | 
 | 
| 529 |         dq_middle { TOK(Id::Str); }
 | 
| 530 | 
 | 
| 531 |         *         { TOK(Id::Str); }
 | 
| 532 | 
 | 
| 533 |       */
 | 
| 534 |     }
 | 
| 535 |     break;
 | 
| 536 |   }
 | 
| 537 | 
 | 
| 538 |   tok->end_col = p - lexer->line_;
 | 
| 539 |   lexer->p_current = p;
 | 
| 540 |   return false;
 | 
| 541 | }
 | 
| 542 | 
 | 
// Problem with shell: nested double quotes!!!
// We probably discourage this in YSH

// Lexer modes for shell.  The quote modes persist across lines until the
// closing quote is seen.
enum class sh_mode_e {
  Outer,  // default

  SQ,        // inside multi-line ''
  DollarSQ,  // inside multi-line $''
  DQ,        // inside multi-line ""

  // We could have a separate thing for this
  YshSQ,  // inside '''
  YshDQ,  // inside """
  YshJ,   // inside j"""
};
 | 
| 558 | 
 | 
| 559 | // Returns whether EOL was hit
 | 
| 560 | 
 | 
| 561 | // Submatch docs:
 | 
| 562 | //   https://re2c.org/manual/manual_c.html#submatch-extraction
 | 
| 563 | 
 | 
| 564 | template <>
 | 
| 565 | bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
 | 
| 566 |   const char* p = lexer->p_current;  // mutated by re2c
 | 
| 567 |   const char* YYMARKER = p;
 | 
| 568 |   const char *s, *e;  // submatch extraction
 | 
| 569 | 
 | 
| 570 |   // Autogenerated tag variables used by the lexer to track tag values.
 | 
| 571 |   /*!stags:re2c format = 'const char *@@;\n'; */
 | 
| 572 | 
 | 
| 573 |   switch (lexer->line_mode) {
 | 
| 574 |   case sh_mode_e::Outer:
 | 
| 575 |     while (true) {
 | 
| 576 |       /*!re2c
 | 
| 577 |         nul                    { return true; }
 | 
| 578 | 
 | 
| 579 |         whitespace             { TOK(Id::WS); }
 | 
| 580 | 
 | 
| 581 |                                // Resolved in fix-up pass
 | 
| 582 |         pound_comment          { TOK(Id::MaybeComment); }
 | 
| 583 | 
 | 
| 584 |         // not that relevant for shell
 | 
| 585 |         identifier             { TOK(Id::Name); }
 | 
| 586 | 
 | 
| 587 |         // Not the start of a string, escaped, comment, identifier, here doc
 | 
| 588 |         [^\x00"'$#_a-zA-Z\\<]+  { TOK(Id::Other); }
 | 
| 589 | 
 | 
| 590 |                                // echo is like a string
 | 
| 591 |         "\\" .                 { TOK(Id::Str); }
 | 
| 592 | 
 | 
| 593 |         [']                    { TOK_MODE(Id::Str, sh_mode_e::SQ); }
 | 
| 594 |         ["]                    { TOK_MODE(Id::Str, sh_mode_e::DQ); }
 | 
| 595 |         "$'"                   { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
 | 
| 596 | 
 | 
| 597 |         // <<- is another syntax
 | 
| 598 |         here_op    = "<<" [-]? [ \t]*;
 | 
| 599 |         h_delim    = [_a-zA-Z][_a-zA-Z0-9]*;
 | 
| 600 | 
 | 
| 601 |         // unquoted or quoted
 | 
| 602 |         here_op      @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }
 | 
| 603 |         here_op [']  @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
 | 
| 604 |         here_op ["]  @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
 | 
| 605 |         here_op "\\" @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }
 | 
| 606 | 
 | 
| 607 |                                        // NOT Unknown, as in Python
 | 
| 608 |         *                              { TOK(Id::Other); }
 | 
| 609 | 
 | 
| 610 |       */
 | 
| 611 |     }
 | 
| 612 |     break;
 | 
| 613 | 
 | 
| 614 |   case sh_mode_e::SQ:
 | 
| 615 |     // Search until next ' unconditionally
 | 
| 616 |     while (true) {
 | 
| 617 |       /*!re2c
 | 
| 618 |         nul       { return true; }
 | 
| 619 | 
 | 
| 620 |         [']       { TOK_MODE(Id::Str, sh_mode_e::Outer); }
 | 
| 621 | 
 | 
| 622 |         [^\x00']* { TOK(Id::Str); }
 | 
| 623 | 
 | 
| 624 |         *         { TOK(Id::Str); }
 | 
| 625 | 
 | 
| 626 |       */
 | 
| 627 |     }
 | 
| 628 |     break;
 | 
| 629 | 
 | 
| 630 |   case sh_mode_e::DQ:
 | 
| 631 |     // Search until next " that's not preceded by "
 | 
| 632 |     while (true) {
 | 
| 633 |       /*!re2c
 | 
| 634 |         nul       { return true; }
 | 
| 635 | 
 | 
| 636 |         ["]       { TOK_MODE(Id::Str, sh_mode_e::Outer); }
 | 
| 637 | 
 | 
| 638 |         dq_middle { TOK(Id::Str); }
 | 
| 639 | 
 | 
| 640 |         *         { TOK(Id::Str); }
 | 
| 641 | 
 | 
| 642 |       */
 | 
| 643 |     }
 | 
| 644 |     break;
 | 
| 645 | 
 | 
| 646 |   case sh_mode_e::DollarSQ:
 | 
| 647 |     // Search until next ' that's not preceded by "
 | 
| 648 |     while (true) {
 | 
| 649 |       /*!re2c
 | 
| 650 |         nul       { return true; }
 | 
| 651 | 
 | 
| 652 |         [']       { TOK_MODE(Id::Str, sh_mode_e::Outer); }
 | 
| 653 | 
 | 
| 654 |         sq_middle { TOK(Id::Str); }
 | 
| 655 | 
 | 
| 656 |         *         { TOK(Id::Str); }
 | 
| 657 | 
 | 
| 658 |       */
 | 
| 659 |     }
 | 
| 660 |     break;
 | 
| 661 |   case sh_mode_e::YshSQ:
 | 
| 662 |   case sh_mode_e::YshDQ:
 | 
| 663 |   case sh_mode_e::YshJ:
 | 
| 664 |     assert(0);
 | 
| 665 |   }
 | 
| 666 | 
 | 
| 667 |   tok->end_col = p - lexer->line_;
 | 
| 668 |   lexer->p_current = p;
 | 
| 669 |   return false;
 | 
| 670 | }
 | 
| 671 | 
 | 
// TODO:
// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
//   - same as C++ raw string, I think
//   - similar to here docs, but less complex
//
// Inherent problems with "micro segmentation":
//
// - Nested double quotes in shell.  echo "hi ${name:-"default"}"
//   - This means that lexing is **dependent on** parsing: does the second
//     double quote **close** the first one, or does it start a nested string?
//   - lexing is non-recursive, parsing is recursive
//
// - Shell comments depend on operator chars
//     echo one # comment
//     echo $(( 16#ff ))
 | 
| 687 | 
 | 
| 688 | #endif  // MICRO_SYNTAX_H
 |