OILS / asdl / front_end.py

1"""front_end.py: Lexer and parser for the ASDL schema language."""
2from __future__ import print_function
3
4import re
5
6from asdl import ast
7from asdl.ast import (AST, Use, Module, TypeDecl, Constructor, Field, Sum,
8 SimpleSum, Product)
9from asdl.util import log
10
11_ = log
12
13_KEYWORDS = ['use', 'module', 'generate']
14
15_TOKENS = [
16 ('Keyword', ''),
17 ('Name', ''),
18
19 # For operators, the string matters
20 ('Equals', '='),
21 ('Comma', ','),
22 ('Question', '?'),
23 ('Pipe', '|'),
24 ('Asterisk', '*'),
25 ('LParen', '('),
26 ('RParen', ')'),
27 ('LBrace', '{'),
28 ('RBrace', '}'),
29 ('Percent', '%'),
30
31 # Oil addition for parameterized types.
32 ('LBracket', '['),
33 ('RBracket', ']'),
34
35 # - Start with Dict[string, bool].
36 # - List[string] is an alias for string*
37 #
38 # statically typed: Dict and List
39 # dynamically typed: dict and list
40]
41
42_TOKEN_STR = [name for name, _ in _TOKENS] # integer -> string like LParen
43_TOKEN_INT = {} # string like '(' -> integer
44
45
class TokenKind(object):
    """ASDL tokens.

    TokenKind.LBrace = 9, etc.
    """
    pass


for i, (name, val) in enumerate(_TOKENS):
    setattr(TokenKind, name, i)
    _TOKEN_INT[val] = i

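# Illustrative sketch, not part of the original module: how the dynamically
# generated TokenKind attributes line up with the lookup tables built above.
# _DemoTokenKind is a hypothetical name; the concrete values assume the
# _TOKENS order in this file.
def _DemoTokenKind():
    assert TokenKind.Keyword == 0 and TokenKind.Name == 1
    assert TokenKind.LParen == 7
    assert _TOKEN_INT['('] == TokenKind.LParen  # operator string -> integer
    assert _TOKEN_STR[TokenKind.LParen] == 'LParen'  # integer -> name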


class Token(object):

    def __init__(self, kind, value, lineno):
        self.kind = kind
        self.value = value
        self.lineno = lineno


class ASDLSyntaxError(Exception):

    def __init__(self, msg, lineno=None):
        self.msg = msg
        self.lineno = lineno or '<unknown>'

    def __str__(self):
        return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)


def _Tokenize(f):
    """Tokenize the given buffer.

    Yield Token objects.
    """
    for lineno, line in enumerate(f, 1):
        for m in re.finditer(r'\s*(\w+|--.*|#.*|.)', line.strip()):
            c = m.group(1)
            if c in _KEYWORDS:
                yield Token(TokenKind.Keyword, c, lineno)

            elif c[0].isalpha():
                yield Token(TokenKind.Name, c, lineno)

            elif c.startswith('--') or c.startswith('#'):
                # ASDL comments start with --
                # Added # comments like Python and shell
                break

            else:
                # Operators
                try:
                    op_kind = _TOKEN_INT[c]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
                yield Token(op_kind, c, lineno)

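# Usage sketch, added for illustration and not in the original file:
# _Tokenize only needs an iterable of lines, so a plain list stands in for a
# real schema file here.  _DemoTokenize is a hypothetical helper name.
def _DemoTokenize():
    for tok in _Tokenize(['foo = (int x, int y)']):
        # e.g. prints:  1 Name foo  /  1 Equals =  /  1 LParen (  ...
        print(tok.lineno, _TOKEN_STR[tok.kind], tok.value)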


def _SumIsSimple(variant_list):
    """Return True if a sum is simple.

    A sum is simple if its types have no fields, e.g. unaryop = Invert |
    Not | UAdd | USub
    """
    for t in variant_list:
        if t.fields or t.shared_type:
            return False
    return True

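# Illustrative sketch, not in the original file: 'unaryop = Invert | Not' is
# a simple sum because no variant has fields, while 'expr = Const(int i)' is
# not.  This uses the Constructor/Field signatures seen elsewhere in this
# module; _DemoSumIsSimple is a hypothetical name.
def _DemoSumIsSimple():
    simple = [Constructor('Invert', None, None), Constructor('Not', None, None)]
    compound = [Constructor('Const', None, [Field(ast.NamedType('int'), 'i')])]
    assert _SumIsSimple(simple)
    assert not _SumIsSimple(compound)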


_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and the transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option.  Hm, the
    # headers here still need type reflection.  Probably OK.
    'mirror_all_types:16',
]

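# Illustrative note, not in the original file: these options are attached to
# a sum type with a 'generate' list in the schema, e.g.
#
#     color = Red | Green
#       generate [integers, no_namespace_suffix]
#
# _parse_compound_type() below rejects any name that isn't in
# _CODE_GEN_OPTIONS.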


class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL. This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.
    """

    def __init__(self):
        self._tokenizer = None
        self.cur_token = None

    def parse(self, f):
        """Parse the ASDL in the file and return an AST with a Module root."""
        self._tokenizer = _Tokenize(f)
        self._advance()
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl : NAME (':' NAME) '=' compound_type
        module    : 'module' NAME '{' use* type_decl* '}'

        We added:
          : for code gen options
          use for imports

          alloc_members =
            List
          | Dict
          | Struct
            generate [bit_set]

          -- color::Red, not color_e::Red or color_i::Red
          color = Red | Green
            generate [integers, no_namespace_suffix]
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self.cur_token.value),
                self.cur_token.lineno)
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        defs = []
        while self.cur_token.kind == TokenKind.Name:
            typename = self._advance()
            self._match(TokenKind.Equals)
            type_ = self._parse_compound_type()
            defs.append(TypeDecl(typename, type_))

        self._match(TokenKind.RBrace)
        return Module(name, uses, defs)

    def _parse_use(self):
        """Use: 'use' NAME+ '{' NAME+ '}'.

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token
        """
        self._advance()  # past 'use'
        module_parts = []
        while self.cur_token.kind == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self.cur_token.kind == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            if self.cur_token.kind == TokenKind.RBrace:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        #print('MOD %s' % module_parts)
        return Use(module_parts, type_names)

    def _parse_compound_type(self):
        """Constructor : NAME fields? | NAME '%' NAME  # shared variant.

        compound_type : product
                      | constructor ('|' constructor)*
        """
        if self.cur_token.kind == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()
        else:
            # Otherwise it's a sum. Look for ConstructorId
            sumlist = []
            while True:
                cons_name = self._match(TokenKind.Name)

                shared_type = None
                fields = None
                if self.cur_token.kind == TokenKind.LParen:
                    fields = self._parse_fields()
                elif self.cur_token.kind == TokenKind.Percent:
                    self._advance()
                    shared_type = self._match(TokenKind.Name)
                else:
                    pass

                cons = Constructor(cons_name, shared_type, fields)
                sumlist.append(cons)

                if self.cur_token.kind != TokenKind.Pipe:
                    break
                self._advance()
            generate = self._parse_optional_generate()

            # Additional validation
            if generate is not None:
                for g in generate:
                    if g not in _CODE_GEN_OPTIONS:
                        raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                              self.cur_token.lineno)

            if _SumIsSimple(sumlist):
                return SimpleSum(sumlist, generate)
            else:
                return Sum(sumlist, generate)

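    # Illustrative note, not in the original file: a shared variant uses the
    # '%' form from the docstring above, e.g. (with hypothetical names)
    #
    #     expr = Const(int i) | Var %NameType
    #
    # where NameType is a type declared elsewhere that participates in this
    # sum; _parse_compound_type records it as Constructor('Var', 'NameType',
    # None).
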
    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )* ']'

        type_expr : NAME type_params? ('?' | '*')?  # allow one suffix

        NAME is validated against Optional, List, Dict afterward
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self.cur_token.kind == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self.cur_token.kind == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self.cur_token.lineno)
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self.cur_token.lineno)
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self.cur_token.lineno)

        if len(children):
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self.cur_token.kind == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self.cur_token.kind == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

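    # Illustrative note, not in the original file: what _parse_type_expr
    # returns for a few inputs.
    #
    #     string             -> ast.NamedType('string')
    #     string*            -> ast.ParameterizedType('List', [NamedType('string')])
    #     Token?             -> ast.ParameterizedType('Optional', [NamedType('Token')])
    #     Dict[string, int]  -> ast.ParameterizedType('Dict', [NamedType('string'),
    #                                                          NamedType('int')])
    #     str                -> ast.NamedType('string')  # 'str' is normalized
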
    def _parse_fields(self):
        """fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields : '(' fields_inner? ')'

        Name Quantifier? should be changed to typename.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self.cur_token.kind == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """list_inner: NAME ( ',' NAME )* ','?

        list : '[' list_inner? ']'
        """
        generate = []
        self._match(TokenKind.LBracket)
        while self.cur_token.kind == TokenKind.Name:
            name = self._match(TokenKind.Name)

            generate.append(name)

            if self.cur_token.kind == TokenKind.RBracket:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return generate

    def _parse_optional_generate(self):
        """Attributes = 'generate' list."""
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields attributes?"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token; read next token into self.cur_token."""
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind (kind can
          be a tuple, in which case the kind must match one of its members).
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind, or a tuple of TokenKind
        """
        if self.cur_token.kind == kind:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(_TOKEN_STR[kind],
                                                   self.cur_token.value),
                self.cur_token.lineno)

    def _at_keyword(self, keyword):
        return (self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)

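# Usage sketch, added for illustration and not in the original file: parse a
# small in-memory schema.  ASDLParser.parse() only needs an iterable of
# lines, so a list of strings works; _DemoParse is a hypothetical name.
def _DemoParse():
    lines = [
        'module demo {',
        '  point = (int x, int y)',
        '  color = Red | Green',
        '}',
    ]
    module = ASDLParser().parse(lines)
    # module is an ast.Module; its type declarations are in module.dfns
    print([d.name for d in module.dfns])  # ['point', 'color']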

_PRIMITIVE_TYPES = [
    'string',
    'int',
    'uint16',  # used for Token length - should we generalize this?
    'BigInt',
    'float',
    'bool',

    # 'any' is used:
    # - for value.Obj in the Oil expression evaluator.  We're not doing any
    #   dynamic or static checking now.
    'any',
]


def _ResolveType(typ, type_lookup):
    # type: (AST, dict) -> None
    """Recursively attach a 'resolved' field to AST nodes."""
    if isinstance(typ, ast.NamedType):
        if typ.name not in _PRIMITIVE_TYPES:
            ast_node = type_lookup.get(typ.name)
            if ast_node is None:
                raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
            typ.resolved = ast_node

    elif isinstance(typ, ast.ParameterizedType):
        for child in typ.children:
            _ResolveType(child, type_lookup)

        if typ.type_name == 'Optional':
            child = typ.children[0]
            if isinstance(child, ast.NamedType):
                if child.name in _PRIMITIVE_TYPES and child.name != 'string':
                    raise ASDLSyntaxError(
                        'Optional primitive type {} not allowed'.format(
                            child.name))

                if child.resolved and isinstance(child.resolved,
                                                 ast.SimpleSum):
                    raise ASDLSyntaxError(
                        'Optional simple sum type {} not allowed'.format(
                            child.name))

    else:
        raise AssertionError()

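# Illustrative note, not in the original file: concrete outcomes of the
# Optional checks above.
#
#     string?  -> allowed
#     Token?   -> allowed for compound types (Token defined in the schema or
#                 in app_types)
#     int?     -> ASDLSyntaxError: Optional primitive type int not allowed
#     color?   -> ASDLSyntaxError if color resolved to a SimpleSum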


def _ResolveFields(field_ast_nodes, type_lookup):
    """
    Args:
      type_lookup: Populated by name resolution
    """
    for field in field_ast_nodes:
        _ResolveType(field.typ, type_lookup)


def _ResolveModule(module, app_types):
    """Name resolution for NamedType."""
    # Types that fields are declared with: int, id, word_part, etc.
    # Fields are NOT declared with Constructor names.
    type_lookup = dict(app_types)

    # Note: we don't actually load the type, and instead leave that to MyPy /
    # C++.  A consequence of this is TypeNameHeuristic().
    for u in module.uses:
        for type_name in u.type_names:
            type_lookup[type_name] = u  # type: ast.Use()

    # NOTE: We need two passes because types can be mutually recursive, e.g.
    # asdl/arith.asdl.

    # First pass: collect declared types and make entries for them.
    for d in module.dfns:
        type_lookup[d.name] = d.value

    # Second pass: add NamedType.resolved field
    for d in module.dfns:
        ast_node = d.value
        if isinstance(ast_node, ast.Product):
            #log('fields %s', ast_node.fields)
            _ResolveFields(ast_node.fields, type_lookup)

        elif isinstance(ast_node, ast.Sum):
            for cons in ast_node.types:
                _ResolveFields(cons.fields, type_lookup)

        else:
            raise AssertionError(ast_node)


def LoadSchema(f, app_types, verbose=False):
    """Returns an AST for the schema."""
    p = ASDLParser()
    schema_ast = p.parse(f)
    if verbose:
        import sys
        schema_ast.Print(sys.stdout, 0)

    # Make sure all the names are valid
    _ResolveModule(schema_ast, app_types)
    return schema_ast
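

# Usage sketch, added for illustration and not in the original file: load and
# resolve a schema from disk.  app_types maps names the schema does not define
# itself to application-specific types; an empty dict is fine for a
# self-contained schema.  _DemoLoadSchema and 'demo.asdl' are hypothetical.
def _DemoLoadSchema():
    with open('demo.asdl') as f:
        schema_ast = LoadSchema(f, {}, verbose=False)
    return schema_ast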