1 | #-------------------------------------------------------------------------------
|
2 | # Parser for ASDL [1] definition files. Reads in an ASDL description and parses
|
3 | # it into an AST that describes it.
|
4 | #
|
5 | # The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support
|
6 | # modules and attributes after a product. Words starting with Capital letters
|
7 | # are terminals. Literal tokens are in "double quotes". Others are
|
8 | # non-terminals. Id is either TokenId or ConstructorId.
|
9 | #
|
10 | # module ::= "module" Id "{" [definitions] "}"
|
11 | # definitions ::= { TypeId "=" type }
|
12 | # type ::= product | sum
|
13 | # product ::= fields ["attributes" fields]
|
14 | # fields ::= "(" { field, "," } field ")"
|
15 | # field ::= TypeId ["?" | "*"] [Id]
|
16 | # sum ::= constructor { "|" constructor } ["attributes" fields]
|
17 | # constructor ::= ConstructorId [fields]
|
18 | #
|
19 | # [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See
|
20 | # http://asdl.sourceforge.net/
|
21 | #-------------------------------------------------------------------------------
|
22 | from __future__ import print_function
|
23 |
|
24 | import cStringIO
|
25 | import re
|
26 |
|
27 | __all__ = [
|
28 | 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
|
29 | 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check',
|
30 | 'is_simple']
|
31 |
|
32 |
|
33 | # PATCH: Moved this function from asdl_c.py.
|
34 | def is_simple(sum):
|
35 | """Return True if a sum is a simple.
|
36 |
|
37 | A sum is simple if its types have no fields, e.g.
|
38 | unaryop = Invert | Not | UAdd | USub
|
39 | """
|
40 | for t in sum.types:
|
41 | if t.fields:
|
42 | return False
|
43 | return True
|
44 |
|
45 |
|
46 | class StrType(object):
|
47 | def __repr__(self):
|
48 | return '<Str>'
|
49 |
|
50 | class IntType(object):
|
51 | def __repr__(self):
|
52 | return '<Int>'
|
53 |
|
54 | class BoolType(object):
|
55 | def __repr__(self):
|
56 | return '<Bool>'
|
57 |
|
58 |
|
59 | class ArrayType(object):
|
60 | def __init__(self, desc):
|
61 | self.desc = desc
|
62 |
|
63 | def __repr__(self):
|
64 | return '<Array %s>' % self.desc
|
65 |
|
66 | class MaybeType(object):
|
67 | def __init__(self, desc):
|
68 | self.desc = desc # another descriptor
|
69 |
|
70 | def __repr__(self):
|
71 | return '<Maybe %s>' % self.desc
|
72 |
|
73 | class UserType(object):
|
74 | def __init__(self, typ):
|
75 | assert isinstance(typ, type), typ
|
76 | self.typ = typ
|
77 |
|
78 | def __repr__(self):
|
79 | return '<UserType %s>' % self.typ
|
80 |
|
81 |
|
82 | class TypeLookup(object):
|
83 | """Look up types by name.
|
84 |
|
85 | The names in a flat namespace.
|
86 | """
|
87 |
|
88 | def __init__(self, module, app_types=None):
|
89 | # types that fields are declared with: int, id, word_part, etc.
|
90 | # Fields are not declared with constructor names.
|
91 | self.declared_types = {}
|
92 |
|
93 | for d in module.dfns:
|
94 | self.declared_types[d.name] = d.value
|
95 |
|
96 | if app_types is not None:
|
97 | self.declared_types.update(app_types)
|
98 |
|
99 | # Primitive types.
|
100 | self.declared_types['string'] = StrType()
|
101 | self.declared_types['int'] = IntType()
|
102 | self.declared_types['bool'] = BoolType()
|
103 |
|
104 | # Types with fields that need to be reflected on: Product and Constructor.
|
105 | self.compound_types = {}
|
106 | for d in module.dfns:
|
107 | typ = d.value
|
108 | if isinstance(typ, Product):
|
109 | self.compound_types[d.name] = typ
|
110 | elif isinstance(typ, Sum):
|
111 | # e.g. 'assign_op' is simple, or 'bracket_op' is not simple.
|
112 | self.compound_types[d.name] = typ
|
113 |
|
114 | for cons in typ.types:
|
115 | self.compound_types[cons.name] = cons
|
116 |
|
117 | def ByFieldInstance(self, field):
|
118 | """
|
119 | Args:
|
120 | field: Field() instance
|
121 | """
|
122 | t = self.declared_types[field.type]
|
123 | if field.seq:
|
124 | return ArrayType(t)
|
125 |
|
126 | if field.opt:
|
127 | return MaybeType(t)
|
128 |
|
129 | return t
|
130 |
|
131 | def ByTypeName(self, type_name):
|
132 | """
|
133 | Args:
|
134 | type_name: string, e.g. 'word_part' or 'LiteralPart'
|
135 | """
|
136 | if not type_name in self.compound_types:
|
137 | print('FATAL: %s' % self.compound_types.keys())
|
138 | return self.compound_types[type_name]
|
139 |
|
140 | def __repr__(self):
|
141 | return repr(self.declared_types)
|
142 |
|
143 |
|
144 | def _CheckFieldsAndWire(typ, type_lookup):
|
145 | for f in typ.fields:
|
146 | # Will fail if it doesn't exist
|
147 | _ = type_lookup.ByFieldInstance(f)
|
148 | typ.type_lookup = type_lookup # wire it for lookup
|
149 |
|
150 |
|
151 | def ResolveTypes(module, app_types=None):
|
152 | # Walk the module, checking types, and adding the type_lookup attribute
|
153 | # everywhere.
|
154 | type_lookup = TypeLookup(module, app_types=app_types)
|
155 |
|
156 | for node in module.dfns:
|
157 | assert isinstance(node, Type), node
|
158 | v = node.value
|
159 | if isinstance(v, Product):
|
160 | _CheckFieldsAndWire(v, type_lookup)
|
161 |
|
162 | elif isinstance(v, Sum):
|
163 | for cons in v.types:
|
164 | _CheckFieldsAndWire(cons, type_lookup)
|
165 |
|
166 | else:
|
167 | raise AssertionError(v)
|
168 |
|
169 | return type_lookup
|
170 |
|
171 |
|
172 | # The following classes define nodes into which the ASDL description is parsed.
|
173 | # Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
|
174 | # structure used by a programming language. But ASDL files themselves need to be
|
175 | # parsed. This module parses ASDL files and uses a simple AST to represent them.
|
176 | # See the EBNF at the top of the file to understand the logical connection
|
177 | # between the various node types.
|
178 |
|
179 | builtin_types = {'string', 'int', 'bool'}
|
180 |
|
181 | class AST(object):
|
182 | def Print(self, f, indent):
|
183 | raise NotImplementedError
|
184 |
|
185 | def __repr__(self):
|
186 | f = cStringIO.StringIO()
|
187 | self.Print(f, 0)
|
188 | return f.getvalue()
|
189 |
|
190 |
|
191 | class Module(AST):
|
192 | def __init__(self, name, dfns):
|
193 | self.name = name
|
194 | self.dfns = dfns
|
195 |
|
196 | def Print(self, f, indent):
|
197 | ind = indent * ' '
|
198 | f.write('%sModule %s {\n' % (ind, self.name))
|
199 |
|
200 | for d in self.dfns:
|
201 | d.Print(f, indent+1)
|
202 | f.write('\n')
|
203 | f.write('%s}\n' % ind)
|
204 |
|
205 |
|
206 | class Type(AST):
|
207 | def __init__(self, name, value):
|
208 | self.name = name
|
209 | self.value = value
|
210 |
|
211 | def Print(self, f, indent):
|
212 | ind = indent * ' '
|
213 | f.write('%sType %s {\n' % (ind, self.name))
|
214 | self.value.Print(f, indent+1)
|
215 | f.write('%s}\n' % ind)
|
216 |
|
217 |
|
218 |
|
219 | class _CompoundType(AST):
|
220 | """Either a Product or Constructor.
|
221 |
|
222 | encode.py and format.py need a reflection API.
|
223 | """
|
224 |
|
225 | def __init__(self, fields):
|
226 | self.fields = fields or []
|
227 |
|
228 | # Add fake spids field.
|
229 | # TODO: Only do this if 'attributes' are set.
|
230 | if self.fields:
|
231 | self.fields.append(Field('int', 'spids', seq=True))
|
232 |
|
233 | self.field_lookup = {f.name: f for f in self.fields}
|
234 | self.type_lookup = None # set by ResolveTypes()
|
235 |
|
236 | self.type_cache = {}
|
237 |
|
238 | def GetFieldNames(self):
|
239 | for f in self.fields:
|
240 | yield f.name
|
241 |
|
242 | def GetFields(self):
|
243 | for f in self.fields:
|
244 | field_name = f.name
|
245 | yield field_name, self.LookupFieldType(field_name)
|
246 |
|
247 | def LookupFieldType(self, field_name):
|
248 | # Cache and return it. We don't want to create new instances every
|
249 | # time we iterate over the fields.
|
250 | try:
|
251 | return self.type_cache[field_name]
|
252 | except KeyError:
|
253 | field = self.field_lookup[field_name]
|
254 | desc = self.type_lookup.ByFieldInstance(field)
|
255 | self.type_cache[field_name] = desc
|
256 | return desc
|
257 |
|
258 |
|
259 | class Constructor(_CompoundType):
|
260 | def __init__(self, name, fields=None):
|
261 | _CompoundType.__init__(self, fields)
|
262 | self.name = name
|
263 |
|
264 | def Print(self, f, indent):
|
265 | ind = indent * ' '
|
266 | f.write('%sConstructor %s' % (ind, self.name))
|
267 |
|
268 | if self.fields:
|
269 | f.write(' {\n')
|
270 | for field in self.fields:
|
271 | field.Print(f, indent+1)
|
272 | f.write('%s}' % ind)
|
273 |
|
274 | f.write('\n')
|
275 |
|
276 |
|
277 | class Field(AST):
|
278 | def __init__(self, type, name=None, seq=False, opt=False):
|
279 | self.type = type
|
280 | self.name = name
|
281 | self.seq = seq
|
282 | self.opt = opt
|
283 |
|
284 | def Print(self, f, indent):
|
285 | extra = []
|
286 | if self.seq:
|
287 | extra.append('seq=True')
|
288 | elif self.opt:
|
289 | extra.append('opt=True')
|
290 | else:
|
291 | extra = ""
|
292 |
|
293 | ind = indent * ' '
|
294 | f.write('%sField %s %s' % (ind, self.name, self.type))
|
295 | if extra:
|
296 | f.write(' (')
|
297 | f.write(', '.join(extra))
|
298 | f.write(')')
|
299 | f.write('\n')
|
300 |
|
301 |
|
302 | class Sum(AST):
|
303 | def __init__(self, types, attributes=None):
|
304 | self.types = types
|
305 | self.attributes = attributes or []
|
306 |
|
307 | def Print(self, f, indent):
|
308 | ind = indent * ' '
|
309 | f.write('%sSum {\n' % ind)
|
310 | for t in self.types:
|
311 | t.Print(f, indent+1)
|
312 | if self.attributes:
|
313 | f.write('%s\n' % self.attributes)
|
314 | f.write('%s}\n' % ind)
|
315 |
|
316 |
|
317 | class Product(_CompoundType):
|
318 | def __init__(self, fields, attributes=None):
|
319 | _CompoundType.__init__(self, fields)
|
320 | self.attributes = attributes or []
|
321 |
|
322 | def Print(self, f, indent):
|
323 | ind = indent * ' '
|
324 | f.write('%sProduct {\n' % ind)
|
325 | for field in self.fields:
|
326 | field.Print(f, indent+1)
|
327 | if self.attributes:
|
328 | f.write('%s\n' % self.attributes)
|
329 | f.write('%s}\n' % ind)
|
330 |
|
331 | # A generic visitor for the meta-AST that describes ASDL. This can be used by
|
332 | # emitters. Note that this visitor does not provide a generic visit method, so a
|
333 | # subclass needs to define visit methods from visitModule to as deep as the
|
334 | # interesting node.
|
335 | # We also define a Check visitor that makes sure the parsed ASDL is well-formed.
|
336 |
|
337 | class VisitorBase(object):
|
338 | """Generic tree visitor for ASTs."""
|
339 | def __init__(self):
|
340 | self.cache = {}
|
341 |
|
342 | def visit(self, obj, *args):
|
343 | klass = obj.__class__
|
344 | meth = self.cache.get(klass)
|
345 | if meth is None:
|
346 | methname = "visit" + klass.__name__
|
347 | meth = getattr(self, methname, None)
|
348 | self.cache[klass] = meth
|
349 | if meth:
|
350 | try:
|
351 | meth(obj, *args)
|
352 | except Exception as e:
|
353 | print("Error visiting %r: %s" % (obj, e))
|
354 | raise
|
355 |
|
356 | class Check(VisitorBase):
|
357 | """A visitor that checks a parsed ASDL tree for correctness.
|
358 |
|
359 | Errors are printed and accumulated.
|
360 | """
|
361 | def __init__(self):
|
362 | super(Check, self).__init__()
|
363 | self.cons = {}
|
364 | self.errors = 0
|
365 | self.types = {} # list of declared field types
|
366 |
|
367 | def visitModule(self, mod):
|
368 | for dfn in mod.dfns:
|
369 | self.visit(dfn)
|
370 |
|
371 | def visitType(self, type):
|
372 | self.visit(type.value, str(type.name))
|
373 |
|
374 | def visitSum(self, sum, name):
|
375 | for t in sum.types:
|
376 | # Simple sum types can't conflict
|
377 | if is_simple(sum):
|
378 | continue
|
379 | self.visit(t, name)
|
380 |
|
381 | def visitConstructor(self, cons, name):
|
382 | key = str(cons.name)
|
383 | conflict = self.cons.get(key)
|
384 | if conflict is None:
|
385 | self.cons[key] = name
|
386 | else:
|
387 | # Problem: This assumes a flat namespace, which we don't want!
|
388 | # This check is more evidence that a flat namespace isn't
|
389 | # desirable.
|
390 | print('Redefinition of constructor {}'.format(key))
|
391 | print('Defined in {} and {}'.format(conflict, name))
|
392 | self.errors += 1
|
393 | for f in cons.fields:
|
394 | self.visit(f, key)
|
395 |
|
396 | def visitField(self, field, name):
|
397 | key = str(field.type)
|
398 | l = self.types.setdefault(key, [])
|
399 | l.append(name)
|
400 |
|
401 | def visitProduct(self, prod, name):
|
402 | for f in prod.fields:
|
403 | self.visit(f, name)
|
404 |
|
405 | def check(mod, app_types=None):
|
406 | """Check the parsed ASDL tree for correctness.
|
407 |
|
408 | Return True if success. For failure, the errors are printed out and False
|
409 | is returned.
|
410 | """
|
411 | app_types = app_types or {}
|
412 | v = Check()
|
413 | v.visit(mod)
|
414 | return not v.errors
|
415 |
|
416 | # The ASDL parser itself comes next. The only interesting external interface
|
417 | # here is the top-level parse function.
|
418 |
|
419 | def parse(f):
|
420 | """Parse ASDL from the given file and return a Module node describing it."""
|
421 | parser = ASDLParser()
|
422 | return parser.parse(f)
|
423 |
|
424 | # Types for describing tokens in an ASDL specification.
|
425 | class TokenKind:
|
426 | """TokenKind is provides a scope for enumerated token kinds."""
|
427 | (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk,
|
428 | LParen, RParen, LBrace, RBrace) = range(11)
|
429 |
|
430 | operator_table = {
|
431 | '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen,
|
432 | ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace}
|
433 |
|
434 | class Token:
|
435 | def __init__(self, kind, value, lineno):
|
436 | self.kind = kind
|
437 | self.value = value
|
438 | self.lineno = lineno
|
439 |
|
440 | class ASDLSyntaxError(Exception):
|
441 | def __init__(self, msg, lineno=None):
|
442 | self.msg = msg
|
443 | self.lineno = lineno or '<unknown>'
|
444 |
|
445 | def __str__(self):
|
446 | return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
|
447 |
|
448 | def tokenize_asdl(f):
|
449 | """Tokenize the given buffer. Yield Token objects."""
|
450 | for lineno, line in enumerate(f, 1):
|
451 | for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()):
|
452 | c = m.group(1)
|
453 | if c[0].isalpha():
|
454 | # Some kind of identifier
|
455 | if c[0].isupper():
|
456 | yield Token(TokenKind.ConstructorId, c, lineno)
|
457 | else:
|
458 | yield Token(TokenKind.TypeId, c, lineno)
|
459 | elif c[:2] == '--':
|
460 | # Comment
|
461 | break
|
462 | else:
|
463 | # Operators
|
464 | try:
|
465 | op_kind = TokenKind.operator_table[c]
|
466 | except KeyError:
|
467 | raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
|
468 | yield Token(op_kind, c, lineno)
|
469 |
|
470 | class ASDLParser:
|
471 | """Parser for ASDL files.
|
472 |
|
473 | Create, then call the parse method on a buffer containing ASDL.
|
474 | This is a simple recursive descent parser that uses tokenize_asdl for the
|
475 | lexing.
|
476 | """
|
477 | def __init__(self):
|
478 | self._tokenizer = None
|
479 | self.cur_token = None
|
480 |
|
481 | def parse(self, f):
|
482 | """Parse the ASDL in the file and return an AST with a Module root.
|
483 | """
|
484 | self._tokenizer = tokenize_asdl(f)
|
485 | self._advance()
|
486 | return self._parse_module()
|
487 |
|
488 | def _parse_module(self):
|
489 | if self._at_keyword('module'):
|
490 | self._advance()
|
491 | else:
|
492 | raise ASDLSyntaxError(
|
493 | 'Expected "module" (found {})'.format(self.cur_token.value),
|
494 | self.cur_token.lineno)
|
495 | name = self._match(self._id_kinds)
|
496 | self._match(TokenKind.LBrace)
|
497 | defs = self._parse_definitions()
|
498 | self._match(TokenKind.RBrace)
|
499 | return Module(name, defs)
|
500 |
|
501 | def _parse_definitions(self):
|
502 | defs = []
|
503 | while self.cur_token.kind == TokenKind.TypeId:
|
504 | typename = self._advance()
|
505 | self._match(TokenKind.Equals)
|
506 | type = self._parse_type()
|
507 | defs.append(Type(typename, type))
|
508 | return defs
|
509 |
|
510 | def _parse_type(self):
|
511 | if self.cur_token.kind == TokenKind.LParen:
|
512 | # If we see a (, it's a product
|
513 | return self._parse_product()
|
514 | else:
|
515 | # Otherwise it's a sum. Look for ConstructorId
|
516 | sumlist = [Constructor(self._match(TokenKind.ConstructorId),
|
517 | self._parse_optional_fields())]
|
518 | while self.cur_token.kind == TokenKind.Pipe:
|
519 | # More constructors
|
520 | self._advance()
|
521 | sumlist.append(Constructor(
|
522 | self._match(TokenKind.ConstructorId),
|
523 | self._parse_optional_fields()))
|
524 | return Sum(sumlist, self._parse_optional_attributes())
|
525 |
|
526 | def _parse_product(self):
|
527 | return Product(self._parse_fields(), self._parse_optional_attributes())
|
528 |
|
529 | def _parse_fields(self):
|
530 | fields = []
|
531 | self._match(TokenKind.LParen)
|
532 | while self.cur_token.kind == TokenKind.TypeId:
|
533 | typename = self._advance()
|
534 | is_seq, is_opt = self._parse_optional_field_quantifier()
|
535 | id = (self._advance() if self.cur_token.kind in self._id_kinds
|
536 | else None)
|
537 | fields.append(Field(typename, id, seq=is_seq, opt=is_opt))
|
538 | if self.cur_token.kind == TokenKind.RParen:
|
539 | break
|
540 | elif self.cur_token.kind == TokenKind.Comma:
|
541 | self._advance()
|
542 | self._match(TokenKind.RParen)
|
543 | return fields
|
544 |
|
545 | def _parse_optional_fields(self):
|
546 | if self.cur_token.kind == TokenKind.LParen:
|
547 | return self._parse_fields()
|
548 | else:
|
549 | return None
|
550 |
|
551 | def _parse_optional_attributes(self):
|
552 | if self._at_keyword('attributes'):
|
553 | self._advance()
|
554 | return self._parse_fields()
|
555 | else:
|
556 | return None
|
557 |
|
558 | def _parse_optional_field_quantifier(self):
|
559 | is_seq, is_opt = False, False
|
560 | if self.cur_token.kind == TokenKind.Asterisk:
|
561 | is_seq = True
|
562 | self._advance()
|
563 | elif self.cur_token.kind == TokenKind.Question:
|
564 | is_opt = True
|
565 | self._advance()
|
566 | return is_seq, is_opt
|
567 |
|
568 | def _advance(self):
|
569 | """ Return the value of the current token and read the next one into
|
570 | self.cur_token.
|
571 | """
|
572 | cur_val = None if self.cur_token is None else self.cur_token.value
|
573 | try:
|
574 | self.cur_token = next(self._tokenizer)
|
575 | except StopIteration:
|
576 | self.cur_token = None
|
577 | return cur_val
|
578 |
|
579 | _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId)
|
580 |
|
581 | def _match(self, kind):
|
582 | """The 'match' primitive of RD parsers.
|
583 |
|
584 | * Verifies that the current token is of the given kind (kind can
|
585 | be a tuple, in which the kind must match one of its members).
|
586 | * Returns the value of the current token
|
587 | * Reads in the next token
|
588 | """
|
589 | if (isinstance(kind, tuple) and self.cur_token.kind in kind or
|
590 | self.cur_token.kind == kind
|
591 | ):
|
592 | value = self.cur_token.value
|
593 | self._advance()
|
594 | return value
|
595 | else:
|
596 | raise ASDLSyntaxError(
|
597 | 'Unmatched {} (found {})'.format(kind, self.cur_token.kind),
|
598 | self.cur_token.lineno)
|
599 |
|
600 | def _at_keyword(self, keyword):
|
601 | return (self.cur_token.kind == TokenKind.TypeId and
|
602 | self.cur_token.value == keyword)
|
603 |
|
604 |
|
605 | def LoadSchema(f, app_types):
|
606 | """Parse an ASDL schema. Used for code gen and metaprogramming."""
|
607 | asdl_module = parse(f)
|
608 |
|
609 | if not check(asdl_module, app_types):
|
610 | raise AssertionError('ASDL file is invalid')
|
611 |
|
612 | type_lookup = ResolveTypes(asdl_module, app_types)
|
613 | return asdl_module, type_lookup
|