OILS / opy / _regtest / src / asdl / asdl_.py View on Github | oilshell.org

613 lines, 392 significant
1#-------------------------------------------------------------------------------
2# Parser for ASDL [1] definition files. Reads in an ASDL description and parses
3# it into an AST that describes it.
4#
5# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support
6# modules and attributes after a product. Words starting with Capital letters
7# are terminals. Literal tokens are in "double quotes". Others are
8# non-terminals. Id is either TokenId or ConstructorId.
9#
10# module ::= "module" Id "{" [definitions] "}"
11# definitions ::= { TypeId "=" type }
12# type ::= product | sum
13# product ::= fields ["attributes" fields]
14# fields ::= "(" { field, "," } field ")"
15# field ::= TypeId ["?" | "*"] [Id]
16# sum ::= constructor { "|" constructor } ["attributes" fields]
17# constructor ::= ConstructorId [fields]
18#
19# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See
20# http://asdl.sourceforge.net/
21#-------------------------------------------------------------------------------
22from __future__ import print_function
23
24import cStringIO
25import re
26
27__all__ = [
28 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
29 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check',
30 'is_simple']
31
32
33# PATCH: Moved this function from asdl_c.py.
34def is_simple(sum):
35 """Return True if a sum is a simple.
36
37 A sum is simple if its types have no fields, e.g.
38 unaryop = Invert | Not | UAdd | USub
39 """
40 for t in sum.types:
41 if t.fields:
42 return False
43 return True
44
45
46class StrType(object):
47 def __repr__(self):
48 return '<Str>'
49
50class IntType(object):
51 def __repr__(self):
52 return '<Int>'
53
54class BoolType(object):
55 def __repr__(self):
56 return '<Bool>'
57
58
59class ArrayType(object):
60 def __init__(self, desc):
61 self.desc = desc
62
63 def __repr__(self):
64 return '<Array %s>' % self.desc
65
66class MaybeType(object):
67 def __init__(self, desc):
68 self.desc = desc # another descriptor
69
70 def __repr__(self):
71 return '<Maybe %s>' % self.desc
72
73class UserType(object):
74 def __init__(self, typ):
75 assert isinstance(typ, type), typ
76 self.typ = typ
77
78 def __repr__(self):
79 return '<UserType %s>' % self.typ
80
81
82class TypeLookup(object):
83 """Look up types by name.
84
85 The names in a flat namespace.
86 """
87
88 def __init__(self, module, app_types=None):
89 # types that fields are declared with: int, id, word_part, etc.
90 # Fields are not declared with constructor names.
91 self.declared_types = {}
92
93 for d in module.dfns:
94 self.declared_types[d.name] = d.value
95
96 if app_types is not None:
97 self.declared_types.update(app_types)
98
99 # Primitive types.
100 self.declared_types['string'] = StrType()
101 self.declared_types['int'] = IntType()
102 self.declared_types['bool'] = BoolType()
103
104 # Types with fields that need to be reflected on: Product and Constructor.
105 self.compound_types = {}
106 for d in module.dfns:
107 typ = d.value
108 if isinstance(typ, Product):
109 self.compound_types[d.name] = typ
110 elif isinstance(typ, Sum):
111 # e.g. 'assign_op' is simple, or 'bracket_op' is not simple.
112 self.compound_types[d.name] = typ
113
114 for cons in typ.types:
115 self.compound_types[cons.name] = cons
116
117 def ByFieldInstance(self, field):
118 """
119 Args:
120 field: Field() instance
121 """
122 t = self.declared_types[field.type]
123 if field.seq:
124 return ArrayType(t)
125
126 if field.opt:
127 return MaybeType(t)
128
129 return t
130
131 def ByTypeName(self, type_name):
132 """
133 Args:
134 type_name: string, e.g. 'word_part' or 'LiteralPart'
135 """
136 if not type_name in self.compound_types:
137 print('FATAL: %s' % self.compound_types.keys())
138 return self.compound_types[type_name]
139
140 def __repr__(self):
141 return repr(self.declared_types)
142
143
144def _CheckFieldsAndWire(typ, type_lookup):
145 for f in typ.fields:
146 # Will fail if it doesn't exist
147 _ = type_lookup.ByFieldInstance(f)
148 typ.type_lookup = type_lookup # wire it for lookup
149
150
151def ResolveTypes(module, app_types=None):
152 # Walk the module, checking types, and adding the type_lookup attribute
153 # everywhere.
154 type_lookup = TypeLookup(module, app_types=app_types)
155
156 for node in module.dfns:
157 assert isinstance(node, Type), node
158 v = node.value
159 if isinstance(v, Product):
160 _CheckFieldsAndWire(v, type_lookup)
161
162 elif isinstance(v, Sum):
163 for cons in v.types:
164 _CheckFieldsAndWire(cons, type_lookup)
165
166 else:
167 raise AssertionError(v)
168
169 return type_lookup
170
171
172# The following classes define nodes into which the ASDL description is parsed.
173# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
174# structure used by a programming language. But ASDL files themselves need to be
175# parsed. This module parses ASDL files and uses a simple AST to represent them.
176# See the EBNF at the top of the file to understand the logical connection
177# between the various node types.
178
179builtin_types = {'string', 'int', 'bool'}
180
181class AST(object):
182 def Print(self, f, indent):
183 raise NotImplementedError
184
185 def __repr__(self):
186 f = cStringIO.StringIO()
187 self.Print(f, 0)
188 return f.getvalue()
189
190
191class Module(AST):
192 def __init__(self, name, dfns):
193 self.name = name
194 self.dfns = dfns
195
196 def Print(self, f, indent):
197 ind = indent * ' '
198 f.write('%sModule %s {\n' % (ind, self.name))
199
200 for d in self.dfns:
201 d.Print(f, indent+1)
202 f.write('\n')
203 f.write('%s}\n' % ind)
204
205
206class Type(AST):
207 def __init__(self, name, value):
208 self.name = name
209 self.value = value
210
211 def Print(self, f, indent):
212 ind = indent * ' '
213 f.write('%sType %s {\n' % (ind, self.name))
214 self.value.Print(f, indent+1)
215 f.write('%s}\n' % ind)
216
217
218
219class _CompoundType(AST):
220 """Either a Product or Constructor.
221
222 encode.py and format.py need a reflection API.
223 """
224
225 def __init__(self, fields):
226 self.fields = fields or []
227
228 # Add fake spids field.
229 # TODO: Only do this if 'attributes' are set.
230 if self.fields:
231 self.fields.append(Field('int', 'spids', seq=True))
232
233 self.field_lookup = {f.name: f for f in self.fields}
234 self.type_lookup = None # set by ResolveTypes()
235
236 self.type_cache = {}
237
238 def GetFieldNames(self):
239 for f in self.fields:
240 yield f.name
241
242 def GetFields(self):
243 for f in self.fields:
244 field_name = f.name
245 yield field_name, self.LookupFieldType(field_name)
246
247 def LookupFieldType(self, field_name):
248 # Cache and return it. We don't want to create new instances every
249 # time we iterate over the fields.
250 try:
251 return self.type_cache[field_name]
252 except KeyError:
253 field = self.field_lookup[field_name]
254 desc = self.type_lookup.ByFieldInstance(field)
255 self.type_cache[field_name] = desc
256 return desc
257
258
259class Constructor(_CompoundType):
260 def __init__(self, name, fields=None):
261 _CompoundType.__init__(self, fields)
262 self.name = name
263
264 def Print(self, f, indent):
265 ind = indent * ' '
266 f.write('%sConstructor %s' % (ind, self.name))
267
268 if self.fields:
269 f.write(' {\n')
270 for field in self.fields:
271 field.Print(f, indent+1)
272 f.write('%s}' % ind)
273
274 f.write('\n')
275
276
277class Field(AST):
278 def __init__(self, type, name=None, seq=False, opt=False):
279 self.type = type
280 self.name = name
281 self.seq = seq
282 self.opt = opt
283
284 def Print(self, f, indent):
285 extra = []
286 if self.seq:
287 extra.append('seq=True')
288 elif self.opt:
289 extra.append('opt=True')
290 else:
291 extra = ""
292
293 ind = indent * ' '
294 f.write('%sField %s %s' % (ind, self.name, self.type))
295 if extra:
296 f.write(' (')
297 f.write(', '.join(extra))
298 f.write(')')
299 f.write('\n')
300
301
302class Sum(AST):
303 def __init__(self, types, attributes=None):
304 self.types = types
305 self.attributes = attributes or []
306
307 def Print(self, f, indent):
308 ind = indent * ' '
309 f.write('%sSum {\n' % ind)
310 for t in self.types:
311 t.Print(f, indent+1)
312 if self.attributes:
313 f.write('%s\n' % self.attributes)
314 f.write('%s}\n' % ind)
315
316
317class Product(_CompoundType):
318 def __init__(self, fields, attributes=None):
319 _CompoundType.__init__(self, fields)
320 self.attributes = attributes or []
321
322 def Print(self, f, indent):
323 ind = indent * ' '
324 f.write('%sProduct {\n' % ind)
325 for field in self.fields:
326 field.Print(f, indent+1)
327 if self.attributes:
328 f.write('%s\n' % self.attributes)
329 f.write('%s}\n' % ind)
330
331# A generic visitor for the meta-AST that describes ASDL. This can be used by
332# emitters. Note that this visitor does not provide a generic visit method, so a
333# subclass needs to define visit methods from visitModule to as deep as the
334# interesting node.
335# We also define a Check visitor that makes sure the parsed ASDL is well-formed.
336
337class VisitorBase(object):
338 """Generic tree visitor for ASTs."""
339 def __init__(self):
340 self.cache = {}
341
342 def visit(self, obj, *args):
343 klass = obj.__class__
344 meth = self.cache.get(klass)
345 if meth is None:
346 methname = "visit" + klass.__name__
347 meth = getattr(self, methname, None)
348 self.cache[klass] = meth
349 if meth:
350 try:
351 meth(obj, *args)
352 except Exception as e:
353 print("Error visiting %r: %s" % (obj, e))
354 raise
355
356class Check(VisitorBase):
357 """A visitor that checks a parsed ASDL tree for correctness.
358
359 Errors are printed and accumulated.
360 """
361 def __init__(self):
362 super(Check, self).__init__()
363 self.cons = {}
364 self.errors = 0
365 self.types = {} # list of declared field types
366
367 def visitModule(self, mod):
368 for dfn in mod.dfns:
369 self.visit(dfn)
370
371 def visitType(self, type):
372 self.visit(type.value, str(type.name))
373
374 def visitSum(self, sum, name):
375 for t in sum.types:
376 # Simple sum types can't conflict
377 if is_simple(sum):
378 continue
379 self.visit(t, name)
380
381 def visitConstructor(self, cons, name):
382 key = str(cons.name)
383 conflict = self.cons.get(key)
384 if conflict is None:
385 self.cons[key] = name
386 else:
387 # Problem: This assumes a flat namespace, which we don't want!
388 # This check is more evidence that a flat namespace isn't
389 # desirable.
390 print('Redefinition of constructor {}'.format(key))
391 print('Defined in {} and {}'.format(conflict, name))
392 self.errors += 1
393 for f in cons.fields:
394 self.visit(f, key)
395
396 def visitField(self, field, name):
397 key = str(field.type)
398 l = self.types.setdefault(key, [])
399 l.append(name)
400
401 def visitProduct(self, prod, name):
402 for f in prod.fields:
403 self.visit(f, name)
404
405def check(mod, app_types=None):
406 """Check the parsed ASDL tree for correctness.
407
408 Return True if success. For failure, the errors are printed out and False
409 is returned.
410 """
411 app_types = app_types or {}
412 v = Check()
413 v.visit(mod)
414 return not v.errors
415
416# The ASDL parser itself comes next. The only interesting external interface
417# here is the top-level parse function.
418
419def parse(f):
420 """Parse ASDL from the given file and return a Module node describing it."""
421 parser = ASDLParser()
422 return parser.parse(f)
423
424# Types for describing tokens in an ASDL specification.
425class TokenKind:
426 """TokenKind is provides a scope for enumerated token kinds."""
427 (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk,
428 LParen, RParen, LBrace, RBrace) = range(11)
429
430 operator_table = {
431 '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen,
432 ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace}
433
434class Token:
435 def __init__(self, kind, value, lineno):
436 self.kind = kind
437 self.value = value
438 self.lineno = lineno
439
440class ASDLSyntaxError(Exception):
441 def __init__(self, msg, lineno=None):
442 self.msg = msg
443 self.lineno = lineno or '<unknown>'
444
445 def __str__(self):
446 return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
447
448def tokenize_asdl(f):
449 """Tokenize the given buffer. Yield Token objects."""
450 for lineno, line in enumerate(f, 1):
451 for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()):
452 c = m.group(1)
453 if c[0].isalpha():
454 # Some kind of identifier
455 if c[0].isupper():
456 yield Token(TokenKind.ConstructorId, c, lineno)
457 else:
458 yield Token(TokenKind.TypeId, c, lineno)
459 elif c[:2] == '--':
460 # Comment
461 break
462 else:
463 # Operators
464 try:
465 op_kind = TokenKind.operator_table[c]
466 except KeyError:
467 raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
468 yield Token(op_kind, c, lineno)
469
470class ASDLParser:
471 """Parser for ASDL files.
472
473 Create, then call the parse method on a buffer containing ASDL.
474 This is a simple recursive descent parser that uses tokenize_asdl for the
475 lexing.
476 """
477 def __init__(self):
478 self._tokenizer = None
479 self.cur_token = None
480
481 def parse(self, f):
482 """Parse the ASDL in the file and return an AST with a Module root.
483 """
484 self._tokenizer = tokenize_asdl(f)
485 self._advance()
486 return self._parse_module()
487
488 def _parse_module(self):
489 if self._at_keyword('module'):
490 self._advance()
491 else:
492 raise ASDLSyntaxError(
493 'Expected "module" (found {})'.format(self.cur_token.value),
494 self.cur_token.lineno)
495 name = self._match(self._id_kinds)
496 self._match(TokenKind.LBrace)
497 defs = self._parse_definitions()
498 self._match(TokenKind.RBrace)
499 return Module(name, defs)
500
501 def _parse_definitions(self):
502 defs = []
503 while self.cur_token.kind == TokenKind.TypeId:
504 typename = self._advance()
505 self._match(TokenKind.Equals)
506 type = self._parse_type()
507 defs.append(Type(typename, type))
508 return defs
509
510 def _parse_type(self):
511 if self.cur_token.kind == TokenKind.LParen:
512 # If we see a (, it's a product
513 return self._parse_product()
514 else:
515 # Otherwise it's a sum. Look for ConstructorId
516 sumlist = [Constructor(self._match(TokenKind.ConstructorId),
517 self._parse_optional_fields())]
518 while self.cur_token.kind == TokenKind.Pipe:
519 # More constructors
520 self._advance()
521 sumlist.append(Constructor(
522 self._match(TokenKind.ConstructorId),
523 self._parse_optional_fields()))
524 return Sum(sumlist, self._parse_optional_attributes())
525
526 def _parse_product(self):
527 return Product(self._parse_fields(), self._parse_optional_attributes())
528
529 def _parse_fields(self):
530 fields = []
531 self._match(TokenKind.LParen)
532 while self.cur_token.kind == TokenKind.TypeId:
533 typename = self._advance()
534 is_seq, is_opt = self._parse_optional_field_quantifier()
535 id = (self._advance() if self.cur_token.kind in self._id_kinds
536 else None)
537 fields.append(Field(typename, id, seq=is_seq, opt=is_opt))
538 if self.cur_token.kind == TokenKind.RParen:
539 break
540 elif self.cur_token.kind == TokenKind.Comma:
541 self._advance()
542 self._match(TokenKind.RParen)
543 return fields
544
545 def _parse_optional_fields(self):
546 if self.cur_token.kind == TokenKind.LParen:
547 return self._parse_fields()
548 else:
549 return None
550
551 def _parse_optional_attributes(self):
552 if self._at_keyword('attributes'):
553 self._advance()
554 return self._parse_fields()
555 else:
556 return None
557
558 def _parse_optional_field_quantifier(self):
559 is_seq, is_opt = False, False
560 if self.cur_token.kind == TokenKind.Asterisk:
561 is_seq = True
562 self._advance()
563 elif self.cur_token.kind == TokenKind.Question:
564 is_opt = True
565 self._advance()
566 return is_seq, is_opt
567
568 def _advance(self):
569 """ Return the value of the current token and read the next one into
570 self.cur_token.
571 """
572 cur_val = None if self.cur_token is None else self.cur_token.value
573 try:
574 self.cur_token = next(self._tokenizer)
575 except StopIteration:
576 self.cur_token = None
577 return cur_val
578
579 _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId)
580
581 def _match(self, kind):
582 """The 'match' primitive of RD parsers.
583
584 * Verifies that the current token is of the given kind (kind can
585 be a tuple, in which the kind must match one of its members).
586 * Returns the value of the current token
587 * Reads in the next token
588 """
589 if (isinstance(kind, tuple) and self.cur_token.kind in kind or
590 self.cur_token.kind == kind
591 ):
592 value = self.cur_token.value
593 self._advance()
594 return value
595 else:
596 raise ASDLSyntaxError(
597 'Unmatched {} (found {})'.format(kind, self.cur_token.kind),
598 self.cur_token.lineno)
599
600 def _at_keyword(self, keyword):
601 return (self.cur_token.kind == TokenKind.TypeId and
602 self.cur_token.value == keyword)
603
604
605def LoadSchema(f, app_types):
606 """Parse an ASDL schema. Used for code gen and metaprogramming."""
607 asdl_module = parse(f)
608
609 if not check(asdl_module, app_types):
610 raise AssertionError('ASDL file is invalid')
611
612 type_lookup = ResolveTypes(asdl_module, app_types)
613 return asdl_module, type_lookup