# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""

from mycpp.mylib import log
_ = log

from typing import TYPE_CHECKING, Optional, List
from pgen2.pnode import PNode, PNodeAllocator

if TYPE_CHECKING:
    from _devbuild.gen.syntax_asdl import Token
    from pgen2.grammar import Grammar, dfa_t

# Duplicated to avoid dependency
# from pgen2 import token

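# Token type numbers are < NT_OFFSET; symbol (nonterminal) numbers in the
# grammar tables are >= NT_OFFSET.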
NT_OFFSET = 256


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(self, msg, type_, tok):
        # type: (str, int, Token) -> None
        self.msg = msg
        self.type = type_
        self.tok = tok

    def __repr__(self):
        # type: () -> str
        return "%s: type=%d, tok=%r" % (self.msg, self.type, self.tok)


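# Replaces the (dfa, state, node) tuples that the original pgen2 parser kept
# on its stack, presumably because a class with fixed fields translates more
# cleanly with mycpp.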
class _StackItem(object):
    def __init__(self, dfa, state, node):
        # type: (dfa_t, int, PNode) -> None
        self.dfa = dfa
        self.state = state
        self.node = node


class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar)              # create instance
    p.setup(start, pnode_alloc)      # prepare for parsing
    <for each input token>:
        if p.addtoken(...):          # parse a token; may raise ParseError
            break
    root = p.rootnode                # root of the parse tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    parse tree can then be retrieved from the rootnode instance
    variable. When a syntax error occurs, addtoken() raises the
    ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).
    """

    def __init__(self, grammar):
        # type: (Grammar) -> None
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        Parse tree nodes are PNode instances (see pgen2/pnode.py): each
        one carries a type (a token or symbol number), the Token it was
        created from (None for the start node made by setup()), and the
        children attached as parsing proceeds.
        """
        self.grammar = grammar
        self.rootnode = None  # type: Optional[PNode]
        self.stack = []  # type: List[_StackItem]
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def setup(self, start, pnode_alloc):
        # type: (int, PNodeAllocator) -> None
        """Prepare for parsing.

        This *must* be called before starting to parse.

        start is the grammar symbol to parse toward; pnode_alloc
        allocates the PNode instances for this parse.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the start symbol.
        """
        self.pnode_alloc = pnode_alloc
        newnode = self.pnode_alloc.NewPNode(start, None)
        # Each stack entry is a _StackItem: (dfa, state, node).
        self.stack = [_StackItem(self.grammar.dfas[start], 0, newnode)]
        self.rootnode = None

    def addtoken(self, typ, opaque, ilabel):
        # type: (int, Token, int) -> bool
        """Add a token; return True iff this is the end of the program."""
        # Loop until the token is shifted; may raise exceptions

        # Andy NOTE: This is not linear time, i.e. a constant amount of work
        # for each token?  Is it O(n^2) as the ANTLR paper says?
        # Do the "accelerators" in pgen.c have anything to do with it?

        while True:
            top = self.stack[-1]
            states, _ = top.dfa
            state = top.state

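            # Each arc in the current state is an (ilabel, next_state) pair,
            # where ilabel indexes self.grammar.labels.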
            arcs = states[state]

            # Look for an arc labeled with this token
            found = False
            for ilab, newstate in arcs:
                t = self.grammar.labels[ilab]
                if ilabel == ilab:
                    # Look it up in the list of labels
                    assert t < NT_OFFSET, t
                    # Shift a token; we're done with it
                    self.shift(typ, opaque, newstate)
                    # Pop while we are in an accept-only state
                    state = newstate

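                    # An accept-only state has exactly one arc, (0, state):
                    # a 0-labeled arc that loops back to itself.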
                    while True:
                        # mycpp: rewrite of tuple-in-list comparison
                        if len(states[state]) != 1:
                            break

                        s0, s1 = states[state][0]
                        if s0 != 0 or s1 != state:
                            break

                        self.pop()
                        if len(self.stack) == 0:
                            # Done parsing!
                            return True
                        top = self.stack[-1]
                        states, _ = top.dfa
                        state = top.state

                    # Done with this token
                    return False
                elif t >= NT_OFFSET:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    _, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, opaque, self.grammar.dfas[t], newstate)
                        found = True
                        break  # To continue the outer while loop

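            # No arc matched the token directly and no nonterminal could start
            # with it.  If the current state accepts (it has a (0, state) arc
            # to itself), the symbol on top of the stack is complete: pop it
            # and retry the token one level up.  Otherwise it's a syntax error.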
            if not found:
                # Note: this condition was rewritten for mycpp translation.
                # if (0, state) in arcs:
                #   ...
                # else:
                #   ...
                found2 = False
                for left, right in arcs:
                    if left == 0 and right == state:
                        # An accepting state, pop it and try something else
                        self.pop()
                        if len(self.stack) == 0:
                            # Done parsing, but another token is input
                            raise ParseError("too much input", typ, opaque)
                        found2 = True

                if not found2:
                    # No success finding a transition
                    raise ParseError("bad input", typ, opaque)

    def shift(self, typ, opaque, newstate):
        # type: (int, Token, int) -> None
        """Shift a token. (Internal)"""
        top = self.stack[-1]
        newnode = self.pnode_alloc.NewPNode(typ, opaque)
        top.node.AddChild(newnode)
        self.stack[-1].state = newstate

    def push(self, typ, opaque, newdfa, newstate):
        # type: (int, Token, dfa_t, int) -> None
        """Push a nonterminal. (Internal)"""
        top = self.stack[-1]
        newnode = self.pnode_alloc.NewPNode(typ, opaque)
        self.stack[-1].state = newstate
        self.stack.append(_StackItem(newdfa, 0, newnode))

    def pop(self):
        # type: () -> None
        """Pop a nonterminal. (Internal)"""
        top = self.stack.pop()
        newnode = top.node
        if len(self.stack):
            top2 = self.stack[-1]
            top2.node.AddChild(newnode)
        else:
            self.rootnode = newnode
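
# A rough sketch of the driving loop, assuming a Grammar has already been
# loaded and each token has been mapped to an ilabel (driver.py does this for
# real; the names below are illustrative only):
#
#   p = Parser(gr)
#   p.setup(start_symbol, PNodeAllocator())
#   for typ, tok, ilabel in labeled_tokens:
#       if p.addtoken(typ, tok, ilabel):
#           break          # the start symbol was completed
#   root = p.rootnode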
|