OILS / opy / _regtest / src / osh / bool_parse.py View on Github | oilshell.org

272 lines, 150 significant
1#!/usr/bin/env python
2# Copyright 2016 Andy Chu. All rights reserved.
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8"""
9bool_parse.py - Parse boolean expressions.
10
11In contrast to test / [, the parsing of [[ expressions is done BEFORE
12evaluation. So we are parsing a list of Word instances to an AST, rather than
13a list of strings.
14
15TODO: If we implement "test / [", we should share the parsing.
16
17Grammar from http://compilers.iecc.com/crenshaw/tutor6.txt, adapted to ANTLR
18syntax.
19
20 Expr : Term (OR Term)*
21 Term : Negated (AND Negated)*
22 Negated : '!'? Factor
23 Factor : WORD
24 | UNARY_OP WORD
25 | WORD BINARY_OP WORD
26 | '(' Expr ')'
27
28OR = || -o
29AND = && -a
30WORD = any word
31UNARY_OP: -z -n, etc.
32BINARY_OP: -gt, -ot, ==, etc.
33"""
34
35from core import word
36from core import util
37from osh.meta import ast, Id, Kind, LookupKind, types
38
39try:
40 import libc # for regex_parse
41except ImportError:
42 from benchmarks import fake_libc as libc
43
44lex_mode_e = types.lex_mode_e
45log = util.log
46
47
48class BoolParser(object):
49 """Parses [[ at compile time and [ at runtime."""
50
51 def __init__(self, w_parser):
52 """
53 Args:
54 w_parser: WordParser
55 """
56 self.w_parser = w_parser
57 # Either one word or two words for lookahead
58 self.words = []
59
60 self.cur_word = None
61 self.op_id = Id.Undefined_Tok
62 self.b_kind = Kind.Undefined
63
64 self.error_stack = []
65
66 def Error(self):
67 return self.error_stack
68
69 def AddErrorContext(self, msg, *args, **kwargs):
70 err = util.ParseError(msg, *args, **kwargs)
71 self.error_stack.append(err)
72
73 def _NextOne(self, lex_mode=lex_mode_e.DBRACKET):
74 #print('_Next', self.cur_word)
75 n = len(self.words)
76 if n == 2:
77 assert lex_mode == lex_mode_e.DBRACKET
78 self.words[0] = self.words[1]
79 self.cur_word = self.words[0]
80 del self.words[1]
81 elif n in (0, 1):
82 w = self.w_parser.ReadWord(lex_mode)
83 if not w:
84 err = self.w_parser.Error()
85 self.error_stack.extend(err)
86 return False
87 if n == 0:
88 self.words.append(w)
89 else:
90 self.words[0] = w
91 self.cur_word = w
92
93 self.op_id = word.BoolId(self.cur_word)
94 self.b_kind = LookupKind(self.op_id)
95 #log('--- word %s', self.cur_word)
96 #log('op_id %s %s %s', self.op_id, self.b_kind, lex_mode)
97 return True
98
99 def _Next(self, lex_mode=lex_mode_e.DBRACKET):
100 """Advance to the next token, skipping newlines.
101
102 We don't handle newlines in the lexer because we want the newline after ]]
103 to be Id.Op_Newline rather than Id.WS_Newline. It's more complicated if
104 it's Id.WS_Newline -- we might have to unread tokens, etc.
105 """
106 while True:
107 w = self._NextOne(lex_mode=lex_mode)
108 if not w:
109 return False
110 if self.op_id != Id.Op_Newline:
111 break
112 return True
113
114 def _LookAhead(self):
115 n = len(self.words)
116 if n != 1:
117 raise AssertionError(self.words)
118
119 w = self.w_parser.ReadWord(lex_mode_e.DBRACKET)
120 self.words.append(w) # Save it for _Next()
121 return w
122
123 def Parse(self):
124 if not self._Next(): return None
125
126 node = self.ParseExpr()
127 if self.op_id != Id.Lit_DRightBracket:
128 self.AddErrorContext("Unexpected extra word %r", self.cur_word,
129 word=self.cur_word)
130 return None
131 return node
132
133 def _TestAtEnd(self):
134 """For unit tests only."""
135 return self.op_id == Id.Lit_DRightBracket
136
137 def ParseForBuiltin(self):
138 """For test builtin."""
139 if not self._Next(): return None
140
141 node = self.ParseExpr()
142 if self.op_id != Id.Eof_Real:
143 self.AddErrorContext(
144 'Unexpected trailing word in test expression: %s',
145 self.cur_word)
146 return None
147
148 return node
149
150 def ParseExpr(self):
151 """
152 Iterative:
153 Expr : Term (OR Term)*
154
155 Right recursion:
156 Expr : Term (OR Expr)?
157 """
158 left = self.ParseTerm()
159 # [[ uses || while [ uses -o
160 if self.op_id in (Id.Op_DPipe, Id.BoolUnary_o):
161 if not self._Next(): return None
162 right = self.ParseExpr()
163 if not right: return None
164 return ast.LogicalOr(left, right)
165 else:
166 return left
167
168 def ParseTerm(self):
169 """
170 Term : Negated (AND Negated)*
171
172 Right recursion:
173 Term : Negated (AND Term)?
174 """
175 left = self.ParseNegatedFactor()
176 if not left:
177 return None # TODO: An exception should handle this case.
178 # [[ uses && while [ uses -a
179 if self.op_id in (Id.Op_DAmp, Id.BoolUnary_a):
180 if not self._Next(): return None
181 right = self.ParseTerm()
182 if not right: return None
183 return ast.LogicalAnd(left, right)
184 else:
185 return left
186
187 def ParseNegatedFactor(self):
188 """
189 Negated : '!'? Factor
190 """
191 if self.op_id == Id.KW_Bang:
192 if not self._Next(): return None
193 child = self.ParseFactor()
194 if not child: return None
195 return ast.LogicalNot(child)
196 else:
197 return self.ParseFactor()
198
199 def ParseFactor(self):
200 """
201 Factor : WORD
202 | UNARY_OP WORD
203 | WORD BINARY_OP WORD
204 | '(' Expr ')'
205 """
206 if self.b_kind == Kind.BoolUnary:
207 # Just save the type and not the token itself?
208 op = self.op_id
209 if not self._Next(): return None
210 w = self.cur_word
211 if not self._Next(): return None
212 node = ast.BoolUnary(op, w)
213 return node
214
215 if self.b_kind == Kind.Word:
216 # Peek ahead another token.
217 t2 = self._LookAhead()
218 t2_op_id = word.BoolId(t2)
219 t2_b_kind = LookupKind(t2_op_id)
220
221 #log('t2 %s / t2_op_id %s / t2_b_kind %s', t2, t2_op_id, t2_b_kind)
222 # Redir pun for < and >, -a and -o pun
223 if t2_b_kind in (Kind.BoolBinary, Kind.Redir):
224 left = self.cur_word
225
226 if not self._Next(): return None
227 op = self.op_id
228
229 # TODO: Need to change to lex_mode_e.BASH_REGEX.
230 # _Next(lex_mode) then?
231 is_regex = t2_op_id == Id.BoolBinary_EqualTilde
232 if is_regex:
233 if not self._Next(lex_mode=lex_mode_e.BASH_REGEX): return None
234 else:
235 if not self._Next(): return None
236
237 right = self.cur_word
238 if is_regex:
239 # TODO: Quoted parts need to be regex-escaped, e.g. [[ $a =~ "{" ]].
240 # I don't think libc has a function to do this. Escape these
241 # characters:
242 # https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html0
243
244 ok, regex_str, unused_quoted = word.StaticEval(right)
245
246 # doesn't contain $foo, etc.
247 if ok and not libc.regex_parse(regex_str):
248 self.AddErrorContext("Invalid regex: %r" % regex_str, word=right)
249 return None
250
251 if not self._Next(): return None
252 return ast.BoolBinary(op, left, right)
253 else:
254 # [[ foo ]]
255 w = self.cur_word
256 if not self._Next(): return None
257 return ast.WordTest(w)
258
259 if self.op_id == Id.Op_LParen:
260 if not self._Next(): return None
261 node = self.ParseExpr()
262 if self.op_id != Id.Op_RParen:
263 self.AddErrorContext(
264 'Expected ), got %s', self.cur_word, word=self.cur_word)
265 return None
266 if not self._Next(): return None
267 return node
268
269 # TODO: A proper error, e.g. for [[ && ]] or [[ ]]
270 self.AddErrorContext(
271 'Unexpected token: %s' % self.cur_word, word=self.cur_word)
272 return None