opy/_regtest/src/osh/bool

OILS / opy / _regtest / src / osh / bool_parse.py View on Github | oilshell.org

272 lines, 150 significant

1	#!/usr/bin/env python
2	# Copyright 2016 Andy Chu. All rights reserved.
3	# Licensed under the Apache License, Version 2.0 (the "License");
4	# you may not use this file except in compliance with the License.
5	# You may obtain a copy of the License at
6	#
7	# http://www.apache.org/licenses/LICENSE-2.0
8	"""
9	bool_parse.py - Parse boolean expressions.
10
11	In contrast to test / [, the parsing of [[ expressions is done BEFORE
12	evaluation. So we are parsing a list of Word instances to an AST, rather than
13	a list of strings.
14
15	TODO: If we implement "test / [", we should share the parsing.
16
17	Grammar from http://compilers.iecc.com/crenshaw/tutor6.txt, adapted to ANTLR
18	syntax.
19
20	Expr : Term (OR Term)*
21	Term : Negated (AND Negated)*
22	Negated : '!'? Factor
23	Factor : WORD
24	\| UNARY_OP WORD
25	\| WORD BINARY_OP WORD
26	\| '(' Expr ')'
27
28	OR = \|\| -o
29	AND = && -a
30	WORD = any word
31	UNARY_OP: -z -n, etc.
32	BINARY_OP: -gt, -ot, ==, etc.
33	"""
34
35	from core import word
36	from core import util
37	from osh.meta import ast, Id, Kind, LookupKind, types
38
39	try:
40	import libc # for regex_parse
41	except ImportError:
42	from benchmarks import fake_libc as libc
43
44	lex_mode_e = types.lex_mode_e
45	log = util.log
46
47
48	class BoolParser(object):
49	"""Parses [[ at compile time and [ at runtime."""
50
51	def __init__(self, w_parser):
52	"""
53	Args:
54	w_parser: WordParser
55	"""
56	self.w_parser = w_parser
57	# Either one word or two words for lookahead
58	self.words = []
59
60	self.cur_word = None
61	self.op_id = Id.Undefined_Tok
62	self.b_kind = Kind.Undefined
63
64	self.error_stack = []
65
66	def Error(self):
67	return self.error_stack
68
69	def AddErrorContext(self, msg, args, *kwargs):
70	err = util.ParseError(msg, args, *kwargs)
71	self.error_stack.append(err)
72
73	def _NextOne(self, lex_mode=lex_mode_e.DBRACKET):
74	#print('_Next', self.cur_word)
75	n = len(self.words)
76	if n == 2:
77	assert lex_mode == lex_mode_e.DBRACKET
78	self.words[0] = self.words[1]
79	self.cur_word = self.words[0]
80	del self.words[1]
81	elif n in (0, 1):
82	w = self.w_parser.ReadWord(lex_mode)
83	if not w:
84	err = self.w_parser.Error()
85	self.error_stack.extend(err)
86	return False
87	if n == 0:
88	self.words.append(w)
89	else:
90	self.words[0] = w
91	self.cur_word = w
92
93	self.op_id = word.BoolId(self.cur_word)
94	self.b_kind = LookupKind(self.op_id)
95	#log('--- word %s', self.cur_word)
96	#log('op_id %s %s %s', self.op_id, self.b_kind, lex_mode)
97	return True
98
99	def _Next(self, lex_mode=lex_mode_e.DBRACKET):
100	"""Advance to the next token, skipping newlines.
101
102	We don't handle newlines in the lexer because we want the newline after ]]
103	to be Id.Op_Newline rather than Id.WS_Newline. It's more complicated if
104	it's Id.WS_Newline -- we might have to unread tokens, etc.
105	"""
106	while True:
107	w = self._NextOne(lex_mode=lex_mode)
108	if not w:
109	return False
110	if self.op_id != Id.Op_Newline:
111	break
112	return True
113
114	def _LookAhead(self):
115	n = len(self.words)
116	if n != 1:
117	raise AssertionError(self.words)
118
119	w = self.w_parser.ReadWord(lex_mode_e.DBRACKET)
120	self.words.append(w) # Save it for _Next()
121	return w
122
123	def Parse(self):
124	if not self._Next(): return None
125
126	node = self.ParseExpr()
127	if self.op_id != Id.Lit_DRightBracket:
128	self.AddErrorContext("Unexpected extra word %r", self.cur_word,
129	word=self.cur_word)
130	return None
131	return node
132
133	def _TestAtEnd(self):
134	"""For unit tests only."""
135	return self.op_id == Id.Lit_DRightBracket
136
137	def ParseForBuiltin(self):
138	"""For test builtin."""
139	if not self._Next(): return None
140
141	node = self.ParseExpr()
142	if self.op_id != Id.Eof_Real:
143	self.AddErrorContext(
144	'Unexpected trailing word in test expression: %s',
145	self.cur_word)
146	return None
147
148	return node
149
150	def ParseExpr(self):
151	"""
152	Iterative:
153	Expr : Term (OR Term)*
154
155	Right recursion:
156	Expr : Term (OR Expr)?
157	"""
158	left = self.ParseTerm()
159	# [[ uses \|\| while [ uses -o
160	if self.op_id in (Id.Op_DPipe, Id.BoolUnary_o):
161	if not self._Next(): return None
162	right = self.ParseExpr()
163	if not right: return None
164	return ast.LogicalOr(left, right)
165	else:
166	return left
167
168	def ParseTerm(self):
169	"""
170	Term : Negated (AND Negated)*
171
172	Right recursion:
173	Term : Negated (AND Term)?
174	"""
175	left = self.ParseNegatedFactor()
176	if not left:
177	return None # TODO: An exception should handle this case.
178	# [[ uses && while [ uses -a
179	if self.op_id in (Id.Op_DAmp, Id.BoolUnary_a):
180	if not self._Next(): return None
181	right = self.ParseTerm()
182	if not right: return None
183	return ast.LogicalAnd(left, right)
184	else:
185	return left
186
187	def ParseNegatedFactor(self):
188	"""
189	Negated : '!'? Factor
190	"""
191	if self.op_id == Id.KW_Bang:
192	if not self._Next(): return None
193	child = self.ParseFactor()
194	if not child: return None
195	return ast.LogicalNot(child)
196	else:
197	return self.ParseFactor()
198
199	def ParseFactor(self):
200	"""
201	Factor : WORD
202	\| UNARY_OP WORD
203	\| WORD BINARY_OP WORD
204	\| '(' Expr ')'
205	"""
206	if self.b_kind == Kind.BoolUnary:
207	# Just save the type and not the token itself?
208	op = self.op_id
209	if not self._Next(): return None
210	w = self.cur_word
211	if not self._Next(): return None
212	node = ast.BoolUnary(op, w)
213	return node
214
215	if self.b_kind == Kind.Word:
216	# Peek ahead another token.
217	t2 = self._LookAhead()
218	t2_op_id = word.BoolId(t2)
219	t2_b_kind = LookupKind(t2_op_id)
220
221	#log('t2 %s / t2_op_id %s / t2_b_kind %s', t2, t2_op_id, t2_b_kind)
222	# Redir pun for < and >, -a and -o pun
223	if t2_b_kind in (Kind.BoolBinary, Kind.Redir):
224	left = self.cur_word
225
226	if not self._Next(): return None
227	op = self.op_id
228
229	# TODO: Need to change to lex_mode_e.BASH_REGEX.
230	# _Next(lex_mode) then?
231	is_regex = t2_op_id == Id.BoolBinary_EqualTilde
232	if is_regex:
233	if not self._Next(lex_mode=lex_mode_e.BASH_REGEX): return None
234	else:
235	if not self._Next(): return None
236
237	right = self.cur_word
238	if is_regex:
239	# TODO: Quoted parts need to be regex-escaped, e.g. [[ $a =~ "{" ]].
240	# I don't think libc has a function to do this. Escape these
241	# characters:
242	# https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html0
243
244	ok, regex_str, unused_quoted = word.StaticEval(right)
245
246	# doesn't contain $foo, etc.
247	if ok and not libc.regex_parse(regex_str):
248	self.AddErrorContext("Invalid regex: %r" % regex_str, word=right)
249	return None
250
251	if not self._Next(): return None
252	return ast.BoolBinary(op, left, right)
253	else:
254	# [[ foo ]]
255	w = self.cur_word
256	if not self._Next(): return None
257	return ast.WordTest(w)
258
259	if self.op_id == Id.Op_LParen:
260	if not self._Next(): return None
261	node = self.ParseExpr()
262	if self.op_id != Id.Op_RParen:
263	self.AddErrorContext(
264	'Expected ), got %s', self.cur_word, word=self.cur_word)
265	return None
266	if not self._Next(): return None
267	return node
268
269	# TODO: A proper error, e.g. for [[ && ]] or [[ ]]
270	self.AddErrorContext(
271	'Unexpected token: %s' % self.cur_word, word=self.cur_word)
272	return None