ysh/regex_translate.py

OILS / ysh / regex_translate.py View on Github | oilshell.org

393 lines, 243 significant

1	#!/usr/bin/env python2
2	"""regex_translate.py."""
3	from __future__ import print_function
4
5	from _devbuild.gen.syntax_asdl import (
6	PosixClass,
7	PerlClass,
8	CharCode,
9	CharRange,
10	char_class_term_e,
11	char_class_term_t,
12	re,
13	re_e,
14	re_repeat,
15	re_repeat_e,
16	EggexFlag,
17	Token,
18	)
19	from _devbuild.gen.id_kind_asdl import Id
20	from _devbuild.gen.value_asdl import value
21	from core.error import e_die, p_die
22	from frontend import lexer
23	from mycpp.mylib import log, tagswitch, switch
24	from osh import glob_ # for ExtendedRegexEscape
25
26	from typing import List, Optional, TYPE_CHECKING, cast
27
28	if TYPE_CHECKING:
29	from _devbuild.gen.syntax_asdl import re_t
30
31	from libc import REG_ICASE, REG_NEWLINE
32
33	_ = log
34
35	PERL_CLASS = {
36	'd': '[:digit:]',
37	# Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
38	'w': '[:alpha:][:digit:]_',
39	# Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
40	's': '[:space:]',
41	}
42
43	# ERE's in POSIX:
44	# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
45	#
46	# NOTE: They don't support \x00 or \u1234 as in Perl/Python!
47	#
48	# It's hard to grep for tabs with BRE or ERE syntax. You have to use:
49	#
50	# (1) a literal tab with Ctrl-V, or
51	# (2) bash syntax: grep $'\t' foo.txt
52	# (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
53	#
54	# I ran into this in test/lint.sh !!!
55	#
56	# https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
57
58	# Algorithm:
59	# - Unicode escapes in BREs disallowed
60	# - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
61	#
62	# What about utf-8 encoded characters? Those work within literals, but are
63	# problematic within character sets. There's no way to disallow those in
64	# general though.
65
66	CH_RBRACKET = 0x5d
67	CH_BACKSLASH = 0x5c
68	CH_CARET = 0x5e
69	CH_HYPHEN = 0x2d
70
71	FLAG_RBRACKET = 0b0001
72	FLAG_BACKSLASH = 0b0010
73	FLAG_CARET = 0b0100
74	FLAG_HYPHEN = 0b1000
75
76
77	def _CharCodeToEre(term, parts, special_char_flags):
78	# type: (CharCode, List[str], List[int]) -> None
79	"""special_char_flags: list of single int that is mutated."""
80
81	char_int = term.i
82	if char_int >= 128 and term.u_braced:
83	# \u{ff} can't be represented in ERE because we don't know the encoding
84	# \xff can be represented
85	e_die("ERE can't express char code %d" % char_int, term.blame_tok)
86
87	# note: mycpp doesn't handle
88	# special_char_flags[0] \|= FLAG_HYPHEN
89	mask = special_char_flags[0]
90
91	if char_int == CH_HYPHEN:
92	mask \|= FLAG_HYPHEN
93	elif char_int == CH_CARET:
94	mask \|= FLAG_CARET
95	elif char_int == CH_RBRACKET:
96	mask \|= FLAG_RBRACKET
97	elif char_int == CH_BACKSLASH:
98	mask \|= FLAG_BACKSLASH
99	else:
100	parts.append(chr(char_int))
101
102	special_char_flags[0] = mask
103
104
105	def _CharClassTermToEre(term, parts, special_char_flags):
106	# type: (char_class_term_t, List[str], List[int]) -> None
107	"""special_char_flags: list of single int that is mutated."""
108
109	UP_term = term
110	with tagswitch(term) as case:
111	if case(char_class_term_e.CharRange):
112	term = cast(CharRange, UP_term)
113
114	# Create our own flags
115	range_no_special = [0]
116
117	_CharCodeToEre(term.start, parts, range_no_special)
118	if range_no_special[0] != 0:
119	e_die(
120	"Can't use char %d as start of range in ERE syntax" %
121	term.start.i, term.start.blame_tok)
122
123	parts.append('-') # a-b
124
125	_CharCodeToEre(term.end, parts, range_no_special)
126	if range_no_special[0] != 0:
127	e_die(
128	"Can't use char %d as end of range in ERE syntax" %
129	term.end.i, term.end.blame_tok)
130
131	elif case(char_class_term_e.CharCode):
132	term = cast(CharCode, UP_term)
133
134	_CharCodeToEre(term, parts, special_char_flags)
135
136	elif case(char_class_term_e.PerlClass):
137	term = cast(PerlClass, UP_term)
138	n = term.name
139	chars = PERL_CLASS[term.name] # looks like '[:digit:]'
140	if term.negated:
141	e_die("Perl classes can't be negated in ERE", term.negated)
142	else:
143	pat = '%s' % chars
144	parts.append(pat)
145
146	elif case(char_class_term_e.PosixClass):
147	term = cast(PosixClass, UP_term)
148	n = term.name # looks like 'digit'
149	if term.negated:
150	e_die("POSIX classes can't be negated in ERE", term.negated)
151	else:
152	pat = '[:%s:]' % n
153	parts.append(pat)
154
155	else:
156	raise AssertionError(term)
157
158
159	def _AsPosixEre(node, parts, capture_names):
160	# type: (re_t, List[str], List[Optional[str]]) -> None
161	"""Translate an Oil regex to a POSIX ERE.
162
163	Appends to a list of parts that you have to join.
164	"""
165	UP_node = node
166	tag = node.tag()
167
168	if tag == re_e.Primitive:
169	node = cast(re.Primitive, UP_node)
170	if node.id == Id.Eggex_Dot:
171	parts.append('.')
172	elif node.id == Id.Eggex_Start:
173	parts.append('^')
174	elif node.id == Id.Eggex_End:
175	parts.append('$')
176	else:
177	raise AssertionError(node.id)
178	return
179
180	if tag == re_e.LiteralChars:
181	node = cast(re.LiteralChars, UP_node)
182	# The bash [[ x =~ "." ]] construct also has to do this
183
184	# TODO: What about \0 and unicode escapes?
185	# Those won't be as LiteralChars I don't think?
186	# Unless you put them there through \0
187	# Maybe DISALLOW those.
188	# "Unprintable chars should be written as \0 or \x00 or \u0000"
189
190	parts.append(glob_.ExtendedRegexEscape(node.s))
191	return
192
193	if tag == re_e.Seq:
194	node = cast(re.Seq, UP_node)
195	for c in node.children:
196	_AsPosixEre(c, parts, capture_names)
197	return
198
199	if tag == re_e.Alt:
200	node = cast(re.Alt, UP_node)
201	for i, c in enumerate(node.children):
202	if i != 0:
203	parts.append('\|')
204	_AsPosixEre(c, parts, capture_names)
205	return
206
207	if tag == re_e.Repeat:
208	node = cast(re.Repeat, UP_node)
209	# 'foo' or "foo" or $x or ${x} evaluated to too many chars
210	if node.child.tag() == re_e.LiteralChars:
211	child = cast(re.LiteralChars, node.child)
212	if len(child.s) > 1:
213	# Note: Other regex dialects have non-capturing groups since we don't
214	# need this.
215	e_die(
216	"POSIX EREs don't have groups without capture, so this node "
217	"needs () around it.", child.blame_tok)
218
219	_AsPosixEre(node.child, parts, capture_names)
220	op = node.op
221	op_tag = op.tag()
222	UP_op = op
223
224	if op_tag == re_repeat_e.Op:
225	op = cast(Token, UP_op)
226	with switch(op.id) as case:
227	if case(Id.Arith_Plus):
228	parts.append('+')
229	elif case(Id.Arith_Star):
230	parts.append('*')
231	elif case(Id.Arith_QMark):
232	parts.append('?')
233	elif case(Id.Expr_DecInt):
234	parts.append('{%s}' % lexer.LazyStr(op))
235	else:
236	raise AssertionError(op.id)
237	return
238
239	if op_tag == re_repeat_e.Range:
240	op = cast(re_repeat.Range, UP_op)
241	parts.append('{%s,%s}' % (op.lower, op.upper))
242	return
243
244	raise NotImplementedError(op_tag)
245
246	# Special case for familiarity: () is acceptable as a group in ERE
247	if tag == re_e.Group:
248	node = cast(re.Group, UP_node)
249
250	# placeholder so we know this group is numbered, but not named
251	capture_names.append(None)
252
253	parts.append('(')
254	_AsPosixEre(node.child, parts, capture_names)
255	parts.append(')')
256	return
257
258	if tag == re_e.Capture:
259	node = cast(re.Capture, UP_node)
260
261	# Collect in order of ( appearance
262	# TODO: get the name string, and type string
263
264	capture_str = lexer.TokenVal(node.name) if node.name else None
265	capture_names.append(capture_str)
266
267	parts.append('(')
268	_AsPosixEre(node.child, parts, capture_names)
269	parts.append(')')
270	return
271
272	if tag == re_e.PerlClass:
273	node = cast(PerlClass, UP_node)
274	n = node.name
275	chars = PERL_CLASS[node.name] # looks like [:digit:]
276	if node.negated:
277	pat = '[^%s]' % chars
278	else:
279	pat = '[%s]' % chars
280	parts.append(pat)
281	return
282
283	if tag == re_e.PosixClass:
284	node = cast(PosixClass, UP_node)
285	n = node.name # looks like 'digit'
286	if node.negated:
287	pat = '[^[:%s:]]' % n
288	else:
289	pat = '[[:%s:]]' % n
290	parts.append(pat)
291	return
292
293	if tag == re_e.CharClass:
294	node = cast(re.CharClass, UP_node)
295
296	# HYPHEN CARET RBRACKET BACKSLASH
297	special_char_flags = [0]
298	non_special_parts = [] # type: List[str]
299
300	for term in node.terms:
301	_CharClassTermToEre(term, non_special_parts, special_char_flags)
302
303	parts.append('[')
304	if node.negated:
305	parts.append('^')
306
307	# Help the user with some of terrible corner cases
308
309	# - move literal - to end [ab-] not [a-b]
310	# - move literal ^ to end [x^-] not [^x-]
311	# - move literal ] to beginning: []x] not [x]]
312	# - double up \\ because of Gawk extension [\\]
313
314	if special_char_flags[0] & FLAG_RBRACKET:
315	parts.append(']')
316
317	parts.extend(non_special_parts)
318
319	if special_char_flags[0] & FLAG_BACKSLASH:
320	parts.append('\\\\') # TWO backslashes
321
322	if special_char_flags[0] & FLAG_CARET:
323	parts.append('^')
324
325	if special_char_flags[0] & FLAG_HYPHEN:
326	parts.append('-')
327
328	parts.append(']')
329	return
330
331	raise NotImplementedError(tag)
332
333
334	def AsPosixEre(eggex):
335	# type: (value.Eggex) -> str
336	"""
337	Lazily fills in fields on the value.Eggex argument.
338	"""
339	if eggex.as_ere is not None:
340	return eggex.as_ere
341
342	parts = [] # type: List[str]
343	_AsPosixEre(eggex.spliced, parts, eggex.capture_names)
344
345	# These are both indexed by group number, with None for the holes
346	# List[str?] vs. List[value?]
347	assert len(eggex.capture_names) == len(eggex.convert_funcs)
348
349	eggex.as_ere = ''.join(parts)
350
351	return eggex.as_ere
352
353
354	def CanonicalFlags(flags):
355	# type: (List[EggexFlag]) -> str
356	"""
357	Raises PARSE error on invalid flags.
358
359	In theory we could encode directly to integers like REG_ICASE, but a string
360	like like 'i' makes the error message slightly more legible.
361	"""
362	letters = [] # type: List[str]
363	for flag in flags:
364	if flag.negated:
365	p_die("Flag can't be negated", flag.flag)
366	flag_name = lexer.TokenVal(flag.flag)
367	if flag_name in ('i', 'reg_icase'):
368	letters.append('i')
369	elif flag_name == 'reg_newline':
370	letters.append('n')
371	else:
372	p_die("Invalid regex flag %r" % flag_name, flag.flag)
373
374	# Normalize for comparison
375	letters.sort()
376	return ''.join(letters)
377
378
379	def LibcFlags(canonical_flags):
380	# type: (Optional[str]) -> int
381	if canonical_flags is None:
382	return 0
383
384	libc_flags = 0
385	for ch in canonical_flags:
386	if ch == 'i':
387	libc_flags \|= REG_ICASE
388	elif ch == 'n':
389	libc_flags \|= REG_NEWLINE
390	else:
391	# regex_translate should prevent this
392	raise AssertionError()
393	return libc_flags