OILS / ysh / regex_translate.py View on Github | oilshell.org

393 lines, 243 significant
1#!/usr/bin/env python2
2"""regex_translate.py."""
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import (
6 PosixClass,
7 PerlClass,
8 CharCode,
9 CharRange,
10 char_class_term_e,
11 char_class_term_t,
12 re,
13 re_e,
14 re_repeat,
15 re_repeat_e,
16 EggexFlag,
17 Token,
18)
19from _devbuild.gen.id_kind_asdl import Id
20from _devbuild.gen.value_asdl import value
21from core.error import e_die, p_die
22from frontend import lexer
23from mycpp.mylib import log, tagswitch, switch
24from osh import glob_ # for ExtendedRegexEscape
25
26from typing import List, Optional, TYPE_CHECKING, cast
27
28if TYPE_CHECKING:
29 from _devbuild.gen.syntax_asdl import re_t
30
31from libc import REG_ICASE, REG_NEWLINE
32
33_ = log
34
35PERL_CLASS = {
36 'd': '[:digit:]',
37 # Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
38 'w': '[:alpha:][:digit:]_',
39 # Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
40 's': '[:space:]',
41}
42
43# ERE's in POSIX:
44# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
45#
46# NOTE: They don't support \x00 or \u1234 as in Perl/Python!
47#
48# It's hard to grep for tabs with BRE or ERE syntax. You have to use:
49#
50# (1) a literal tab with Ctrl-V, or
51# (2) bash syntax: grep $'\t' foo.txt
52# (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
53#
54# I ran into this in test/lint.sh !!!
55#
56# https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
57
58# Algorithm:
59# - Unicode escapes in BREs disallowed
60# - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
61#
62# What about utf-8 encoded characters? Those work within literals, but are
63# problematic within character sets. There's no way to disallow those in
64# general though.
65
66CH_RBRACKET = 0x5d
67CH_BACKSLASH = 0x5c
68CH_CARET = 0x5e
69CH_HYPHEN = 0x2d
70
71FLAG_RBRACKET = 0b0001
72FLAG_BACKSLASH = 0b0010
73FLAG_CARET = 0b0100
74FLAG_HYPHEN = 0b1000
75
76
77def _CharCodeToEre(term, parts, special_char_flags):
78 # type: (CharCode, List[str], List[int]) -> None
79 """special_char_flags: list of single int that is mutated."""
80
81 char_int = term.i
82 if char_int >= 128 and term.u_braced:
83 # \u{ff} can't be represented in ERE because we don't know the encoding
84 # \xff can be represented
85 e_die("ERE can't express char code %d" % char_int, term.blame_tok)
86
87 # note: mycpp doesn't handle
88 # special_char_flags[0] |= FLAG_HYPHEN
89 mask = special_char_flags[0]
90
91 if char_int == CH_HYPHEN:
92 mask |= FLAG_HYPHEN
93 elif char_int == CH_CARET:
94 mask |= FLAG_CARET
95 elif char_int == CH_RBRACKET:
96 mask |= FLAG_RBRACKET
97 elif char_int == CH_BACKSLASH:
98 mask |= FLAG_BACKSLASH
99 else:
100 parts.append(chr(char_int))
101
102 special_char_flags[0] = mask
103
104
105def _CharClassTermToEre(term, parts, special_char_flags):
106 # type: (char_class_term_t, List[str], List[int]) -> None
107 """special_char_flags: list of single int that is mutated."""
108
109 UP_term = term
110 with tagswitch(term) as case:
111 if case(char_class_term_e.CharRange):
112 term = cast(CharRange, UP_term)
113
114 # Create our own flags
115 range_no_special = [0]
116
117 _CharCodeToEre(term.start, parts, range_no_special)
118 if range_no_special[0] != 0:
119 e_die(
120 "Can't use char %d as start of range in ERE syntax" %
121 term.start.i, term.start.blame_tok)
122
123 parts.append('-') # a-b
124
125 _CharCodeToEre(term.end, parts, range_no_special)
126 if range_no_special[0] != 0:
127 e_die(
128 "Can't use char %d as end of range in ERE syntax" %
129 term.end.i, term.end.blame_tok)
130
131 elif case(char_class_term_e.CharCode):
132 term = cast(CharCode, UP_term)
133
134 _CharCodeToEre(term, parts, special_char_flags)
135
136 elif case(char_class_term_e.PerlClass):
137 term = cast(PerlClass, UP_term)
138 n = term.name
139 chars = PERL_CLASS[term.name] # looks like '[:digit:]'
140 if term.negated:
141 e_die("Perl classes can't be negated in ERE", term.negated)
142 else:
143 pat = '%s' % chars
144 parts.append(pat)
145
146 elif case(char_class_term_e.PosixClass):
147 term = cast(PosixClass, UP_term)
148 n = term.name # looks like 'digit'
149 if term.negated:
150 e_die("POSIX classes can't be negated in ERE", term.negated)
151 else:
152 pat = '[:%s:]' % n
153 parts.append(pat)
154
155 else:
156 raise AssertionError(term)
157
158
159def _AsPosixEre(node, parts, capture_names):
160 # type: (re_t, List[str], List[Optional[str]]) -> None
161 """Translate an Oil regex to a POSIX ERE.
162
163 Appends to a list of parts that you have to join.
164 """
165 UP_node = node
166 tag = node.tag()
167
168 if tag == re_e.Primitive:
169 node = cast(re.Primitive, UP_node)
170 if node.id == Id.Eggex_Dot:
171 parts.append('.')
172 elif node.id == Id.Eggex_Start:
173 parts.append('^')
174 elif node.id == Id.Eggex_End:
175 parts.append('$')
176 else:
177 raise AssertionError(node.id)
178 return
179
180 if tag == re_e.LiteralChars:
181 node = cast(re.LiteralChars, UP_node)
182 # The bash [[ x =~ "." ]] construct also has to do this
183
184 # TODO: What about \0 and unicode escapes?
185 # Those won't be as LiteralChars I don't think?
186 # Unless you put them there through \0
187 # Maybe DISALLOW those.
188 # "Unprintable chars should be written as \0 or \x00 or \u0000"
189
190 parts.append(glob_.ExtendedRegexEscape(node.s))
191 return
192
193 if tag == re_e.Seq:
194 node = cast(re.Seq, UP_node)
195 for c in node.children:
196 _AsPosixEre(c, parts, capture_names)
197 return
198
199 if tag == re_e.Alt:
200 node = cast(re.Alt, UP_node)
201 for i, c in enumerate(node.children):
202 if i != 0:
203 parts.append('|')
204 _AsPosixEre(c, parts, capture_names)
205 return
206
207 if tag == re_e.Repeat:
208 node = cast(re.Repeat, UP_node)
209 # 'foo' or "foo" or $x or ${x} evaluated to too many chars
210 if node.child.tag() == re_e.LiteralChars:
211 child = cast(re.LiteralChars, node.child)
212 if len(child.s) > 1:
213 # Note: Other regex dialects have non-capturing groups since we don't
214 # need this.
215 e_die(
216 "POSIX EREs don't have groups without capture, so this node "
217 "needs () around it.", child.blame_tok)
218
219 _AsPosixEre(node.child, parts, capture_names)
220 op = node.op
221 op_tag = op.tag()
222 UP_op = op
223
224 if op_tag == re_repeat_e.Op:
225 op = cast(Token, UP_op)
226 with switch(op.id) as case:
227 if case(Id.Arith_Plus):
228 parts.append('+')
229 elif case(Id.Arith_Star):
230 parts.append('*')
231 elif case(Id.Arith_QMark):
232 parts.append('?')
233 elif case(Id.Expr_DecInt):
234 parts.append('{%s}' % lexer.LazyStr(op))
235 else:
236 raise AssertionError(op.id)
237 return
238
239 if op_tag == re_repeat_e.Range:
240 op = cast(re_repeat.Range, UP_op)
241 parts.append('{%s,%s}' % (op.lower, op.upper))
242 return
243
244 raise NotImplementedError(op_tag)
245
246 # Special case for familiarity: () is acceptable as a group in ERE
247 if tag == re_e.Group:
248 node = cast(re.Group, UP_node)
249
250 # placeholder so we know this group is numbered, but not named
251 capture_names.append(None)
252
253 parts.append('(')
254 _AsPosixEre(node.child, parts, capture_names)
255 parts.append(')')
256 return
257
258 if tag == re_e.Capture:
259 node = cast(re.Capture, UP_node)
260
261 # Collect in order of ( appearance
262 # TODO: get the name string, and type string
263
264 capture_str = lexer.TokenVal(node.name) if node.name else None
265 capture_names.append(capture_str)
266
267 parts.append('(')
268 _AsPosixEre(node.child, parts, capture_names)
269 parts.append(')')
270 return
271
272 if tag == re_e.PerlClass:
273 node = cast(PerlClass, UP_node)
274 n = node.name
275 chars = PERL_CLASS[node.name] # looks like [:digit:]
276 if node.negated:
277 pat = '[^%s]' % chars
278 else:
279 pat = '[%s]' % chars
280 parts.append(pat)
281 return
282
283 if tag == re_e.PosixClass:
284 node = cast(PosixClass, UP_node)
285 n = node.name # looks like 'digit'
286 if node.negated:
287 pat = '[^[:%s:]]' % n
288 else:
289 pat = '[[:%s:]]' % n
290 parts.append(pat)
291 return
292
293 if tag == re_e.CharClass:
294 node = cast(re.CharClass, UP_node)
295
296 # HYPHEN CARET RBRACKET BACKSLASH
297 special_char_flags = [0]
298 non_special_parts = [] # type: List[str]
299
300 for term in node.terms:
301 _CharClassTermToEre(term, non_special_parts, special_char_flags)
302
303 parts.append('[')
304 if node.negated:
305 parts.append('^')
306
307 # Help the user with some of terrible corner cases
308
309 # - move literal - to end [ab-] not [a-b]
310 # - move literal ^ to end [x^-] not [^x-]
311 # - move literal ] to beginning: []x] not [x]]
312 # - double up \\ because of Gawk extension [\\]
313
314 if special_char_flags[0] & FLAG_RBRACKET:
315 parts.append(']')
316
317 parts.extend(non_special_parts)
318
319 if special_char_flags[0] & FLAG_BACKSLASH:
320 parts.append('\\\\') # TWO backslashes
321
322 if special_char_flags[0] & FLAG_CARET:
323 parts.append('^')
324
325 if special_char_flags[0] & FLAG_HYPHEN:
326 parts.append('-')
327
328 parts.append(']')
329 return
330
331 raise NotImplementedError(tag)
332
333
334def AsPosixEre(eggex):
335 # type: (value.Eggex) -> str
336 """
337 Lazily fills in fields on the value.Eggex argument.
338 """
339 if eggex.as_ere is not None:
340 return eggex.as_ere
341
342 parts = [] # type: List[str]
343 _AsPosixEre(eggex.spliced, parts, eggex.capture_names)
344
345 # These are both indexed by group number, with None for the holes
346 # List[str?] vs. List[value?]
347 assert len(eggex.capture_names) == len(eggex.convert_funcs)
348
349 eggex.as_ere = ''.join(parts)
350
351 return eggex.as_ere
352
353
354def CanonicalFlags(flags):
355 # type: (List[EggexFlag]) -> str
356 """
357 Raises PARSE error on invalid flags.
358
359 In theory we could encode directly to integers like REG_ICASE, but a string
360 like like 'i' makes the error message slightly more legible.
361 """
362 letters = [] # type: List[str]
363 for flag in flags:
364 if flag.negated:
365 p_die("Flag can't be negated", flag.flag)
366 flag_name = lexer.TokenVal(flag.flag)
367 if flag_name in ('i', 'reg_icase'):
368 letters.append('i')
369 elif flag_name == 'reg_newline':
370 letters.append('n')
371 else:
372 p_die("Invalid regex flag %r" % flag_name, flag.flag)
373
374 # Normalize for comparison
375 letters.sort()
376 return ''.join(letters)
377
378
379def LibcFlags(canonical_flags):
380 # type: (Optional[str]) -> int
381 if canonical_flags is None:
382 return 0
383
384 libc_flags = 0
385 for ch in canonical_flags:
386 if ch == 'i':
387 libc_flags |= REG_ICASE
388 elif ch == 'n':
389 libc_flags |= REG_NEWLINE
390 else:
391 # regex_translate should prevent this
392 raise AssertionError()
393 return libc_flags