| 1 | #!/usr/bin/env python2
 | 
| 2 | """regex_translate.py."""
 | 
| 3 | from __future__ import print_function
 | 
| 4 | 
 | 
| 5 | from _devbuild.gen.syntax_asdl import (
 | 
| 6 |     PosixClass,
 | 
| 7 |     PerlClass,
 | 
| 8 |     CharCode,
 | 
| 9 |     CharRange,
 | 
| 10 |     char_class_term_e,
 | 
| 11 |     char_class_term_t,
 | 
| 12 |     re,
 | 
| 13 |     re_e,
 | 
| 14 |     re_repeat,
 | 
| 15 |     re_repeat_e,
 | 
| 16 |     EggexFlag,
 | 
| 17 |     Token,
 | 
| 18 | )
 | 
| 19 | from _devbuild.gen.id_kind_asdl import Id
 | 
| 20 | from _devbuild.gen.value_asdl import value
 | 
| 21 | from core.error import e_die, p_die
 | 
| 22 | from frontend import lexer
 | 
| 23 | from mycpp.mylib import log, tagswitch, switch
 | 
| 24 | from osh import glob_  # for ExtendedRegexEscape
 | 
| 25 | 
 | 
| 26 | from typing import List, Optional, TYPE_CHECKING, cast
 | 
| 27 | 
 | 
| 28 | if TYPE_CHECKING:
 | 
| 29 |     from _devbuild.gen.syntax_asdl import re_t
 | 
| 30 | 
 | 
| 31 | from libc import REG_ICASE, REG_NEWLINE
 | 
| 32 | 
 | 
| 33 | _ = log
 | 
| 34 | 
 | 
| 35 | PERL_CLASS = {
 | 
| 36 |     'd': '[:digit:]',
 | 
| 37 |     # Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
 | 
| 38 |     'w': '[:alpha:][:digit:]_',
 | 
| 39 |     # Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
 | 
| 40 |     's': '[:space:]',
 | 
| 41 | }
 | 
| 42 | 
 | 
| 43 | # ERE's in POSIX:
 | 
| 44 | # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
 | 
| 45 | #
 | 
| 46 | # NOTE: They don't support \x00 or \u1234 as in Perl/Python!
 | 
| 47 | #
 | 
| 48 | # It's hard to grep for tabs with BRE or ERE syntax.  You have to use:
 | 
| 49 | #
 | 
| 50 | # (1) a literal tab with Ctrl-V, or
 | 
| 51 | # (2) bash syntax:  grep $'\t' foo.txt
 | 
| 52 | # (3) POSIX shell syntax:  grep "$(echo -e '\t')" foo.txt.
 | 
| 53 | #
 | 
| 54 | # I ran into this in test/lint.sh !!!
 | 
| 55 | #
 | 
| 56 | # https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
 | 
| 57 | 
 | 
| 58 | # Algorithm:
 | 
| 59 | # - Unicode escapes in BREs disallowed
 | 
| 60 | # - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
 | 
| 61 | #
 | 
| 62 | # What about utf-8 encoded characters?  Those work within literals, but are
 | 
| 63 | # problematic within character sets.  There's no way to disallow those in
 | 
| 64 | # general though.
 | 
| 65 | 
 | 
| 66 | CH_RBRACKET = 0x5d
 | 
| 67 | CH_BACKSLASH = 0x5c
 | 
| 68 | CH_CARET = 0x5e
 | 
| 69 | CH_HYPHEN = 0x2d
 | 
| 70 | 
 | 
| 71 | FLAG_RBRACKET = 0b0001
 | 
| 72 | FLAG_BACKSLASH = 0b0010
 | 
| 73 | FLAG_CARET = 0b0100
 | 
| 74 | FLAG_HYPHEN = 0b1000
 | 
| 75 | 
 | 
| 76 | 
 | 
| 77 | def _CharCodeToEre(term, parts, special_char_flags):
 | 
| 78 |     # type: (CharCode, List[str], List[int]) -> None
 | 
| 79 |     """special_char_flags: list of single int that is mutated."""
 | 
| 80 | 
 | 
| 81 |     char_int = term.i
 | 
| 82 |     if char_int >= 128 and term.u_braced:
 | 
| 83 |         # \u{ff} can't be represented in ERE because we don't know the encoding
 | 
| 84 |         # \xff can be represented
 | 
| 85 |         e_die("ERE can't express char code %d" % char_int, term.blame_tok)
 | 
| 86 | 
 | 
| 87 |     # note: mycpp doesn't handle
 | 
| 88 |     # special_char_flags[0] |= FLAG_HYPHEN
 | 
| 89 |     mask = special_char_flags[0]
 | 
| 90 | 
 | 
| 91 |     if char_int == CH_HYPHEN:
 | 
| 92 |         mask |= FLAG_HYPHEN
 | 
| 93 |     elif char_int == CH_CARET:
 | 
| 94 |         mask |= FLAG_CARET
 | 
| 95 |     elif char_int == CH_RBRACKET:
 | 
| 96 |         mask |= FLAG_RBRACKET
 | 
| 97 |     elif char_int == CH_BACKSLASH:
 | 
| 98 |         mask |= FLAG_BACKSLASH
 | 
| 99 |     else:
 | 
| 100 |         parts.append(chr(char_int))
 | 
| 101 | 
 | 
| 102 |     special_char_flags[0] = mask
 | 
| 103 | 
 | 
| 104 | 
 | 
| 105 | def _CharClassTermToEre(term, parts, special_char_flags):
 | 
| 106 |     # type: (char_class_term_t, List[str], List[int]) -> None
 | 
| 107 |     """special_char_flags: list of single int that is mutated."""
 | 
| 108 | 
 | 
| 109 |     UP_term = term
 | 
| 110 |     with tagswitch(term) as case:
 | 
| 111 |         if case(char_class_term_e.CharRange):
 | 
| 112 |             term = cast(CharRange, UP_term)
 | 
| 113 | 
 | 
| 114 |             # Create our own flags
 | 
| 115 |             range_no_special = [0]
 | 
| 116 | 
 | 
| 117 |             _CharCodeToEre(term.start, parts, range_no_special)
 | 
| 118 |             if range_no_special[0] != 0:
 | 
| 119 |                 e_die(
 | 
| 120 |                     "Can't use char %d as start of range in ERE syntax" %
 | 
| 121 |                     term.start.i, term.start.blame_tok)
 | 
| 122 | 
 | 
| 123 |             parts.append('-')  # a-b
 | 
| 124 | 
 | 
| 125 |             _CharCodeToEre(term.end, parts, range_no_special)
 | 
| 126 |             if range_no_special[0] != 0:
 | 
| 127 |                 e_die(
 | 
| 128 |                     "Can't use char %d as end of range in ERE syntax" %
 | 
| 129 |                     term.end.i, term.end.blame_tok)
 | 
| 130 | 
 | 
| 131 |         elif case(char_class_term_e.CharCode):
 | 
| 132 |             term = cast(CharCode, UP_term)
 | 
| 133 | 
 | 
| 134 |             _CharCodeToEre(term, parts, special_char_flags)
 | 
| 135 | 
 | 
| 136 |         elif case(char_class_term_e.PerlClass):
 | 
| 137 |             term = cast(PerlClass, UP_term)
 | 
| 138 |             n = term.name
 | 
| 139 |             chars = PERL_CLASS[term.name]  # looks like '[:digit:]'
 | 
| 140 |             if term.negated:
 | 
| 141 |                 e_die("Perl classes can't be negated in ERE", term.negated)
 | 
| 142 |             else:
 | 
| 143 |                 pat = '%s' % chars
 | 
| 144 |             parts.append(pat)
 | 
| 145 | 
 | 
| 146 |         elif case(char_class_term_e.PosixClass):
 | 
| 147 |             term = cast(PosixClass, UP_term)
 | 
| 148 |             n = term.name  # looks like 'digit'
 | 
| 149 |             if term.negated:
 | 
| 150 |                 e_die("POSIX classes can't be negated in ERE", term.negated)
 | 
| 151 |             else:
 | 
| 152 |                 pat = '[:%s:]' % n
 | 
| 153 |             parts.append(pat)
 | 
| 154 | 
 | 
| 155 |         else:
 | 
| 156 |             raise AssertionError(term)
 | 
| 157 | 
 | 
| 158 | 
 | 
| 159 | def _AsPosixEre(node, parts, capture_names):
 | 
| 160 |     # type: (re_t, List[str], List[Optional[str]]) -> None
 | 
| 161 |     """Translate an Oil regex to a POSIX ERE.
 | 
| 162 | 
 | 
| 163 |     Appends to a list of parts that you have to join.
 | 
| 164 |     """
 | 
| 165 |     UP_node = node
 | 
| 166 |     tag = node.tag()
 | 
| 167 | 
 | 
| 168 |     if tag == re_e.Primitive:
 | 
| 169 |         node = cast(re.Primitive, UP_node)
 | 
| 170 |         if node.id == Id.Eggex_Dot:
 | 
| 171 |             parts.append('.')
 | 
| 172 |         elif node.id == Id.Eggex_Start:
 | 
| 173 |             parts.append('^')
 | 
| 174 |         elif node.id == Id.Eggex_End:
 | 
| 175 |             parts.append('$')
 | 
| 176 |         else:
 | 
| 177 |             raise AssertionError(node.id)
 | 
| 178 |         return
 | 
| 179 | 
 | 
| 180 |     if tag == re_e.LiteralChars:
 | 
| 181 |         node = cast(re.LiteralChars, UP_node)
 | 
| 182 |         # The bash [[ x =~ "." ]] construct also has to do this
 | 
| 183 | 
 | 
| 184 |         # TODO: What about \0 and unicode escapes?
 | 
| 185 |         # Those won't be as LiteralChars I don't think?
 | 
| 186 |         # Unless you put them there through \0
 | 
| 187 |         # Maybe DISALLOW those.
 | 
| 188 |         # "Unprintable chars should be written as \0 or \x00 or \u0000"
 | 
| 189 | 
 | 
| 190 |         parts.append(glob_.ExtendedRegexEscape(node.s))
 | 
| 191 |         return
 | 
| 192 | 
 | 
| 193 |     if tag == re_e.Seq:
 | 
| 194 |         node = cast(re.Seq, UP_node)
 | 
| 195 |         for c in node.children:
 | 
| 196 |             _AsPosixEre(c, parts, capture_names)
 | 
| 197 |         return
 | 
| 198 | 
 | 
| 199 |     if tag == re_e.Alt:
 | 
| 200 |         node = cast(re.Alt, UP_node)
 | 
| 201 |         for i, c in enumerate(node.children):
 | 
| 202 |             if i != 0:
 | 
| 203 |                 parts.append('|')
 | 
| 204 |             _AsPosixEre(c, parts, capture_names)
 | 
| 205 |         return
 | 
| 206 | 
 | 
| 207 |     if tag == re_e.Repeat:
 | 
| 208 |         node = cast(re.Repeat, UP_node)
 | 
| 209 |         # 'foo' or "foo" or $x or ${x} evaluated to too many chars
 | 
| 210 |         if node.child.tag() == re_e.LiteralChars:
 | 
| 211 |             child = cast(re.LiteralChars, node.child)
 | 
| 212 |             if len(child.s) > 1:
 | 
| 213 |                 # Note: Other regex dialects have non-capturing groups since we don't
 | 
| 214 |                 # need this.
 | 
| 215 |                 e_die(
 | 
| 216 |                     "POSIX EREs don't have groups without capture, so this node "
 | 
| 217 |                     "needs () around it.", child.blame_tok)
 | 
| 218 | 
 | 
| 219 |         _AsPosixEre(node.child, parts, capture_names)
 | 
| 220 |         op = node.op
 | 
| 221 |         op_tag = op.tag()
 | 
| 222 |         UP_op = op
 | 
| 223 | 
 | 
| 224 |         if op_tag == re_repeat_e.Op:
 | 
| 225 |             op = cast(Token, UP_op)
 | 
| 226 |             with switch(op.id) as case:
 | 
| 227 |                 if case(Id.Arith_Plus):
 | 
| 228 |                     parts.append('+')
 | 
| 229 |                 elif case(Id.Arith_Star):
 | 
| 230 |                     parts.append('*')
 | 
| 231 |                 elif case(Id.Arith_QMark):
 | 
| 232 |                     parts.append('?')
 | 
| 233 |                 elif case(Id.Expr_DecInt):
 | 
| 234 |                     parts.append('{%s}' % lexer.LazyStr(op))
 | 
| 235 |                 else:
 | 
| 236 |                     raise AssertionError(op.id)
 | 
| 237 |             return
 | 
| 238 | 
 | 
| 239 |         if op_tag == re_repeat_e.Range:
 | 
| 240 |             op = cast(re_repeat.Range, UP_op)
 | 
| 241 |             parts.append('{%s,%s}' % (op.lower, op.upper))
 | 
| 242 |             return
 | 
| 243 | 
 | 
| 244 |         raise NotImplementedError(op_tag)
 | 
| 245 | 
 | 
| 246 |     # Special case for familiarity: () is acceptable as a group in ERE
 | 
| 247 |     if tag == re_e.Group:
 | 
| 248 |         node = cast(re.Group, UP_node)
 | 
| 249 | 
 | 
| 250 |         # placeholder so we know this group is numbered, but not named
 | 
| 251 |         capture_names.append(None)
 | 
| 252 | 
 | 
| 253 |         parts.append('(')
 | 
| 254 |         _AsPosixEre(node.child, parts, capture_names)
 | 
| 255 |         parts.append(')')
 | 
| 256 |         return
 | 
| 257 | 
 | 
| 258 |     if tag == re_e.Capture:
 | 
| 259 |         node = cast(re.Capture, UP_node)
 | 
| 260 | 
 | 
| 261 |         # Collect in order of ( appearance
 | 
| 262 |         # TODO: get the name string, and type string
 | 
| 263 | 
 | 
| 264 |         capture_str = lexer.TokenVal(node.name) if node.name else None
 | 
| 265 |         capture_names.append(capture_str)
 | 
| 266 | 
 | 
| 267 |         parts.append('(')
 | 
| 268 |         _AsPosixEre(node.child, parts, capture_names)
 | 
| 269 |         parts.append(')')
 | 
| 270 |         return
 | 
| 271 | 
 | 
| 272 |     if tag == re_e.PerlClass:
 | 
| 273 |         node = cast(PerlClass, UP_node)
 | 
| 274 |         n = node.name
 | 
| 275 |         chars = PERL_CLASS[node.name]  # looks like [:digit:]
 | 
| 276 |         if node.negated:
 | 
| 277 |             pat = '[^%s]' % chars
 | 
| 278 |         else:
 | 
| 279 |             pat = '[%s]' % chars
 | 
| 280 |         parts.append(pat)
 | 
| 281 |         return
 | 
| 282 | 
 | 
| 283 |     if tag == re_e.PosixClass:
 | 
| 284 |         node = cast(PosixClass, UP_node)
 | 
| 285 |         n = node.name  # looks like 'digit'
 | 
| 286 |         if node.negated:
 | 
| 287 |             pat = '[^[:%s:]]' % n
 | 
| 288 |         else:
 | 
| 289 |             pat = '[[:%s:]]' % n
 | 
| 290 |         parts.append(pat)
 | 
| 291 |         return
 | 
| 292 | 
 | 
| 293 |     if tag == re_e.CharClass:
 | 
| 294 |         node = cast(re.CharClass, UP_node)
 | 
| 295 | 
 | 
| 296 |         # HYPHEN CARET RBRACKET BACKSLASH
 | 
| 297 |         special_char_flags = [0]
 | 
| 298 |         non_special_parts = []  # type: List[str]
 | 
| 299 | 
 | 
| 300 |         for term in node.terms:
 | 
| 301 |             _CharClassTermToEre(term, non_special_parts, special_char_flags)
 | 
| 302 | 
 | 
| 303 |         parts.append('[')
 | 
| 304 |         if node.negated:
 | 
| 305 |             parts.append('^')
 | 
| 306 | 
 | 
| 307 |         # Help the user with some of terrible corner cases
 | 
| 308 | 
 | 
| 309 |         # - move literal - to end        [ab-] not [a-b]
 | 
| 310 |         # - move literal ^ to end        [x^-] not [^x-]
 | 
| 311 |         # - move literal ] to beginning: []x] not [x]]
 | 
| 312 |         # - double up \\ because of Gawk extension [\\]
 | 
| 313 | 
 | 
| 314 |         if special_char_flags[0] & FLAG_RBRACKET:
 | 
| 315 |             parts.append(']')
 | 
| 316 | 
 | 
| 317 |         parts.extend(non_special_parts)
 | 
| 318 | 
 | 
| 319 |         if special_char_flags[0] & FLAG_BACKSLASH:
 | 
| 320 |             parts.append('\\\\')  # TWO backslashes
 | 
| 321 | 
 | 
| 322 |         if special_char_flags[0] & FLAG_CARET:
 | 
| 323 |             parts.append('^')
 | 
| 324 | 
 | 
| 325 |         if special_char_flags[0] & FLAG_HYPHEN:
 | 
| 326 |             parts.append('-')
 | 
| 327 | 
 | 
| 328 |         parts.append(']')
 | 
| 329 |         return
 | 
| 330 | 
 | 
| 331 |     raise NotImplementedError(tag)
 | 
| 332 | 
 | 
| 333 | 
 | 
| 334 | def AsPosixEre(eggex):
 | 
| 335 |     # type: (value.Eggex) -> str
 | 
| 336 |     """
 | 
| 337 |     Lazily fills in fields on the value.Eggex argument.
 | 
| 338 |     """
 | 
| 339 |     if eggex.as_ere is not None:
 | 
| 340 |         return eggex.as_ere
 | 
| 341 | 
 | 
| 342 |     parts = []  # type: List[str]
 | 
| 343 |     _AsPosixEre(eggex.spliced, parts, eggex.capture_names)
 | 
| 344 | 
 | 
| 345 |     # These are both indexed by group number, with None for the holes
 | 
| 346 |     # List[str?] vs. List[value?]
 | 
| 347 |     assert len(eggex.capture_names) == len(eggex.convert_funcs)
 | 
| 348 | 
 | 
| 349 |     eggex.as_ere = ''.join(parts)
 | 
| 350 | 
 | 
| 351 |     return eggex.as_ere
 | 
| 352 | 
 | 
| 353 | 
 | 
| 354 | def CanonicalFlags(flags):
 | 
| 355 |     # type: (List[EggexFlag]) -> str
 | 
| 356 |     """
 | 
| 357 |     Raises PARSE error on invalid flags.
 | 
| 358 | 
 | 
| 359 |     In theory we could encode directly to integers like REG_ICASE, but a string
 | 
| 360 |     like like 'i' makes the error message slightly more legible.
 | 
| 361 |     """
 | 
| 362 |     letters = []  # type: List[str]
 | 
| 363 |     for flag in flags:
 | 
| 364 |         if flag.negated:
 | 
| 365 |             p_die("Flag can't be negated", flag.flag)
 | 
| 366 |         flag_name = lexer.TokenVal(flag.flag)
 | 
| 367 |         if flag_name in ('i', 'reg_icase'):
 | 
| 368 |             letters.append('i')
 | 
| 369 |         elif flag_name == 'reg_newline':
 | 
| 370 |             letters.append('n')
 | 
| 371 |         else:
 | 
| 372 |             p_die("Invalid regex flag %r" % flag_name, flag.flag)
 | 
| 373 | 
 | 
| 374 |     # Normalize for comparison
 | 
| 375 |     letters.sort()
 | 
| 376 |     return ''.join(letters)
 | 
| 377 | 
 | 
| 378 | 
 | 
| 379 | def LibcFlags(canonical_flags):
 | 
| 380 |     # type: (Optional[str]) -> int
 | 
| 381 |     if canonical_flags is None:
 | 
| 382 |         return 0
 | 
| 383 | 
 | 
| 384 |     libc_flags = 0
 | 
| 385 |     for ch in canonical_flags:
 | 
| 386 |         if ch == 'i':
 | 
| 387 |             libc_flags |= REG_ICASE
 | 
| 388 |         elif ch == 'n':
 | 
| 389 |             libc_flags |= REG_NEWLINE
 | 
| 390 |         else:
 | 
| 391 |             # regex_translate should prevent this
 | 
| 392 |             raise AssertionError()
 | 
| 393 |     return libc_flags
 |