1 | from __future__ import print_function # for OPy compiler
|
2 |
|
3 | #
|
4 | # Secret Labs' Regular Expression Engine
|
5 | #
|
6 | # convert re-style regular expression to sre pattern
|
7 | #
|
8 | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
9 | #
|
10 | # See the sre.py file for information on usage and redistribution.
|
11 | #
|
12 |
|
13 | """Internal support module for sre"""
|
14 |
|
15 | # XXX: show string offset and offending character for all errors
|
16 |
|
17 | import sys
|
18 |
|
19 | from sre_constants import *
|
20 |
|
# Characters that are not plain literals at the top level of a pattern,
# and the subset of them that introduces a repeat operator.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Character sets used for membership tests by the parser.
DIGITS = set("0123456789")
OCTDIGITS = DIGITS - set("89")
HEXDIGITS = DIGITS | set("abcdefABCDEF")

WHITESPACE = set(" \t\n\r\v\f")
|
30 |
|
# Maps a two-character escape token (backslash + letter) to the
# (LITERAL, character code) item it stands for.
ESCAPES = dict(
    ("\\" + letter, (LITERAL, ord(char)))
    for letter, char in (
        ("a", "\a"),
        ("b", "\b"),
        ("f", "\f"),
        ("n", "\n"),
        ("r", "\r"),
        ("t", "\t"),
        ("v", "\v"),
        ("\\", "\\"),
    )
)
|
41 |
|
# Maps a two-character escape token to a position assertion (AT, ...)
# or to a one-element character-class item (IN, [(CATEGORY, ...)]).
CATEGORIES = {
    # position assertions
    "\\A": (AT, AT_BEGINNING_STRING),  # start of string
    "\\Z": (AT, AT_END_STRING),  # end of string
    "\\b": (AT, AT_BOUNDARY),
    "\\B": (AT, AT_NON_BOUNDARY),
    # character categories and their negations
    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
}
|
54 |
|
# Maps an inline flag letter, as written in "(?iLmsxtu)", to its flag bit.
FLAGS = dict(
    # standard flags
    i=SRE_FLAG_IGNORECASE,
    L=SRE_FLAG_LOCALE,
    m=SRE_FLAG_MULTILINE,
    s=SRE_FLAG_DOTALL,
    x=SRE_FLAG_VERBOSE,
    # extensions
    t=SRE_FLAG_TEMPLATE,
    u=SRE_FLAG_UNICODE,
)
|
66 |
|
class Pattern:
    """Master pattern object; keeps track of global parse state.

    Attributes set by __init__:
        flags: flag bits for the pattern, initially 0.
        open: ids of groups that were opened and not yet closed.
        groups: id that the next opened group will receive, initially 1.
        groupdict: maps a group name to its group id.
        lookbehind: counter, initially 0; the parser increments and
            decrements it around lookbehind assertions.
    """

    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}
        self.lookbehind = 0

    def opengroup(self, name=None):
        """Allocate the next group id, mark it open and return it.

        If *name* is given it is registered in ``groupdict``.

        Raises:
            error: if *name* is already registered.  The group counter has
                already been advanced when this happens.
        """
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise error("redefinition of group name %s as group %d; "
                            "was group %d" % (repr(name), gid, ogid))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid

    def closegroup(self, gid):
        """Mark group *gid* as closed (ValueError if it is not open)."""
        self.open.remove(gid)

    def checkgroup(self, gid):
        """Return True if *gid* has been allocated and is not still open."""
        return gid < self.groups and gid not in self.open
|
91 |
|
class SubPattern:
    """A subpattern in intermediate form.

    ``data`` is a list of (opcode, argument) items; ``pattern`` is the
    object passed in by the creator (the parser passes its Pattern state).
    The instance behaves like a mutable sequence over ``data``.
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        self.data = [] if data is None else data
        self.width = None  # (min, max) cache filled in by getwidth()

    def dump(self, level=0):
        """Print a readable listing of the items, nested by *level*."""
        indent = level*" "
        for opcode, arg in self.data:
            print(indent + opcode, end=' ')
            if opcode == IN:
                # member sublanguage
                print()
                for member_op, member_arg in arg:
                    print((level+1)*" " + member_op, member_arg)
            elif opcode == BRANCH:
                print()
                for position, alternative in enumerate(arg[1]):
                    if position:
                        print(indent + "or")
                    alternative.dump(level+1)
            elif opcode == GROUPREF_EXISTS:
                condgroup, item_yes, item_no = arg
                print(condgroup)
                item_yes.dump(level+1)
                if item_no:
                    print(indent + "else")
                    item_no.dump(level+1)
            elif isinstance(arg, (tuple, list)):
                # True right after a nested dump, i.e. when the cursor is
                # already at the start of a fresh line.
                on_fresh_line = False
                for part in arg:
                    if isinstance(part, SubPattern):
                        if not on_fresh_line:
                            print()
                        part.dump(level+1)
                        on_fresh_line = True
                    else:
                        print(part, end=' ')
                        on_fresh_line = False
                if not on_fresh_line:
                    print()
            else:
                print(arg)

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # A slice yields a new SubPattern sharing the same pattern object;
        # a plain index yields the stored item itself.
        selected = self.data[index]
        if isinstance(index, slice):
            return SubPattern(self.pattern, selected)
        return selected

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the (min, max) width of this subpattern, caching it."""
        cached = self.width
        if cached:
            return cached
        total_lo = total_hi = 0
        unit_ops = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        repeat_ops = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum over all alternatives
                branch_lo = MAXREPEAT - 1
                branch_hi = 0
                for alternative in av[1]:
                    alt_lo, alt_hi = alternative.getwidth()
                    branch_lo = min(branch_lo, alt_lo)
                    branch_hi = max(branch_hi, alt_hi)
                total_lo += branch_lo
                total_hi += branch_hi
            elif op is CALL:
                inner_lo, inner_hi = av.getwidth()
                total_lo += inner_lo
                total_hi += inner_hi
            elif op is SUBPATTERN:
                inner_lo, inner_hi = av[1].getwidth()
                total_lo += inner_lo
                total_hi += inner_hi
            elif op in repeat_ops:
                # av is (min count, max count, repeated subpattern)
                inner_lo, inner_hi = av[2].getwidth()
                total_lo += inner_lo * av[0]
                total_hi += inner_hi * av[1]
            elif op in unit_ops:
                total_lo += 1
                total_hi += 1
            elif op == SUCCESS:
                break
        # clamp so the result never exceeds the repeat limit
        self.width = min(total_lo, MAXREPEAT - 1), min(total_hi, MAXREPEAT)
        return self.width
|
189 |
|
class Tokenizer:
    """Splits a string into tokens with one token of lookahead.

    A token is a single character, or a backslash together with the
    character that follows it.  ``next`` holds the upcoming token (None at
    the end of the string) and ``index`` is the offset just past it.
    """

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # Load the token at self.index into self.next and advance index.
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise error("bogus escape (end of line)")
            char = char + c
        self.index = self.index + len(char)
        self.next = char

    def match(self, char, skip=1):
        """Return 1 if the upcoming token equals *char*, else 0.

        On a match the token is consumed unless *skip* is false.
        """
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0

    def get(self):
        """Consume and return the upcoming token (None at end of input)."""
        this = self.next
        self.__next()
        return this

    def tell(self):
        """Return the current position as an (index, next) pair."""
        return self.index, self.next

    def seek(self, index):
        """Restore a position previously returned by tell()."""
        self.index, self.next = index
|
222 |
|
def isident(char):
    """Return True if *char* is "_" or lies in "a".."z" or "A".."Z"."""
    if char == "_":
        return True
    if "a" <= char <= "z":
        return True
    return "A" <= char <= "Z"
|
225 |
|
def isdigit(char):
    """Return True if *char* lies in the range "0".."9"."""
    below = char < "0"
    above = char > "9"
    return not (below or above)
|
228 |
|
def isname(name):
    """Return True if *name* is a valid group name.

    A valid name starts with a character accepted by isident() and
    continues with characters accepted by isident() or isdigit().  An
    empty name is reported as invalid instead of failing with IndexError
    on ``name[0]``.
    """
    if not name:
        return False
    if not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True
|
237 |
|
def _class_escape(source, escape):
    """Translate an escape token found inside a character class.

    Args:
        source: tokenizer positioned after *escape*; further hex or octal
            digits are consumed from it.
        escape: the escape token (a backslash plus one character).

    Returns:
        An (opcode, argument) tuple: the ESCAPES entry, a CATEGORIES entry
        whose opcode is IN, or (LITERAL, code) for hex escapes, octal
        escapes and any other two-character escape.

    Raises:
        error: for a hex escape without exactly two digits, for an escape
            starting with a non-octal digit, or when nothing else applies.
    """
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            escape = escape[2:]
            if len(escape) != 2:
                raise error("bogus escape: %s" % repr("\\" + escape))
            return LITERAL, int(escape, 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            escape = escape[1:]
            return LITERAL, int(escape, 8) & 0xff
        elif c in DIGITS:
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
|
269 |
|
def _escape(source, escape, state):
    """Translate an escape token found outside a character class.

    Args:
        source: tokenizer positioned after *escape*; further digits are
            consumed from it.
        escape: the escape token (a backslash plus one character).
        state: parse state; ``groups``, ``checkgroup()`` and ``lookbehind``
            are consulted for group references.

    Returns:
        An (opcode, argument) tuple: a CATEGORIES or ESCAPES entry,
        (LITERAL, code) or (GROUPREF, group).

    Raises:
        error: if the referenced group is still open, or the escape
            cannot be interpreted.
    """
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                        source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise error("cannot refer to open group")
                if state.lookbehind:
                    import warnings
                    warnings.warn('group references in lookbehind '
                                  'assertions are not supported',
                                  RuntimeWarning)
                return GROUPREF, group
            # unknown group number: reported as a bogus escape below
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
|
318 |
|
def _parse_sub(source, state, nested=1):
    """Parse an alternation: a|b|c.

    Args:
        source: tokenizer to read from.
        state: parse state handed on to _parse() and SubPattern().
        nested: when true, the alternation must be followed by ")" or the
            end of input.

    Returns:
        The single alternative itself when there is only one; otherwise a
        new SubPattern holding any common prefix items followed by either
        one (IN, ...) item or one (BRANCH, (None, items)) item.

    Raises:
        error: if a nested alternation is not properly terminated.
    """
    items = []
    itemsappend = items.append
    sourcematch = source.match
    while 1:
        itemsappend(_parse(source, state))
        if sourcematch("|"):
            continue
        if not nested:
            break
        if not source.next or sourcematch(")", 0):
            break
        else:
            raise error("pattern not properly closed")

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue  # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        charset = [item[0] for item in items]
        subpatternappend((IN, charset))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern
|
377 |
|
def _parse_sub_cond(source, state, condgroup):
    """Parse the body of a conditional group: yes-branch [| no-branch].

    Args:
        source: tokenizer to read from.
        state: parse state handed on to _parse() and SubPattern().
        condgroup: group id the condition refers to.

    Returns:
        A SubPattern holding one
        (GROUPREF_EXISTS, (condgroup, item_yes, item_no)) item; item_no is
        None when there is no second branch.

    Raises:
        error: if there are more than two branches, or the body is not
            followed by ")" or the end of input.
    """
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    else:
        item_no = None
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern
|
391 |
|
# Lookup sets used by _parse().
_PATTERNENDERS = set(("|", ")"))  # tokens that end a simple pattern
_ASSERTCHARS = set(("=", "!", "<"))  # tokens after "(?" that start an assertion
_LOOKBEHINDASSERTCHARS = set(("=", "!"))  # tokens allowed after "(?<"
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))  # opcodes of repeat items
|
396 |
|
def _parse(source, state):
    """Parse a simple pattern (one alternative, without top-level "|").

    Reads tokens from *source* until "|", ")" or the end of input and
    returns a SubPattern of (opcode, argument) items.

    Args:
        source: tokenizer to read from.
        state: parse state; its ``flags`` are read and may be extended by
            inline flags, groups are opened/closed on it, and
            ``lookbehind`` is adjusted around lookbehind assertions.

    Raises:
        error: on malformed syntax (see the individual messages below).
        OverflowError: if a repeat count is not below MAXREPEAT.
    """
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break  # end of subpattern
        this = sourceget()
        if this is None:
            break  # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            charsetappend = charset.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters; a "]" that comes first (right
            # after "[" or "[^") is a member, not the terminator
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # trailing "-" is a literal dash
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charsetappend(code1)
                        charsetappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("bad character range")
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error("bad character range")
                        charsetappend((RANGE, (lo, hi)))
                    else:
                        raise error("unexpected end of regular expression")
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charsetappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset) == 1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0])  # optimization
            elif (_len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1]))  # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT
            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a repeat after all: "{" is a literal and the
                    # consumed tokens are re-read
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min_count = int(lo)
                    if min_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max_count = int(hi)
                    if max_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max_count < min_count:
                        raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min_count, max_count, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = no group body
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in group name %r" %
                                        name)
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in backref group name "
                                        "%r" % name)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name: {0!r}".format(name)
                            raise error(msg)
                        if state.lookbehind:
                            import warnings
                            warnings.warn('group references in lookbehind '
                                          'assertions are not supported',
                                          RuntimeWarning)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1  # lookbehind
                        char = sourceget()
                        state.lookbehind += 1
                    p = _parse_sub(source, state)
                    if direction < 0:
                        state.lookbehind -= 1
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error("unterminated name")
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if not condname:
                        raise error("missing group name")
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name: {0!r}".format(condname)
                            raise error(msg)
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error("bad character in group name")
                    if state.lookbehind:
                        import warnings
                        warnings.warn('group references in lookbehind '
                                      'assertions are not supported',
                                      RuntimeWarning)
                else:
                    # flags
                    if source.next not in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # inline flags: only ")" may follow
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error("unexpected end of pattern")
                    if char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern
|
707 |
|
def parse(str, flags=0, pattern=None):
    """Parse an 're' pattern into a SubPattern of (opcode, argument) items.

    Args:
        str: the pattern source.
        flags: initial flag bits.
        pattern: optional parse state to use; a fresh Pattern is created
            when omitted.  Its ``flags`` and ``str`` attributes are set.

    Returns:
        The parsed SubPattern.  With SRE_FLAG_DEBUG set in *flags* it is
        also dumped via its dump() method.

    Raises:
        error: if input remains after the pattern (for example an
            unmatched ")"), or for any error raised while parsing.
    """
    source = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    p = _parse_sub(source, pattern, 0)

    tail = source.get()
    if tail == ")":
        raise error("unbalanced parenthesis")
    elif tail:
        raise error("bogus characters at end of regular expression")

    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, p.pattern.flags)

    if flags & SRE_FLAG_DEBUG:
        p.dump()

    return p
|
735 |
|
def parse_template(source, pattern):
    """Parse an 're' replacement string into literals and group references.

    Args:
        source: the replacement string.
        pattern: object whose ``groupindex`` mapping resolves group names.

    Returns:
        A (groups, literals) pair.  ``literals`` has one slot per template
        piece, holding the literal text or None for a group reference;
        ``groups`` lists (slot index, group) for every group reference.

    Raises:
        error: for a malformed group reference.
        IndexError: for a group name missing from ``pattern.groupindex``.
    """
    s = Tokenizer(source)
    sget = s.get
    pieces = []
    piecesappend = pieces.append

    def literal(text):
        # merge adjacent literal text into a single piece
        if pieces and pieces[-1][0] is LITERAL:
            pieces[-1] = LITERAL, pieces[-1][1] + text
        else:
            piecesappend((LITERAL, text))

    # build characters of the same string type as the template
    sep = source[:0]
    if type(sep) is type(""):
        makechar = chr
    else:
        makechar = unichr
    while 1:
        this = sget()
        if this is None:
            break  # end of replacement string
        if this and this[0] == "\\":
            # group
            c = this[1:2]
            if c == "g":
                name = ""
                if s.match("<"):
                    while 1:
                        char = sget()
                        if char is None:
                            raise error("unterminated group name")
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error("missing group name")
                try:
                    index = int(name)
                    if index < 0:
                        raise error("negative group number")
                except ValueError:
                    # not a number, so it has to be a group name
                    if not isname(name):
                        raise error("bad character in group name")
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        msg = "unknown group name: {0!r}".format(name)
                        raise IndexError(msg)
                piecesappend((MARK, index))
            elif c == "0":
                # octal escape of up to three digits
                if s.next in OCTDIGITS:
                    this = this + sget()
                    if s.next in OCTDIGITS:
                        this = this + sget()
                literal(makechar(int(this[1:], 8) & 0xff))
            elif c in DIGITS:
                # three octal digits form an octal escape; anything else
                # is a one- or two-digit group reference
                isoctal = False
                if s.next in DIGITS:
                    this = this + sget()
                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
                            s.next in OCTDIGITS):
                        this = this + sget()
                        isoctal = True
                        literal(makechar(int(this[1:], 8) & 0xff))
                if not isoctal:
                    piecesappend((MARK, int(this[1:])))
            else:
                # known escapes are translated, unknown ones kept verbatim
                try:
                    this = makechar(ESCAPES[this][1])
                except KeyError:
                    pass
                literal(this)
        else:
            literal(this)
    # convert template to groups and literals lists
    groups = []
    groupsappend = groups.append
    literals = [None] * len(pieces)
    for i, (kind, value) in enumerate(pieces):
        if kind is MARK:
            groupsappend((i, value))
            # literals[i] is already None
        else:
            literals[i] = value
    return groups, literals
|
823 |
|
def expand_template(template, match):
    """Fill a parsed template with the groups of *match*.

    Args:
        template: a (groups, literals) pair; each ``groups`` entry is
            (slot index, group) and ``literals`` holds the fixed pieces.
        match: match object providing ``group()`` and ``string``.

    Returns:
        The pieces joined with an empty string of the same type as
        ``match.string``.  The template's own lists are left unmodified.

    Raises:
        error: if a referenced group did not participate in the match
            ("unmatched group") or ``match.group()`` raises IndexError
            ("invalid group reference").
    """
    g = match.group
    sep = match.string[:0]
    groups, literals = template
    literals = literals[:]  # work on a copy
    try:
        for index, group in groups:
            literals[index] = s = g(group)
            if s is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return sep.join(literals)
|