opy/_regtest/src/sre

OILS / opy / _regtest / src / sre_compile.py View on Github | oilshell.org

596 lines, 475 significant

1	# -- coding: utf-8 --
2	#
3	# Secret Labs' Regular Expression Engine
4	#
5	# convert template to internal format
6	#
7	# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
8	#
9	# See the sre.py file for information on usage and redistribution.
10	#
11
12	"""Internal support module for sre"""
13
14	import _sre, sys
15	import sre_parse
16	from sre_constants import *
17
18	assert _sre.MAGIC == MAGIC, "SRE module mismatch"
19
20	if _sre.CODESIZE == 2:
21	MAXCODE = 65535
22	else:
23	MAXCODE = 0xFFFFFFFFL
24
25	_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
26	_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
27	_SUCCESS_CODES = set([SUCCESS, FAILURE])
28	_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
29
30	# Sets of lowercase characters which have the same uppercase.
31	_equivalences = (
32	# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
33	(0x69, 0x131), # iı
34	# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
35	(0x73, 0x17f), # sſ
36	# MICRO SIGN, GREEK SMALL LETTER MU
37	(0xb5, 0x3bc), # µμ
38	# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
39	(0x345, 0x3b9, 0x1fbe), # \u0345ιι
40	# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
41	(0x3b2, 0x3d0), # βϐ
42	# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
43	(0x3b5, 0x3f5), # εϵ
44	# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
45	(0x3b8, 0x3d1), # θϑ
46	# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
47	(0x3ba, 0x3f0), # κϰ
48	# GREEK SMALL LETTER PI, GREEK PI SYMBOL
49	(0x3c0, 0x3d6), # πϖ
50	# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
51	(0x3c1, 0x3f1), # ρϱ
52	# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
53	(0x3c2, 0x3c3), # ςσ
54	# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
55	(0x3c6, 0x3d5), # φϕ
56	# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
57	(0x1e61, 0x1e9b), # ṡẛ
58	)
59
60	# Maps the lowercase code to lowercase codes which have the same uppercase.
61	_ignorecase_fixes = {i: tuple(j for j in t if i != j)
62	for t in _equivalences for i in t}
63
64	def _compile(code, pattern, flags):
65	# internal: compile a (sub)pattern
66	emit = code.append
67	_len = len
68	LITERAL_CODES = _LITERAL_CODES
69	REPEATING_CODES = _REPEATING_CODES
70	SUCCESS_CODES = _SUCCESS_CODES
71	ASSERT_CODES = _ASSERT_CODES
72	if (flags & SRE_FLAG_IGNORECASE and
73	not (flags & SRE_FLAG_LOCALE) and
74	flags & SRE_FLAG_UNICODE):
75	fixes = _ignorecase_fixes
76	else:
77	fixes = None
78	for op, av in pattern:
79	if op in LITERAL_CODES:
80	if flags & SRE_FLAG_IGNORECASE:
81	lo = _sre.getlower(av, flags)
82	if fixes and lo in fixes:
83	emit(OPCODES[IN_IGNORE])
84	skip = _len(code); emit(0)
85	if op is NOT_LITERAL:
86	emit(OPCODES[NEGATE])
87	for k in (lo,) + fixes[lo]:
88	emit(OPCODES[LITERAL])
89	emit(k)
90	emit(OPCODES[FAILURE])
91	code[skip] = _len(code) - skip
92	else:
93	emit(OPCODES[OP_IGNORE[op]])
94	emit(lo)
95	else:
96	emit(OPCODES[op])
97	emit(av)
98	elif op is IN:
99	if flags & SRE_FLAG_IGNORECASE:
100	emit(OPCODES[OP_IGNORE[op]])
101	def fixup(literal, flags=flags):
102	return _sre.getlower(literal, flags)
103	else:
104	emit(OPCODES[op])
105	fixup = None
106	skip = _len(code); emit(0)
107	_compile_charset(av, flags, code, fixup, fixes)
108	code[skip] = _len(code) - skip
109	elif op is ANY:
110	if flags & SRE_FLAG_DOTALL:
111	emit(OPCODES[ANY_ALL])
112	else:
113	emit(OPCODES[ANY])
114	elif op in REPEATING_CODES:
115	if flags & SRE_FLAG_TEMPLATE:
116	raise error, "internal: unsupported template operator"
117	emit(OPCODES[REPEAT])
118	skip = _len(code); emit(0)
119	emit(av[0])
120	emit(av[1])
121	_compile(code, av[2], flags)
122	emit(OPCODES[SUCCESS])
123	code[skip] = _len(code) - skip
124	elif _simple(av) and op is not REPEAT:
125	if op is MAX_REPEAT:
126	emit(OPCODES[REPEAT_ONE])
127	else:
128	emit(OPCODES[MIN_REPEAT_ONE])
129	skip = _len(code); emit(0)
130	emit(av[0])
131	emit(av[1])
132	_compile(code, av[2], flags)
133	emit(OPCODES[SUCCESS])
134	code[skip] = _len(code) - skip
135	else:
136	emit(OPCODES[REPEAT])
137	skip = _len(code); emit(0)
138	emit(av[0])
139	emit(av[1])
140	_compile(code, av[2], flags)
141	code[skip] = _len(code) - skip
142	if op is MAX_REPEAT:
143	emit(OPCODES[MAX_UNTIL])
144	else:
145	emit(OPCODES[MIN_UNTIL])
146	elif op is SUBPATTERN:
147	if av[0]:
148	emit(OPCODES[MARK])
149	emit((av[0]-1)*2)
150	# _compile_info(code, av[1], flags)
151	_compile(code, av[1], flags)
152	if av[0]:
153	emit(OPCODES[MARK])
154	emit((av[0]-1)*2+1)
155	elif op in SUCCESS_CODES:
156	emit(OPCODES[op])
157	elif op in ASSERT_CODES:
158	emit(OPCODES[op])
159	skip = _len(code); emit(0)
160	if av[0] >= 0:
161	emit(0) # look ahead
162	else:
163	lo, hi = av[1].getwidth()
164	if lo != hi:
165	raise error, "look-behind requires fixed-width pattern"
166	emit(lo) # look behind
167	_compile(code, av[1], flags)
168	emit(OPCODES[SUCCESS])
169	code[skip] = _len(code) - skip
170	elif op is CALL:
171	emit(OPCODES[op])
172	skip = _len(code); emit(0)
173	_compile(code, av, flags)
174	emit(OPCODES[SUCCESS])
175	code[skip] = _len(code) - skip
176	elif op is AT:
177	emit(OPCODES[op])
178	if flags & SRE_FLAG_MULTILINE:
179	av = AT_MULTILINE.get(av, av)
180	if flags & SRE_FLAG_LOCALE:
181	av = AT_LOCALE.get(av, av)
182	elif flags & SRE_FLAG_UNICODE:
183	av = AT_UNICODE.get(av, av)
184	emit(ATCODES[av])
185	elif op is BRANCH:
186	emit(OPCODES[op])
187	tail = []
188	tailappend = tail.append
189	for av in av[1]:
190	skip = _len(code); emit(0)
191	# _compile_info(code, av, flags)
192	_compile(code, av, flags)
193	emit(OPCODES[JUMP])
194	tailappend(_len(code)); emit(0)
195	code[skip] = _len(code) - skip
196	emit(0) # end of branch
197	for tail in tail:
198	code[tail] = _len(code) - tail
199	elif op is CATEGORY:
200	emit(OPCODES[op])
201	if flags & SRE_FLAG_LOCALE:
202	av = CH_LOCALE[av]
203	elif flags & SRE_FLAG_UNICODE:
204	av = CH_UNICODE[av]
205	emit(CHCODES[av])
206	elif op is GROUPREF:
207	if flags & SRE_FLAG_IGNORECASE:
208	emit(OPCODES[OP_IGNORE[op]])
209	else:
210	emit(OPCODES[op])
211	emit(av-1)
212	elif op is GROUPREF_EXISTS:
213	emit(OPCODES[op])
214	emit(av[0]-1)
215	skipyes = _len(code); emit(0)
216	_compile(code, av[1], flags)
217	if av[2]:
218	emit(OPCODES[JUMP])
219	skipno = _len(code); emit(0)
220	code[skipyes] = _len(code) - skipyes + 1
221	_compile(code, av[2], flags)
222	code[skipno] = _len(code) - skipno
223	else:
224	code[skipyes] = _len(code) - skipyes + 1
225	else:
226	raise ValueError, ("unsupported operand type", op)
227
228	def _compile_charset(charset, flags, code, fixup=None, fixes=None):
229	# compile charset subprogram
230	emit = code.append
231	for op, av in _optimize_charset(charset, fixup, fixes,
232	flags & SRE_FLAG_UNICODE):
233	emit(OPCODES[op])
234	if op is NEGATE:
235	pass
236	elif op is LITERAL:
237	emit(av)
238	elif op is RANGE:
239	emit(av[0])
240	emit(av[1])
241	elif op is CHARSET:
242	code.extend(av)
243	elif op is BIGCHARSET:
244	code.extend(av)
245	elif op is CATEGORY:
246	if flags & SRE_FLAG_LOCALE:
247	emit(CHCODES[CH_LOCALE[av]])
248	elif flags & SRE_FLAG_UNICODE:
249	emit(CHCODES[CH_UNICODE[av]])
250	else:
251	emit(CHCODES[av])
252	else:
253	raise error, "internal: unsupported set operator"
254	emit(OPCODES[FAILURE])
255
256	def _optimize_charset(charset, fixup, fixes, isunicode):
257	# internal: optimize character set
258	out = []
259	tail = []
260	charmap = bytearray(256)
261	for op, av in charset:
262	while True:
263	try:
264	if op is LITERAL:
265	if fixup:
266	i = fixup(av)
267	charmap[i] = 1
268	if fixes and i in fixes:
269	for k in fixes[i]:
270	charmap[k] = 1
271	else:
272	charmap[av] = 1
273	elif op is RANGE:
274	r = range(av[0], av[1]+1)
275	if fixup:
276	r = map(fixup, r)
277	if fixup and fixes:
278	for i in r:
279	charmap[i] = 1
280	if i in fixes:
281	for k in fixes[i]:
282	charmap[k] = 1
283	else:
284	for i in r:
285	charmap[i] = 1
286	elif op is NEGATE:
287	out.append((op, av))
288	else:
289	tail.append((op, av))
290	except IndexError:
291	if len(charmap) == 256:
292	# character set contains non-UCS1 character codes
293	charmap += b'\0' * 0xff00
294	continue
295	# character set contains non-BMP character codes
296	if fixup and isunicode and op is RANGE:
297	lo, hi = av
298	ranges = [av]
299	# There are only two ranges of cased astral characters:
300	# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
301	_fixup_range(max(0x10000, lo), min(0x11fff, hi),
302	ranges, fixup)
303	for lo, hi in ranges:
304	if lo == hi:
305	tail.append((LITERAL, hi))
306	else:
307	tail.append((RANGE, (lo, hi)))
308	else:
309	tail.append((op, av))
310	break
311
312	# compress character map
313	runs = []
314	q = 0
315	while True:
316	p = charmap.find(b'\1', q)
317	if p < 0:
318	break
319	if len(runs) >= 2:
320	runs = None
321	break
322	q = charmap.find(b'\0', p)
323	if q < 0:
324	runs.append((p, len(charmap)))
325	break
326	runs.append((p, q))
327	if runs is not None:
328	# use literal/range
329	for p, q in runs:
330	if q - p == 1:
331	out.append((LITERAL, p))
332	else:
333	out.append((RANGE, (p, q - 1)))
334	out += tail
335	# if the case was changed or new representation is more compact
336	if fixup or len(out) < len(charset):
337	return out
338	# else original character set is good enough
339	return charset
340
341	# use bitmap
342	if len(charmap) == 256:
343	data = _mk_bitmap(charmap)
344	out.append((CHARSET, data))
345	out += tail
346	return out
347
348	# To represent a big charset, first a bitmap of all characters in the
349	# set is constructed. Then, this bitmap is sliced into chunks of 256
350	# characters, duplicate chunks are eliminated, and each chunk is
351	# given a number. In the compiled expression, the charset is
352	# represented by a 32-bit word sequence, consisting of one word for
353	# the number of different chunks, a sequence of 256 bytes (64 words)
354	# of chunk numbers indexed by their original chunk position, and a
355	# sequence of 256-bit chunks (8 words each).
356
357	# Compression is normally good: in a typical charset, large ranges of
358	# Unicode will be either completely excluded (e.g. if only cyrillic
359	# letters are to be matched), or completely included (e.g. if large
360	# subranges of Kanji match). These ranges will be represented by
361	# chunks of all one-bits or all zero-bits.
362
363	# Matching can be also done efficiently: the more significant byte of
364	# the Unicode character is an index into the chunk number, and the
365	# less significant byte is a bit index in the chunk (just like the
366	# CHARSET matching).
367
368	# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
369	# of the basic multilingual plane; an efficient representation
370	# for all of Unicode has not yet been developed.
371
372	charmap = bytes(charmap) # should be hashable
373	comps = {}
374	mapping = bytearray(256)
375	block = 0
376	data = bytearray()
377	for i in range(0, 65536, 256):
378	chunk = charmap[i: i + 256]
379	if chunk in comps:
380	mapping[i // 256] = comps[chunk]
381	else:
382	mapping[i // 256] = comps[chunk] = block
383	block += 1
384	data += chunk
385	data = _mk_bitmap(data)
386	data[0:0] = [block] + _bytes_to_codes(mapping)
387	out.append((BIGCHARSET, data))
388	out += tail
389	return out
390
391	def _fixup_range(lo, hi, ranges, fixup):
392	for i in map(fixup, range(lo, hi+1)):
393	for k, (lo, hi) in enumerate(ranges):
394	if i < lo:
395	if l == lo - 1:
396	ranges[k] = (i, hi)
397	else:
398	ranges.insert(k, (i, i))
399	break
400	elif i > hi:
401	if i == hi + 1:
402	ranges[k] = (lo, i)
403	break
404	else:
405	break
406	else:
407	ranges.append((i, i))
408
409	_CODEBITS = _sre.CODESIZE * 8
410	_BITS_TRANS = b'0' + b'1' * 255
411	def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
412	s = bytes(bits).translate(_BITS_TRANS)[::-1]
413	return [_int(s[i - _CODEBITS: i], 2)
414	for i in range(len(s), 0, -_CODEBITS)]
415
416	def _bytes_to_codes(b):
417	# Convert block indices to word array
418	import array
419	if _sre.CODESIZE == 2:
420	code = 'H'
421	else:
422	code = 'I'
423	a = array.array(code, bytes(b))
424	assert a.itemsize == _sre.CODESIZE
425	assert len(a) * a.itemsize == len(b)
426	return a.tolist()
427
428	def _simple(av):
429	# check if av is a "simple" operator
430	lo, hi = av[2].getwidth()
431	return lo == hi == 1 and av[2][0][0] != SUBPATTERN
432
433	def _compile_info(code, pattern, flags):
434	# internal: compile an info block. in the current version,
435	# this contains min/max pattern width, and an optional literal
436	# prefix or a character map
437	lo, hi = pattern.getwidth()
438	if lo == 0:
439	return # not worth it
440	# look for a literal prefix
441	prefix = []
442	prefixappend = prefix.append
443	prefix_skip = 0
444	charset = [] # not used
445	charsetappend = charset.append
446	if not (flags & SRE_FLAG_IGNORECASE):
447	# look for literal prefix
448	for op, av in pattern.data:
449	if op is LITERAL:
450	if len(prefix) == prefix_skip:
451	prefix_skip = prefix_skip + 1
452	prefixappend(av)
453	elif op is SUBPATTERN and len(av[1]) == 1:
454	op, av = av[1][0]
455	if op is LITERAL:
456	prefixappend(av)
457	else:
458	break
459	else:
460	break
461	# if no prefix, look for charset prefix
462	if not prefix and pattern.data:
463	op, av = pattern.data[0]
464	if op is SUBPATTERN and av[1]:
465	op, av = av[1][0]
466	if op is LITERAL:
467	charsetappend((op, av))
468	elif op is BRANCH:
469	c = []
470	cappend = c.append
471	for p in av[1]:
472	if not p:
473	break
474	op, av = p[0]
475	if op is LITERAL:
476	cappend((op, av))
477	else:
478	break
479	else:
480	charset = c
481	elif op is BRANCH:
482	c = []
483	cappend = c.append
484	for p in av[1]:
485	if not p:
486	break
487	op, av = p[0]
488	if op is LITERAL:
489	cappend((op, av))
490	else:
491	break
492	else:
493	charset = c
494	elif op is IN:
495	charset = av
496	## if prefix:
497	## print "*** PREFIX", prefix, prefix_skip
498	## if charset:
499	## print "*** CHARSET", charset
500	# add an info block
501	emit = code.append
502	emit(OPCODES[INFO])
503	skip = len(code); emit(0)
504	# literal flag
505	mask = 0
506	if prefix:
507	mask = SRE_INFO_PREFIX
508	if len(prefix) == prefix_skip == len(pattern.data):
509	mask = mask + SRE_INFO_LITERAL
510	elif charset:
511	mask = mask + SRE_INFO_CHARSET
512	emit(mask)
513	# pattern length
514	if lo < MAXCODE:
515	emit(lo)
516	else:
517	emit(MAXCODE)
518	prefix = prefix[:MAXCODE]
519	if hi < MAXCODE:
520	emit(hi)
521	else:
522	emit(0)
523	# add literal prefix
524	if prefix:
525	emit(len(prefix)) # length
526	emit(prefix_skip) # skip
527	code.extend(prefix)
528	# generate overlap table
529	table = [-1] + ([0]*len(prefix))
530	for i in xrange(len(prefix)):
531	table[i+1] = table[i]+1
532	while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
533	table[i+1] = table[table[i+1]-1]+1
534	code.extend(table[1:]) # don't store first entry
535	elif charset:
536	_compile_charset(charset, flags, code)
537	code[skip] = len(code) - skip
538
539	try:
540	unicode
541	except NameError:
542	STRING_TYPES = (type(""),)
543	else:
544	STRING_TYPES = (type(""), type(unicode("")))
545
546	def isstring(obj):
547	for tp in STRING_TYPES:
548	if isinstance(obj, tp):
549	return 1
550	return 0
551
552	def _code(p, flags):
553
554	flags = p.pattern.flags \| flags
555	code = []
556
557	# compile info block
558	_compile_info(code, p, flags)
559
560	# compile the pattern
561	_compile(code, p.data, flags)
562
563	code.append(OPCODES[SUCCESS])
564
565	return code
566
567	def compile(p, flags=0):
568	# internal: convert pattern list to internal format
569
570	if isstring(p):
571	pattern = p
572	p = sre_parse.parse(p, flags)
573	else:
574	pattern = None
575
576	code = _code(p, flags)
577
578	# print code
579
580	# XXX: <fl> get rid of this limitation!
581	if p.pattern.groups > 100:
582	raise AssertionError(
583	"sorry, but this version only supports 100 named groups"
584	)
585
586	# map in either direction
587	groupindex = p.pattern.groupdict
588	indexgroup = [None] * p.pattern.groups
589	for k, i in groupindex.items():
590	indexgroup[i] = k
591
592	return _sre.compile(
593	pattern, flags \| p.pattern.flags, code,
594	p.pattern.groups-1,
595	groupindex, indexgroup
596	)