#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oils library eventually.  It's a "lazily-parsed data
structure" like TSV8
"""
from __future__ import print_function

import cStringIO
import re
import sys


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """For bad lexical elements like <> or &&"""

    def __init__(self, s, pos):
        self.s = s
        self.pos = pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.pos:self.pos + 20])


class ParseError(Exception):
    """For errors in the tag structure."""

    def __init__(self, msg, *args):
        self.msg = msg
        self.args = args

    def __str__(self):
        return '(ParseError %s)' % (self.msg % self.args)


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=0):
        self.s = s
        self.f = f
        self.pos = left_pos
        if right_pos == 0:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output."""
        self.f.write(s)
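
# Illustrative sketch (not part of the library): how Output interleaves text
# copied from the input with newly printed text.  The positions below assume
# the 9-byte input '<b>hi</b>'.
#
#   f = cStringIO.StringIO()
#   out = Output('<b>hi</b>', f)
#   out.PrintUntil(3)       # copy '<b>' from the input
#   out.Print('HI')         # write new text
#   out.SkipTo(5)           # skip over 'hi' in the input
#   out.PrintTheRest()      # copy the remaining '</b>'
#   # f.getvalue() == '<b>HI</b>'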


# HTML Tokens
(Decl, Comment, Processing, StartTag, StartEndTag, EndTag, DecChar, HexChar,
 CharEntity, RawData, Invalid, EndOfStream) = range(12)


def MakeLexer(rules):
    return [
        # DOTALL is for the comment
        (re.compile(pat, re.VERBOSE | re.DOTALL), i) for (pat, i) in rules
    ]


#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid?  A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /      is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag  '>' /
# StartEnd = / '<'  Tag '/>' /
# Start    = / '<'  Tag  '>' /
#
# EntityRef = / '&' dot{* N} ';' /

LEXER = [
    # TODO: instead of nongreedy matches, the loop can just do .find('-->') and
    # .find('?>')

    # Actually non-greedy matches are regular and can be matched in linear time
    # with RE2.
    #
    # https://news.ycombinator.com/item?id=27099798
    #
    # Maybe try combining all of these for speed.
    (r'<!-- .*? -->', Comment),
    (r'<\? .*? \?>', Processing),

    # NOTE: < is allowed in these.
    (r'<! [^>]+ >', Decl),  # <!DOCTYPE html>
    (r'</ [^>]+ >', EndTag),  # end tag </a>
    (r'< [^>]+ />', StartEndTag),  # self-closing <br/>; must come BEFORE StartTag
    (r'< [^>]+  >', StartTag),  # start tag <a>
    (r'&\# [0-9]+ ;', DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', HexChar),
    (r'& [a-zA-Z]+ ;', CharEntity),

    # Note: > is allowed in raw data.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    (r'[^&<]+', RawData),
    (r'.', Invalid),  # error!
]

LEXER = MakeLexer(LEXER)


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    pos = left_pos
    if right_pos == 0:
        n = len(s)
    else:
        n = right_pos

    while pos < n:
        # Find the FIRST pattern that matches.
        for pat, tok_id in LEXER:
            m = pat.match(s, pos)
            if m:
                end_pos = m.end()
                yield tok_id, end_pos
                pos = end_pos
                break

    # Zero length sentinel
    yield EndOfStream, pos


def ValidTokens(s, left_pos=0, right_pos=0):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
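
# Illustrative sketch (not from the library): each token is (tok_id, end_pos),
# and a token's text is the slice between the previous end_pos and its own.
# Lexing the assumed input '<p>hi &amp; bye</p>' yields roughly:
#
#   for tok_id, end_pos in ValidTokens('<p>hi &amp; bye</p>'):
#       ...
#
#   (StartTag, 3)      '<p>'
#   (RawData, 6)       'hi '
#   (CharEntity, 11)   '&amp;'
#   (RawData, 15)      ' bye'
#   (EndTag, 19)       '</p>'
#   (EndOfStream, 19)  ''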


# To match <a  or </a
# <h2 but not <2h ?
_TAG_RE = re.compile(r'/? \s* ([a-zA-Z][a-zA-Z0-9]*)', re.VERBOSE)

# To match href="foo"

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
([a-z]+)                # Attribute name
(?:                     # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "        # double quoted value
  | ([a-zA-Z0-9_\-]+)   # Just allow unquoted "identifiers"
                        # TODO: relax this?  for href=$foo
  )
)?
''', re.VERBOSE)
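
# Illustrative example (an assumption, not from the library): matching
# _ATTR_RE repeatedly against ' href="/foo" selected' first yields
# group 1 = 'href' with group 2 = '/foo' (quoted value), then
# group 1 = 'selected' with no value groups (a bare attribute).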

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # For HasAttr()
                        #val = True

                        # Now try to get a real value
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):

                            # TODO: Unescape this with htmlentitydefs
                            # I think we need another lexer!
                            #
                            # We could make a single pass?
                            # Shortcut: 'if '&' in substring'
                            # Then we need to unescape it

                            val = start, end
                            break

        except StopIteration:
            pass
        return val

    def GetAttr(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError('Invalid HTML tag: %r' % self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next -- there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Skip past the "
            pos = m.end(0)
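
# Illustrative sketch (not part of the library): the positions passed to
# Reset() would normally come from ValidTokens(); they are hard-coded here
# for the assumed 22-byte input below.
#
#   s = '<a href="/" class=big>'
#   lex = TagLexer(s)
#   lex.Reset(0, len(s))
#   lex.TagName()            # -> 'a'
#   lex.GetAttr('href')      # -> '/'
#   lex.GetAttr('class')     # -> 'big'
#   lex.GetAttr('missing')   # -> None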


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r', tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r', tag_name)
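
# Illustrative sketch (an assumption about intended usage, not from the
# library): extract the body of a <pre> element by passing the SAME token
# iterator to both calls.  'html_str' is a hypothetical input string.
#
#   it = ValidTokens(html_str)
#   tag_lexer = TagLexer(html_str)
#   _, body_start = ReadUntilStartTag(it, tag_lexer, 'pre')
#   body_end, _ = ReadUntilEndTag(it, tag_lexer, 'pre')
#   body = html_str[body_start:body_end]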


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=0):
    """Given HTML, return text by unquoting > and < etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/make_help.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = cStringIO.StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == CharEntity:  # &

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
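
# Illustrative example (not part of the library):
#
#   ToText('x &amp; y &lt;z&gt;')   # -> 'x & y <z>'
#
# RawData is copied through unchanged, and each named entity is replaced via
# CHAR_ENTITY; numeric entities (&#65; / &#x41;) are not handled yet.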