#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oils library eventually. It's a "lazily-parsed data
structure" like TSV8.
"""
from __future__ import print_function

import cStringIO
import re
import sys


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """For bad lexical elements like <> or &&"""

    def __init__(self, s, pos):
        self.s = s
        self.pos = pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.pos:self.pos + 20])


class ParseError(Exception):
    """For errors in the tag structure."""

    def __init__(self, msg, *args):
        self.msg = msg
        self.args = args

    def __str__(self):
        return '(ParseError %s)' % (self.msg % self.args)


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=0):
        self.s = s
        self.f = f
        self.pos = left_pos
        if right_pos == 0:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text directly to the output file."""
        self.f.write(s)
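

# A minimal usage sketch for Output; the input string and positions here are
# illustrative.  Copy '<b>', write replacement text, skip 'hi', then copy
# the rest:
#
#   f = cStringIO.StringIO()
#   out = Output('<b>hi</b>', f)
#   out.PrintUntil(3)    # copy '<b>'
#   out.Print('HI')      # write new text
#   out.SkipTo(5)        # skip over 'hi' in the input
#   out.PrintTheRest()   # copy '</b>'
#   # f.getvalue() == '<b>HI</b>'

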
# HTML Tokens
(Decl, Comment, Processing, StartTag, StartEndTag, EndTag, DecChar, HexChar,
 CharEntity, RawData, Invalid, EndOfStream) = range(12)


def MakeLexer(rules):
    return [
        # DOTALL is for the Comment and Processing patterns, whose .*? must
        # be able to span newlines
        (re.compile(pat, re.VERBOSE | re.DOTALL), i) for (pat, i) in rules
    ]


#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid?  A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+ /
#
# / [NOT digit] / is [^\d]
# / ~digit /      is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start    = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /
LEXER = [
    # TODO: instead of nongreedy matches, the loop can just do .find('-->')
    # and .find('?>')

    # Actually non-greedy matches are regular and can be matched in linear
    # time with RE2.
    #
    # https://news.ycombinator.com/item?id=27099798
    #
    # Maybe try combining all of these for speed.
    (r'<!-- .*? -->', Comment),
    (r'<\? .*? \?>', Processing),

    # NOTE: < is allowed in these.
    (r'<! [^>]+ >', Decl),  # <!DOCTYPE html>
    (r'</ [^>]+ >', EndTag),  # end tag like </a>
    (r'< [^>]+ />', StartEndTag),  # self-closing <br/>; must come BEFORE StartTag
    (r'< [^>]+ >', StartTag),  # start tag like <a>
    (r'&\# [0-9]+ ;', DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', HexChar),
    (r'& [a-zA-Z]+ ;', CharEntity),

    # Note: > is allowed in raw data.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    (r'[^&<]+', RawData),
    (r'.', Invalid),  # error!
]

LEXER = MakeLexer(LEXER)


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    pos = left_pos
    if right_pos == 0:
        n = len(s)
    else:
        n = right_pos

    while pos < n:
        # Find the FIRST pattern that matches.
        for pat, tok_id in LEXER:
            m = pat.match(s, pos)
            if m:
                end_pos = m.end()
                yield tok_id, end_pos
                pos = end_pos
                break

    # Zero-length sentinel
    yield EndOfStream, pos


def ValidTokens(s, left_pos=0, right_pos=0):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
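

# A minimal sketch of driving the lexer: the token stream is a sequence of
# (tok_id, end_pos) pairs, and the caller tracks start positions itself.
# The input string here is illustrative.
#
#   html = '<p>x &amp; y</p>'
#   pos = 0
#   for tok_id, end_pos in ValidTokens(html):
#       print(tok_id, repr(html[pos:end_pos]))
#       pos = end_pos
#
# This prints the integer token IDs for StartTag '<p>', RawData 'x ',
# CharEntity '&amp;', RawData ' y', EndTag '</p>', and then a zero-length
# EndOfStream.

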
# To match <a or </a
# <h2 but not <2h ?
_TAG_RE = re.compile(r'/? \s* ([a-zA-Z][a-zA-Z0-9]*)', re.VERBOSE)

# To match href="foo"

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
([a-z]+)                # Attribute name
(?:                     # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "        # double quoted value
  | ([a-zA-Z0-9_\-]+)   # Just allow unquoted "identifiers"
                        # TODO: relax this?  for href=$foo
  )
)?
''', re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag name?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # For HasAttr()
                        #val = True

                        # Now try to get a real value
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):

                            # TODO: Unescape this with htmlentitydefs
                            # I think we need another lexer!
                            #
                            # We could make a single pass?
                            # Shortcut: if '&' is in the substring, then we
                            # need to unescape it

                            val = start, end
                            break

        except StopIteration:
            pass
        return val

    def GetAttr(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some
        unwanted characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError('Invalid HTML tag: %r' % self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # Don't search past the end of the tag.
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next --
                # there should be no junk.
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Advance past the whole attribute, including any closing quote.
            pos = m.end(0)
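

# A minimal TagLexer sketch.  Reset() would normally receive span positions
# from the token stream; here the whole (illustrative) string is one start
# tag:
#
#   s = '<a href="/index.html">'
#   lex = TagLexer(s)
#   lex.Reset(0, len(s))
#   lex.TagName()         # -> 'a'
#   lex.GetAttr('href')   # -> '/index.html'
#   lex.GetAttr('class')  # -> None

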
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r', tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r', tag_name)


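# A minimal sketch combining the pieces: extract the body of the first
# <title> element.  Both helpers assume the iterator and the TagLexer wrap
# the same underlying string; the input here is illustrative.
#
#   s = '<html><title>Hi</title></html>'
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   _, title_end = ReadUntilStartTag(it, tag_lexer, 'title')
#   body_end, _ = ReadUntilEndTag(it, tag_lexer, 'title')
#   s[title_end:body_end]  # -> 'Hi'

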
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}
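
# CHAR_ENTITY covers only the named entities that ToText() handles today.
# A fuller table could be built from the stdlib; a hedged sketch, not used
# by this module:
#
#   import htmlentitydefs
#   FULL_CHAR_ENTITY = dict(
#       (name, unichr(code).encode('utf-8'))
#       for name, code in htmlentitydefs.name2codepoint.items())

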
def ToText(s, left_pos=0, right_pos=0):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/make_help.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = cStringIO.StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
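

# A minimal ToText sketch; the input is illustrative:
#
#   ToText('<b>x &amp; y</b> done')  # -> 'x & y done'
#
# Tags are dropped because only the RawData and CharEntity branches advance
# the output position.  Note that if the input ended with a tag, the final
# PrintTheRest() would copy that tag through verbatim, since nothing skips
# past it.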