#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oils library eventually.  It's a "lazily-parsed data
structure" like TSV8
"""
from __future__ import print_function

import cStringIO
import re
import sys


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """For bad lexical elements like <> or &&"""

    def __init__(self, s, pos):
        self.s = s
        self.pos = pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.pos:self.pos + 20])


class ParseError(Exception):
    """For errors in the tag structure."""

    def __init__(self, msg, *args):
        self.msg = msg
        self.args = args

    def __str__(self):
        return '(ParseError %s)' % (self.msg % self.args)


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=0):
        self.s = s
        self.f = f
        self.pos = left_pos
        if right_pos == 0:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output."""
        self.f.write(s)
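
# Illustrative sketch (not part of the library): how Output interleaves text
# copied from the input with newly printed text.  The positions below assume
# the 9-byte input '<b>hi</b>'.
#
#   f = cStringIO.StringIO()
#   out = Output('<b>hi</b>', f)
#   out.PrintUntil(3)       # copy '<b>' from the input
#   out.Print('HI')         # write new text
#   out.SkipTo(5)           # skip over 'hi' in the input
#   out.PrintTheRest()      # copy the remaining '</b>'
#   # f.getvalue() == '<b>HI</b>'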


# HTML Tokens
(Decl, Comment, Processing, StartTag, StartEndTag, EndTag, DecChar, HexChar,
 CharEntity, RawData, Invalid, EndOfStream) = range(12)


def MakeLexer(rules):
    return [
        # DOTALL is for the comment
        (re.compile(pat, re.VERBOSE | re.DOTALL), i) for (pat, i) in rules
    ]


#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid?  A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /      is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag  '>' /
# StartEnd = / '<'  Tag '/>' /
# Start    = / '<'  Tag  '>' /
#
# EntityRef = / '&' dot{* N} ';' /

LEXER = [
    # TODO: instead of nongreedy matches, the loop can just do .find('-->') and
    # .find('?>')

    # Actually non-greedy matches are regular and can be matched in linear time
    # with RE2.
    #
    # https://news.ycombinator.com/item?id=27099798
    #
    # Maybe try combining all of these for speed.
    (r'<!-- .*? -->', Comment),
    (r'<\? .*? \?>', Processing),

    # NOTE: < is allowed in these.
    (r'<! [^>]+ >', Decl),  # <!DOCTYPE html>
    (r'</ [^>]+ >', EndTag),  # end tag </a>
    (r'< [^>]+ />', StartEndTag),  # self-closing <br/>; must come BEFORE StartTag
    (r'< [^>]+  >', StartTag),  # start tag <a>
    (r'&\# [0-9]+ ;', DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', HexChar),
    (r'& [a-zA-Z]+ ;', CharEntity),

    # Note: > is allowed in raw data.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    (r'[^&<]+', RawData),
    (r'.', Invalid),  # error!
]

LEXER = MakeLexer(LEXER)


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    pos = left_pos
    if right_pos == 0:
        n = len(s)
    else:
        n = right_pos

    while pos < n:
        # Find the FIRST pattern that matches.
        for pat, tok_id in LEXER:
            m = pat.match(s, pos)
            if m:
                end_pos = m.end()
                yield tok_id, end_pos
                pos = end_pos
                break

    # Zero length sentinel
    yield EndOfStream, pos


def ValidTokens(s, left_pos=0, right_pos=0):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
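
# Illustrative sketch (not from the library): each token is (tok_id, end_pos),
# and a token's text is the slice between the previous end_pos and its own.
# Lexing the assumed input '<p>hi &amp; bye</p>' yields roughly:
#
#   for tok_id, end_pos in ValidTokens('<p>hi &amp; bye</p>'):
#       ...
#
#   (StartTag, 3)      '<p>'
#   (RawData, 6)       'hi '
#   (CharEntity, 11)   '&amp;'
#   (RawData, 15)      ' bye'
#   (EndTag, 19)       '</p>'
#   (EndOfStream, 19)  ''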


# To match <a  or </a
# <h2 but not <2h ?
_TAG_RE = re.compile(r'/? \s* ([a-zA-Z][a-zA-Z0-9]*)', re.VERBOSE)

# To match href="foo"

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
([a-z]+)                # Attribute name
(?:                     # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "        # double quoted value
  | ([a-zA-Z0-9_\-]+)   # Just allow unquoted "identifiers"
                        # TODO: relax this?  for href=$foo
  )
)?
''', re.VERBOSE)
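
# Illustrative example (an assumption, not from the library): matching
# _ATTR_RE repeatedly against ' href="/foo" selected' first yields
# group 1 = 'href' with group 2 = '/foo' (quoted value), then
# group 1 = 'selected' with no value groups (a bare attribute).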

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # For HasAttr()
                        #val = True

                        # Now try to get a real value
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):

                            # TODO: Unescape this with htmlentitydefs
                            # I think we need another lexer!
                            #
                            # We could make a single pass?
                            # Shortcut: 'if '&' in substring'
                            # Then we need to unescape it

                            val = start, end
                            break

        except StopIteration:
            pass
        return val

    def GetAttr(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError('Invalid HTML tag: %r' % self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next -- there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Skip past the "
            pos = m.end(0)
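
# Illustrative sketch (not part of the library): the positions passed to
# Reset() would normally come from ValidTokens(); they are hard-coded here
# for the assumed 22-byte input below.
#
#   s = '<a href="/" class=big>'
#   lex = TagLexer(s)
#   lex.Reset(0, len(s))
#   lex.TagName()            # -> 'a'
#   lex.GetAttr('href')      # -> '/'
#   lex.GetAttr('class')     # -> 'big'
#   lex.GetAttr('missing')   # -> None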


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r', tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r', tag_name)
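
# Illustrative sketch (an assumption about intended usage, not from the
# library): extract the body of a <pre> element by passing the SAME token
# iterator to both calls.  'html_str' is a hypothetical input string.
#
#   it = ValidTokens(html_str)
#   tag_lexer = TagLexer(html_str)
#   _, body_start = ReadUntilStartTag(it, tag_lexer, 'pre')
#   body_end, _ = ReadUntilEndTag(it, tag_lexer, 'pre')
#   body = html_str[body_start:body_end]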


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=0):
    """Given HTML, return text by unquoting > and < etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/make_help.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = cStringIO.StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == CharEntity:  # &

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
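
# Illustrative example (not part of the library):
#
#   ToText('x &amp; y &lt;z&gt;')   # -> 'x & y <z>'
#
# RawData is copied through unchanged, and each named entity is replaced via
# CHAR_ENTITY; numeric entities (&#65; / &#x41;) are not handled yet.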