OILS / doctools / cmark.py View on Github | oilshell.org

498 lines, 277 significant
1#!/usr/bin/env python2
2"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3and insert anchors.
4
5I started from cmark-0.28.3/wrappers/wrapper.py.
6"""
7from __future__ import print_function
8
9import ctypes
10import HTMLParser
11import json
12import optparse
13import os
14import pprint
15import sys
16
17from doctools import html_lib
18from doctools import doc_html # templates
19from doctools import oils_doc
20
21# Geez find_library returns the filename and not the path? Just hardcode it as
22# a workaround.
23# https://bugs.python.org/issue21042
24
25#from ctypes.util import find_library
26#libname = find_library("cmark")
27#assert libname, "cmark not found"
28
29# There's some ongoing discussion about how to deal with the same in Nix.
30# I think normally you'd just patch/substitute this path during the Nix build.
31# See note in shell.nix
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, highest priority first.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

# Take the first candidate that is set and exists on disk.  The generator is
# lazy, so later paths are not probed once one has been found.
libname = next((path for path in (cmark1, cmark2, cmark3)
                if path is not None and os.path.exists(path)), None)
if libname is None:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# Bind the one C entry point this module uses.  It is called as
# markdown(text, length, options) and its result is read back as a C string.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_long, ctypes.c_long]
52
53
def log(msg, *args):
    """Debug logger that is switched off in the source.

    The message is still %-formatted when args are given (so a bad format
    string fails loudly), but nothing is printed unless the flag below is
    flipped by hand.
    """
    text = msg % args if args else msg

    enabled = 0  # set to 1 to see debug output on stderr
    if enabled:
        print(text, file=sys.stderr)
60
61
# Version 0.29.0 disallowed raw HTML by default!
# This bit is passed as the options argument of cmark_markdown_to_html() by
# md2html().  NOTE(review): presumably mirrors CMARK_OPT_UNSAFE from cmark.h --
# confirm against the libcmark version in use.
CMARK_OPT_UNSAFE = (1 << 17)
64
65
def md2html(text):
    """Convert Markdown to HTML with libcmark.

    Args:
      text: Markdown source.  Its len() is passed to the C function as the
        input length, so it is expected to be a byte string.

    Returns:
      Whatever cmark_markdown_to_html() returns, with CMARK_OPT_UNSAFE set as
      the options argument.
    """
    return markdown(text, len(text), CMARK_OPT_UNSAFE)
70
71
def demo():
    """Smoke test: render a tiny Markdown snippet and write it to stdout."""
    rendered = md2html('*hi*')
    sys.stdout.write(rendered)
74
75
class TocExtractor(HTMLParser.HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number, and accumulate the markup and text inside the heading
    until its end tag.

    Later, the caller inserts two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc"> (or <div id="dense-toc">)
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth; only used to indent debug log output.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between <hN> and </hN>.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # html_parts is like innerHTML.  There can be <code> annotations and so
        # forth.  text_parts holds only the character data.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        """Note TOC <div> positions, capture nested markup, open headings."""
        if tag == 'div':
            # Exact match: the <div> must carry the id attribute and nothing
            # else.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags: the captured HTML ends up inside the
        # TOC's own <a href="...">.
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        # This check comes AFTER the capture above, so the heading's own start
        # tag is not part of its captured HTML.
        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        """Close headings and capture nested end tags."""
        # Debug print
        # NOTE: this runs for EVERY </div>, not only the TOC divs that
        # incremented self.indent, so the depth can go negative.  It only
        # affects debug log indentation.
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        # Stop capturing BEFORE the append below, so the heading's own end tag
        # is not part of its captured HTML.
        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, data):
        """
        From Python docs:
          This method is called to process decimal and hexadecimal numeric
          character references of the form &#NNN; and &#xNNN;.  The method
          receives e.g. '62' or 'x3E'.
        """
        # Same problem as handle_entityref(): without this, a heading that
        # contains say &#39; loses that character in the TOC entry.
        if self.capturing:
            self._AppendHtml('&#%s;' % data)

    def handle_data(self, data):
        """Accumulate character data of the current heading, if any."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
175
176
# CSS class given to each TOC entry, keyed by the heading tag it links to.
TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}

# We could just add <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
183
184
def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
                       preserve_anchor_case):
    """Build the insertions for the simple (flat) TOC style.

    The TOC is a flat list of <div> elements, for example

        <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>

    and indentation is done with CSS.

    Args:
      opts: parsed options; only opts.toc_pretty_href is read.
      toc_tags: HTML tags like ['h2', 'h3'] to SHOW in the TOC.  Anchors are
        generated for ALL headings, whether or not they are shown.
      headings: list of (line_num, tag, css_id, html_parts, text_parts), as
        collected in TocExtractor.headings.
      toc_pos: line number of the <div id="toc"> tag.
      preserve_anchor_case: passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html) pairs.  The first is the TOC itself, placed
      on the line after the <div>; the rest are <a name=""> targets, one entry
      per heading.
    """
    toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for i, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading

        numeric_href = 'toc_%d' % i
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # An explicit id written by the user (A FEW OLD BLOG POSTS do this)
        # wins; otherwise the TOC always links to the pretty version.  The
        # numeric version may still be a target, but is never linked.
        toc_href = css_id if css_id else pretty_href

        css_class = TAG_TO_CSS[tag]
        if tag in toc_tags:
            toc_lines.append(' <div class="%s"><a href="#%s">%s</a></div>\n' %
                             (css_class, toc_href, ''.join(html_parts)))

        if opts.toc_pretty_href:  # NEW WAY
            hrefs = [pretty_href]
        elif css_id:  # Old blog explicit
            hrefs = [css_id, numeric_href]
        else:  # Old blog implicit; include the NEW WAY too
            hrefs = [pretty_href, numeric_href]

        targets = ''.join(ANCHOR_FMT % href for href in hrefs)
        anchor_insertions.append((line_num, targets))

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insert = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insert] + anchor_insertions
248
249
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """Build the insertions for the dense-toc style with columns (doc/ref).

    The simple style emits one <div> per heading.  Here the headings are
    grouped into a two-level tree instead, one group per <h2>:

        <div id="dense-toc-title">In This Chapter</div>
        <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>   # NO BREAKING within this div

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

        </div>

    Args:
      headings: list of (line_num, tag, css_id, html_parts, text_parts), as
        collected in TocExtractor.headings.
      toc_pos: line number of the <div id="dense-toc"> tag.
      preserve_anchor_case: passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html) pairs.  The first is the TOC itself, placed
      on the line after the <div>; the rest are <a name=""> targets, one per
      heading (h4 and other non-h2/h3 headings get a target but no TOC entry).
    """
    # Two level tree: [(h2_html, h2_href, [(h3_html, h3_href), ...]), ...]
    groups = []
    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id or pretty_href
        inner_html = ''.join(html_parts)

        if tag == 'h2':
            groups.append((inner_html, toc_href, []))
        elif tag == 'h3':
            assert groups, "h3 shouldn't come before any h2"
            groups[-1][2].append((inner_html, toc_href))

        # Every heading gets a target <a name="">, always the pretty one.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children)
        toc_lines.append('</div>\n')
    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insert = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insert] + anchor_insertions
341
342
def _ApplyInsertions(lines, insertions, out_file):
    """Write lines to out_file, emitting each insertion before its line.

    The insertions don't have to be ordered by line number.  The callers put
    the TOC first even when a heading precedes the TOC <div>, and a heading
    may sit on the very line the TOC is inserted at.  A single forward-only
    cursor over the unsorted list would get stuck in both cases and silently
    drop every later insertion, so we sort first and drain everything that is
    due at each line.

    Args:
      lines: list of strings, each one written verbatim (newlines included).
      insertions: non-empty list of (line_num, text).  line_num is 1-based and
        text is written immediately BEFORE that line.  Insertions that share a
        line number are written in their original list order.
      out_file: any object with a write() method.

    Insertions that point past the last line are appended at the end rather
    than being dropped.
    """
    assert insertions, "Should be at least one insertion"

    # sorted() is stable, so the TOC stays ahead of an anchor on the same line.
    pending = sorted(insertions, key=lambda pair: pair[0])
    n = len(pending)
    j = 0

    for i, line in enumerate(lines):
        current_line = i + 1  # 1-based

        # Drain every insertion due at (or before) this line.
        while j < n and pending[j][0] <= current_line:
            out_file.write(pending[j][1])
            j += 1

        out_file.write(line)

    # Whatever is left targets a position after the last line.
    for _, s in pending[j:]:
        out_file.write(s)
358
359
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert Markdown from in_file to HTML and write it to out_file.

    Steps: Markdown -> HTML with md2html(), optional oils_doc processing
    (code extraction, comment removal, link expansion, highlighting), then TOC
    and anchor insertion if the document has a TOC <div>.

    Args:
      opts: parsed options; code_block_output and toc_tags are read here, and
        opts is passed on to _MakeTocInsertions().
      meta: dict of metadata; 'default_highlighter' and 'preserve_anchor_case'
        are read.
      in_file: file-like object to read Markdown from.
      out_file: file-like object to write HTML to.
      use_fastlex: if false, skip all of the oils_doc processing.
      debug_out: optional list passed to oils_doc.HighlightCode().
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as f:
                header = ('# %s: code blocks extracted from Markdown/HTML\n\n' %
                          opts.code_block_output)
                f.write(header)
                oils_doc.ExtractCode(html, f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        for marker in ('<p><pstrip>', '</pstrip></p>'):
            html = html.replace(marker, '')

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        default_highlighter = meta.get('default_highlighter')
        html = oils_doc.HighlightCode(html,
                                      default_highlighter,
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    simple_toc_line = parser.toc_begin_line
    dense_toc_line = parser.dense_toc_begin_line

    if simple_toc_line == -1 and dense_toc_line == -1:
        # No TOC <div> found: pass the HTML through unchanged.
        out_file.write(html)
        return

    # The simple style wins if both kinds of <div> are present.
    if simple_toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        simple_toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(parser.headings, dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
430
431
def Options():
    """Return a fresh optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    # TOC behavior
    parser.add_option(
        '--toc-pretty-href',
        action='store_true',
        default=False,
        help='Generate textual hrefs #like-this rather than like #toc10')
    parser.add_option(
        '--toc-tag',
        dest='toc_tags',
        action='append',
        default=[],
        help='h tags to include in the TOC, e.g. h2 h3')

    # Processing switches
    parser.add_option(
        '--disable-fastlex',
        dest='disable_fastlex',
        action='store_true',
        default=False,
        help='Hack for old blog posts')
    parser.add_option(
        '--code-block-output',
        dest='code_block_output',
        default=None,
        help='Extract and print code blocks to this file')

    return parser
457
458
# Metadata defaults.  main() copies this dict and then overlays whatever the
# JSON metadata file provides.
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}
461
462
def main(argv):
    """Command line entry point.

    Two calling conventions, chosen by the number of positional args
    (argv[0] counts as one):

      cmark.py [options] META_JSON CONTENT_MD
        Oil documentation: reads the Markdown file and wraps the rendered
        HTML in the doc_html header and footer.

      cmark.py [options] [META_JSON]
        Filter for blog and benchmarks: Markdown on stdin, HTML on stdout.
        The metadata file is optional.

    Args:
      argv: full argument list, including the program name.
    """
    parser = Options()
    opts, args = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    meta = dict(DEFAULT_META)

    if len(args) != 3:
        # Old style for blog: it's a filter.  Metadata is optional here.
        try:
            with open(args[1]) as f:
                meta.update(json.load(f))
        except IndexError:
            pass

        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oil documentation
    meta_path, content_path = args[1], args[2]
    with open(meta_path) as f:
        meta.update(json.load(f))

    # Docs have a special header and footer.
    with open(content_path) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)