doctools/cmark.py

OILS / doctools / cmark.py View on Github | oilshell.org

498 lines, 277 significant

1	#!/usr/bin/env python2
2	"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3	and insert anchors.
4
5	I started from cmark-0.28.3/wrappers/wrapper.py.
6	"""
7	from __future__ import print_function
8
9	import ctypes
10	import HTMLParser
11	import json
12	import optparse
13	import os
14	import pprint
15	import sys
16
17	from doctools import html_lib
18	from doctools import doc_html # templates
19	from doctools import oils_doc
20
# Geez find_library returns the filename and not the path?  Just hardcode it as
# a workaround.
# https://bugs.python.org/issue21042

#from ctypes.util import find_library
#libname = find_library("cmark")
#assert libname, "cmark not found"

# There's some ongoing discussion about how to deal with the same in Nix.
# I think normally you'd just patch/substitute this path during the Nix build.
# See note in shell.nix
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order: an explicit path
# from the environment, a path relative to the running script, and a fixed
# install path.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype, from cmark.h:
#
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
#
# The argument types below mirror it exactly.  (c_long happens to work on
# LP64 Linux, but it is the wrong width for size_t / int on other ABIs.)
#
# NOTE(review): with restype = c_char_p, ctypes copies the result into a
# Python string and discards the pointer, so the buffer cmark allocated is
# presumably never freed.  Harmless for a short-lived process -- confirm if
# this module is ever used from a long-running one.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
52
53
def log(msg, *args):
    """Format a debug message; printing it to stderr is currently switched off.

    Args:
      msg: The message, or a %-style format string when args are given.
      *args: Values interpolated into msg.  Interpolation happens even though
        output is disabled, so a bad format string still raises.
    """
    text = (msg % args) if args else msg

    if 0:  # change to 1 to see debug output
        print(text, file=sys.stderr)
60
61
# cmark 0.29.0 disallowed raw HTML by default; this option bit turns it back
# on.
CMARK_OPT_UNSAFE = 1 << 17
64
65
def md2html(text):
    """Convert Markdown source to HTML by calling into libcmark.

    Args:
      text: Markdown source; its len() is passed to C as the buffer length.

    Returns:
      The HTML produced by cmark_markdown_to_html(), called with
      CMARK_OPT_UNSAFE so that raw HTML in the input is allowed.
    """
    return markdown(text, len(text), CMARK_OPT_UNSAFE)
70
71
def demo():
    """Smoke test: render the Markdown string 'hi' and write the HTML to stdout."""
    html = md2html('hi')
    sys.stdout.write(html)
74
75
class TocExtractor(HTMLParser.HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth, used only to indent and gate the debug log output.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we're between an opening and closing heading tag.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # - id is the heading's explicit id="" attribute, or None.
        # - html_parts is like innerHTML, in pieces.  There can be <code>
        #   annotations and so forth.
        # - text_parts is only the character data, in pieces.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            # Only a div whose SOLE attribute is the id counts as a marker.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # Debug print
        # NOTE: this runs for EVERY </div>, while handle_starttag only
        # increments for the two TOC divs, so self.indent can go negative.
        # It only affects debug logging.
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
        This method is called to process a named character reference of the form
        &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """Keep numeric character references inside headings.

        Called for references of the form &#NNN; or &#xNNN;, with name being
        the part between '&#' and ';' (e.g. '39' or 'x27').  Without this
        override they would be dropped from the TOC entry, unlike the named
        references handled by handle_entityref().
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # Debug print
        if self.indent > 0:
            log(r'%s\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
175
176
# CSS class given to a TOC entry, keyed by the heading tag it links to.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# We could just add an <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
183
184
def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
                       preserve_anchor_case):
    """Build the insertions for the simple TOC style (one <div> per entry).

    Args:
      opts: Parsed options; only opts.toc_pretty_href is read here.
      toc_tags: Heading tags, e.g. ['h2', 'h3'], to SHOW in the TOC.  Anchor
        targets are generated for ALL headings regardless.
      headings: List of (line_num, tag, css_id, html_parts, text_parts), as
        collected by TocExtractor.
      toc_pos: Line number of the <div id="toc"> start tag.
      preserve_anchor_case: Passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html) pairs.  The first pair is the TOC itself,
      placed on the line after toc_pos.  The rest are the <a name=""> targets,
      one pair per heading, in heading order.
    """
    # The TOC is a flat list of entries like
    #
    #   <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
    #
    # and the indentation is done with CSS.
    toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for index, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading
        css_class = TAG_TO_CSS[tag]

        # Both kinds of href are generated, for stability of old links.
        numeric_href = 'toc_%d' % index
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # An explicit id written by the user (a few old blog posts) wins.
        # Otherwise the TOC always links to the pretty version; the numeric
        # one is still a target, but never linked from the TOC.
        toc_href = css_id if css_id else pretty_href

        entry = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
            css_class, toc_href, ''.join(html_parts))
        if tag in toc_tags:
            toc_lines.append(entry)

        if opts.toc_pretty_href:  # NEW WAY
            names = [pretty_href]
        elif css_id:  # Old blog explicit
            names = [css_id, numeric_href]
        else:  # Old blog implicit; include the NEW WAY too
            names = [pretty_href, numeric_href]

        anchors = ''.join([ANCHOR_FMT % name for name in names])
        anchor_insertions.append((line_num, anchors))

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insert = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insert] + anchor_insertions
248
249
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """Build the insertions for the dense-toc style with columns (doc/ref).

    The simple style outputs a div for every line:

      <div id="toctitle">Table of Contents</div>

      <div class="toclevel1"><a ...> Level 1 </a></div>
      <div class="toclevel2"><a ...> 1.A </a></div>
      <div class="toclevel2"><a ...> 1.B </a></div>
      <div class="toclevel1"><a ...> Level 2 </a></div>
      ...

    Here we group each h2 with its h3 children instead:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

      <div class="dense-toc-group">
        <a ...> Level 1 </a> <br/>

        <a class="dense-toc-h3" ...> 1.A </a> <br/>
        <a class="dense-toc-h3" ...> 1.B </a> <br/>

      </div>  # NO BREAKING within this div

      <div class="dense-toc-group">
        <a ...> Level 2 </a> <br/>
      </div>

      </div>

    Args:
      headings: List of (line_num, tag, css_id, html_parts, text_parts), as
        collected by TocExtractor.
      toc_pos: Line number of the <div id="dense-toc"> start tag.
      preserve_anchor_case: Passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html) pairs: the TOC first, placed on the line after
      toc_pos, then one <a name=""> target per heading.

    Raises:
      AssertionError: if an h3 appears before the first h2.
    """
    # Two level tree: list of (h2_html, h2_href, [(h3_html, h3_href), ...])
    heading_tree = []
    current_children = None  # children list of the most recent h2

    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the pretty
        # version.
        toc_href = css_id if css_id else pretty_href

        anchor_html = ''.join(html_parts)

        if tag == 'h2':
            current_children = []
            heading_tree.append((anchor_html, toc_href, current_children))
        elif tag == 'h3':
            assert current_children is not None, "h3 shouldn't come before any h2"
            current_children.append((anchor_html, toc_href))
        # Other heading tags get a target below, but no TOC entry.

        # Insert the target <a name="">
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(heading_tree))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in heading_tree:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend([
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children
        ])
        toc_lines.append('</div>\n')
    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insert = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insert] + anchor_insertions
341
342
def _ApplyInsertions(lines, insertions, out_file):
    """Write lines to out_file, splicing each insertion in before its line.

    Args:
      lines: The document, one string per line, newlines included.
      insertions: Non-empty list of (line_num, text) pairs.  text is written
        immediately before the 1-based line line_num.  The pairs need not be
        sorted, and several pairs may target the same line.
      out_file: Object with a write() method.

    Raises:
      AssertionError: if insertions is empty.
    """
    assert insertions, "Should be at least one insertion"

    # The TOC insertion always comes first in the list, even when a heading
    # precedes the TOC <div> or sits on the very next line, so the list is not
    # necessarily ordered by line.  A single forward cursor over an unordered
    # list would stall and silently drop every later anchor, so order it
    # first.  sorted() is stable: pairs for the same line keep their relative
    # order (TOC before anchor).
    pending = sorted(insertions, key=lambda pair: pair[0])
    n = len(pending)
    j = 0

    for i, line in enumerate(lines):
        current_line = i + 1  # 1-based

        # Drain everything due at or before this line, not just one entry.
        while j < n and pending[j][0] <= current_line:
            out_file.write(pending[j][1])
            j += 1

        out_file.write(line)

    # Anything positioned past the last line goes at the end instead of being
    # lost.
    for _, s in pending[j:]:
        out_file.write(s)
358
359
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert Markdown read from in_file to HTML and write it to out_file.

    If the HTML contains <div id="toc"> or <div id="dense-toc">, a table of
    contents is inserted after it and <a name=""> targets are inserted before
    the headings.  Otherwise the HTML is written unchanged.

    Args:
      opts: Parsed options.  Reads opts.code_block_output and opts.toc_tags
        here; also passed on to _MakeTocInsertions().
      meta: Dict of document metadata.  Reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: Readable file object with the Markdown source.
      out_file: Writable file object for the HTML.
      use_fastlex: If true, run the oils_doc passes (code extraction, comment
        removal, link expansion, code highlighting) on the HTML.
      debug_out: Optional list, passed to oils_doc.HighlightCode().
    """
    if debug_out is None:
        debug_out = []

    # Markdown -> HTML
    html = md2html(in_file.read())

    # HTML -> HTML, with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as f:
                f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                        code_path)
                oils_doc.ExtractCode(html, f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        for marker in ('<p><pstrip>', '</pstrip></p>'):
            html = html.replace(marker, '')

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    toc_line = parser.toc_begin_line
    dense_toc_line = parser.dense_toc_begin_line

    if toc_line == -1 and dense_toc_line == -1:
        # No TOC marker found: pass through
        out_file.write(html)
        return

    # The simple TOC takes precedence if both markers are present.
    if toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(parser.headings, dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(lines, insertions, out_file)
430
431
def Options():
    """Return the optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser(usage='cmark.py [options]')

    # Table of contents
    parser.add_option(
        '--toc-pretty-href',
        default=False,
        action='store_true',
        help='Generate textual hrefs #like-this rather than like #toc10')
    parser.add_option(
        '--toc-tag',
        default=[],
        action='append',
        dest='toc_tags',
        help='h tags to include in the TOC, e.g. h2 h3')

    parser.add_option(
        '--disable-fastlex',
        default=False,
        action='store_true',
        dest='disable_fastlex',
        help='Hack for old blog posts')

    parser.add_option(
        '--code-block-output',
        default=None,
        dest='code_block_output',
        help='Extract and print code blocks to this file')

    return parser
457
458
# Metadata used when the document's JSON doesn't override it: width 40 by
# default.
DEFAULT_META = dict(body_css_class='width40')
461
462
def main(argv):
    """Command line entry point.

    Two calling conventions, distinguished by the number of positional args
    (argv[0], the program name, counts as one):

      cmark.py [options] META_JSON CONTENT_MD
        Oil documentation.  Reads Markdown from CONTENT_MD and writes a full
        page to stdout, wrapped in doc_html.Header() / doc_html.Footer().

      cmark.py [options] [META_JSON]
        Filter for blog and benchmarks.  Reads Markdown from stdin and writes
        HTML to stdout.  The metadata file is optional.

    Args:
      argv: Full argument list, including the program name.
    """
    o = Options()
    opts, argv = o.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    meta = dict(DEFAULT_META)

    if len(argv) == 3:  # It's Oil documentation
        with open(argv[1]) as f:
            meta.update(json.load(f))

        # Docs have a special header and footer.
        with open(argv[2]) as content_f:
            doc_html.Header(meta, sys.stdout, draft_warning=True)
            Render(opts, meta, content_f, sys.stdout)
            doc_html.Footer(meta, sys.stdout)
        return

    # Filter for blog and for benchmarks.

    # Metadata is optional here.  Check for the argument explicitly rather than
    # catching IndexError around the whole open/parse, which could also hide
    # an unrelated IndexError raised while reading the file.
    if len(argv) > 1:
        with open(argv[1]) as f:
            meta.update(json.load(f))

    # Old style for blog: it's a filter
    Render(opts,
           meta,
           sys.stdin,
           sys.stdout,
           use_fastlex=not opts.disable_fastlex)


if __name__ == '__main__':
    main(sys.argv)