metrics/bytecode.R

OILS / metrics / bytecode.R View on Github | oilshell.org

388 lines, 204 significant

1	#!/usr/bin/env Rscript
2	#
3	# bytecode.R -- Analyze output of opyc dis-tables.
4	#
5	# Usage:
6	# bytecode.R ACTION IN_DIR OUT_DIR
7
8	library(dplyr)
9	library(tidyr) # spread()
10	library(stringr)
11
12	source('benchmarks/common.R')
13
# Session-wide settings used while printing the report.
options(
  stringsAsFactors = FALSE,
  # Make the report wide.  tibble.width doesn't appear to do this?
  width = 200,
  # Never truncate the rows of a printed tibble.
  tibble.print_max = Inf
)
19
# Print headline totals: number of files, total bytecode bytes, total
# instruction count, and the most frequent (path, code_name) pairs.
#
# Args:
#   ctx: list of data frames.  Reads ctx$frames (columns path, code_name,
#        bytecode_bytes) and ctx$ops (one row per instruction).
Basic <- function(ctx) {
  Banner('BASIC METRICS')

  # One row per distinct path, so the row count is the number of files.
  files <- count(ctx$frames, path)
  ShowValue('Number of files: %d', nrow(files))

  # 216K
  ShowValue('Total bytecode bytes: %d', sum(ctx$frames$bytecode_bytes))

  ShowValue('Total instructions: %d', nrow(ctx$ops))

  # Hm this isn't reliable because the code name isn't unique!  I think we
  # need firstlineno.
  dup_names <- ctx$frames %>%
    count(path, code_name) %>%
    arrange(desc(n)) %>%
    head()
  ShowFrame('Duplicate path/name', dup_names)
}
39
# Show the largest string constants, their total size, and a plot of the
# cumulative size when strings are ordered from largest to smallest.
#
# Args:
#   consts: data frame of constants.  Reads columns type and len_or_val.
BigStrings <- function(consts) {
  Banner('BIG STRINGS')

  # String constants only, biggest first.
  str_consts <- arrange(filter(consts, type == 'str'), desc(len_or_val))
  print(head(str_consts, 20))

  # 184 KB of strings!  That's just the payload; the header is probably more.
  ShowValue('total string bytes: %d', sum(str_consts$len_or_val))

  # This plot says:
  #
  # total bytes is 184 KB
  # - the top 10 strings sum to 20K bytes
  # - the top 100 strings sum to 30K bytes
  running_total <- cumsum(str_consts$len_or_val)
  plot(running_total)

  #plot(ecdf(strs$len_or_val))
}
61
# Show how many constants there are of each type, most common type first.
#
# Notes from an earlier run:
# - Strings dominate of course, but there are only 7 or so immutable types!
# - only 2 float constants.
# - get rid of the unicode constants in posixpath.
#
# Args:
#   consts: data frame of constants.  Reads column type.
Consts <- function(consts) {
  Banner('CONSTS')

  by_type <- count(consts, type)
  top_types <- head(arrange(by_type, desc(n)), 20)
  ShowFrame('Types of constants', top_types)
}
74
# Rank frames by number of consts, number of ops, stack size and number of
# locals.
#
# Args:
#   ctx: list of data frames.  Reads ctx$consts and ctx$ops (columns path,
#        code_name) and ctx$frames (columns stacksize, nlocals).
Frames <- function(ctx) {
  Banner('FRAMES')

  # count(..., sort = TRUE) orders by descending count.
  most_consts <- head(count(ctx$consts, path, code_name, sort = TRUE), 20)
  ShowFrame('Frames with many consts', most_consts)

  most_ops <- head(count(ctx$ops, path, code_name, sort = TRUE), 20)
  ShowFrame('Frames with many ops', most_ops)

  deepest_stack <- head(arrange(ctx$frames, desc(stacksize)), 10)
  ShowFrame('Frames with large stacksize', deepest_stack)

  most_locals <- head(arrange(ctx$frames, desc(nlocals)), 10)
  ShowFrame('Frames with many locals', most_locals)
}
91
# Extract the "kind" of an opcode name: the trailing run of capital letters,
# ignoring an optional '+N' (single digit) suffix.  E.g. FAST for LOAD_FAST,
# or SLICE for STORE_SLICE+1.
#
# str_match() returns a matrix in which column 1 is the whole match and
# column 2 is the first capture group -- like $0 and $1 in normal regexes.
OpKind <- function(op_name) {
  kind_pattern <- '([A-Z]+)(?:\\+[0-9])?$'
  matched <- str_match(op_name, kind_pattern)
  matched[, 2]
}
100
# Report opcode usage: frequency of each opcode, opcodes that are defined but
# never used, memory-related opcodes grouped by kind, import opcodes, and the
# instructions with the largest arguments.
#
# Args:
#   ops: data frame with one row per instruction.  Reads columns op_name and
#        op_arg.
#   ops_defined: path to a headerless, single-column table of all defined
#        opcode names.  Pass '' to skip the "Unused opcodes" table.
Ops <- function(ops, ops_defined = '_tmp/opcodes-defined.txt') {
  Banner('OPS')

  op_freq <- ops %>% count(op_name) %>% arrange(desc(n))
  ShowFrame('Ops Used by Frequency', op_freq)

  used <- ops %>% distinct(op_name)
  ShowValue('Total unique opcodes: %d', nrow(used))

  if (ops_defined != '') {
    defined <- read.table(ops_defined, header = FALSE)
    colnames(defined) <- c('op_name')

    # Rows of 'defined' that never appear among the used opcodes.
    unused <- setdiff(defined, used)
    ShowFrame('Unused opcodes:', unused)
  }

  # '|' is regex alternation and must be written unescaped: '\|' is not a
  # valid escape sequence in an R string literal, so it is a parse error.
  mem_ops <- op_freq %>%
    filter(str_detect(op_name, 'LOAD|STORE|FAST')) %>%
    mutate(kind = OpKind(op_name)) %>%
    arrange(kind) %>%
    select(kind, op_name, n)
  ShowFrame('Memory Operations:', mem_ops)

  # NOTE: got rid of IMPORT_STAR!
  imports <- ops %>% filter(str_detect(op_name, 'IMPORT')) %>% count(op_name)
  ShowFrame('Imports:', imports)

  # These are all the big jump targets!  Max is 3,852, which is a lot less
  # than 65,536.  We don't need EXTENDED_ARG!
  largest_args <- ops %>% arrange(desc(op_arg)) %>% head(10)
  ShowFrame('Large op_arg (jump targets):', largest_args)
}
135
# Show how often each code-object flag occurs, most common first.
#
# Args:
#   flags: data frame with a column named flag.
Flags <- function(flags) {
  Banner('FLAGS')

  flag_freq <- arrange(count(flags, flag), desc(n))
  ShowFrame('Common flags', flag_freq)
}
142
# Summarize the names table: most common kinds, most common names, and the
# total character length of all names versus distinct names.
#
# Args:
#   names: data frame with columns kind and name.
Names <- function(names) {
  Banner('NAMES')

  # Common types: free, cell, etc.
  top_kinds <- head(arrange(count(names, kind), desc(n)), 20)
  ShowFrame('Common types', top_kinds)

  # Common names:
  # self, None, True, False, append, len
  top_names <- head(arrange(count(names, name), desc(n)), 20)
  ShowFrame('Common names', top_names)

  # Length of every occurrence, and of each distinct name counted once.
  every_name <- mutate(names, len = nchar(name))
  distinct_names <- mutate(count(names, name), len = nchar(name))

  ShowValue('Total length of all %d names: %d',
            nrow(every_name), sum(every_name$len))
  ShowValue('Total length of %d unique names: %d',
            nrow(distinct_names), sum(distinct_names$len))
}
163
164	# Hm max unique ops is 58
165	# _build/oil/bytecode-opy/core/cmd_exec.pyc 54
166	# _build/oil/bytecode-opy/warnings.pyc 55
167	# _build/oil/bytecode-opy/_abcoll.pyc 58
168	#
169	# But there are 119 total opcodes. A lot of the math ones are uncommon.
170
171	# Written by opy/metrics.sh. Could get rid of that file.
UniqueOpsByFile <- function(ops) {
  # Count the distinct opcodes used by each file, list the files with the
  # fewest and the most, and report separately on files whose path contains
  # 'reader', 'lex' or 'parse'.
  #
  # Args:
  #   ops: data frame with one row per instruction.  Reads columns path and
  #        op_name.
  Banner('UNIQUE OPS')

  # '|' is regex alternation and must be written unescaped: '\|' is not a
  # valid escape sequence in an R string literal, so it is a parse error.
  parse_path_pat <- 'reader|lex|parse'

  # This is a row for every path/op_name
  path_ops <- ops %>% group_by(path) %>% distinct(op_name)
  ops_by_file <- path_ops %>% count(path) %>% arrange(n)

  fewest <- ops_by_file %>% head(20)
  ShowFrame('Files with few ops:', fewest)

  most <- ops_by_file %>% tail(10)
  ShowFrame('Files with many ops:', most)

  parse_files <- ops_by_file %>% filter(grepl(parse_path_pat, path))
  ShowFrame('Unique ops for files that just parse:', parse_files)  # 17, 23, 34, 34, 46

  string_ops <- ops %>%
    filter(grepl(parse_path_pat, path)) %>%
    distinct(op_name)
  ShowValue('Unique opcodes for parsing: %d', nrow(string_ops))
}
192
193	# OPy emits 88 distinct opcodes out of 119. Interesting.
194	# CPython emits 94 distinct opcodes.
195	# STORE_MAP and SETUP_WITH are the only differences. Is this for dict literals?
196	#
197	#
198	# setdiff(cpy$ops %>% distinct(op_name), opy$ops %>% distinct(op_name))
199	# op_name
200	# 1 STORE_MAP
201	# 2 SETUP_WITH
202	# 3 PRINT_ITEM
203	# 4 PRINT_NEWLINE
204	# 5 PRINT_ITEM_TO
205	# 6 PRINT_NEWLINE_TO
206
207	# Unused opcodes:
208	# op_name
209	# 1 BINARY_TRUE_DIVIDE
210	# 2 BUILD_SET
211	# 3 BUILD_SLICE
212	# 4 CONTINUE_LOOP
213	# 5 DELETE_ATTR
214	# 6 DELETE_GLOBAL
215	# 7 DELETE_SLICE+2
216	# 8 DELETE_SLICE+3
217	# 9 EXTENDED_ARG
218	# 10 INPLACE_DIVIDE
219	# 11 INPLACE_FLOOR_DIVIDE
220	# 12 INPLACE_LSHIFT
221	# 13 INPLACE_MODULO
222	# 14 INPLACE_OR
223	# 15 INPLACE_POWER
224	# 16 INPLACE_TRUE_DIVIDE
225	# 17 NOP
226	# 18 PRINT_EXPR
227	# 19 PRINT_ITEM
228	# 20 PRINT_ITEM_TO
229	# 21 PRINT_NEWLINE
230	# 22 PRINT_NEWLINE_TO
231	# 23 ROT_FOUR
232	# 24 SETUP_WITH
233	# 25 SET_ADD
234	# 26 STOP_CODE
235	# 27 STORE_MAP
236	# 28 STORE_SLICE+2
237	# 29 STORE_SLICE+3
238	# 30 UNARY_CONVERT
239	# 31 UNARY_POSITIVE
240
241
# Print every section of the metrics report, in a fixed order.
#
# Args:
#   ctx: list of data frames with elements frames, names, consts, flags, ops
#        (the list built by Load()).
Report <- function(ctx) {
  consts <- ctx$consts
  ops <- ctx$ops

  Basic(ctx)
  BigStrings(consts)

  Frames(ctx)
  Names(ctx$names)
  Consts(consts)
  Flags(ctx$flags)

  Ops(ops)
  UniqueOpsByFile(ops)
}
254
# Read the five tables found in in_dir into a named list.
#
# Args:
#   in_dir: directory containing frames.tsv2, names.tsv2, consts.tsv2,
#           flags.tsv2 and ops.tsv2, each with a header row.
#
# Returns:
#   A list of data frames named frames, names, consts, flags, ops.
Load <- function(in_dir) {
  read_one <- function(file_name) {
    read.table(file.path(in_dir, file_name), header = TRUE)
  }

  ctx <- list()
  ctx$frames <- read_one('frames.tsv2')
  ctx$names <- read_one('names.tsv2')
  ctx$consts <- read_one('consts.tsv2')
  ctx$flags <- read_one('flags.tsv2')
  ctx$ops <- read_one('ops.tsv2')
  ctx
}
264
# Compare the size of each .py source file with the size of its .pyc file.
#
# Args:
#   all_deps_py: path to a headerless two-column table of (py_path, pyc_path).
#   pyc_base_dir: directory that every pyc_path is joined onto.
#
# Returns:
#   The table with py_bytes, pyc_bytes and ratio (pyc_bytes / py_bytes)
#   columns added, rows whose py_bytes is 0 removed, sorted by ascending ratio.
FileSizes <- function(all_deps_py, pyc_base_dir) {
  sizes <- read.table(all_deps_py, header = FALSE)
  colnames(sizes) <- c('py_path', 'pyc_path')

  # file.info() accepts a vector of paths and reports one size per path.
  sizes$py_bytes <- file.info(sizes$py_path)$size
  sizes$pyc_bytes <- file.info(file.path(pyc_base_dir, sizes$pyc_path))$size

  # Zero-byte sources are dropped before dividing by py_bytes.
  by_ratio <- sizes %>%
    filter(py_bytes != 0) %>%
    mutate(ratio = pyc_bytes / py_bytes) %>%
    arrange(ratio)

  Banner('RATIO')

  ShowFrame('small .pyc files:', head(by_ratio, 10))
  ShowFrame('big .pyc files:', tail(by_ratio, 10))

  # This ratio is a little misleading because it counts comments.
  py_total <- sum(by_ratio$py_bytes)
  pyc_total <- sum(by_ratio$pyc_bytes)

  ShowValue('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
  ShowValue('Ratio: %f', pyc_total / py_total)

  Banner('FULL LISTING')

  listing <- by_ratio %>%
    select(c(pyc_bytes, pyc_path)) %>%
    arrange(desc(pyc_bytes))
  ShowFrame('bytecode', listing)
  ShowValue('total (again): %d', pyc_total)

  by_ratio
}
303
304
# Row counts of the five tables in ctx, in the order
# frames, names, consts, flags, ops.
#
# Args:
#   ctx: list of data frames (as built by Load()).
#
# Returns:
#   An unnamed integer vector with one row count per table.
CompareCol <- function(ctx) {
  tables <- list(ctx$frames, ctx$names, ctx$consts, ctx$flags, ctx$ops)
  unlist(lapply(tables, nrow))
}
313
# Print a side-by-side comparison of two loaded bytecode data sets: table row
# counts, cell variables, closure opcodes, and a few rare opcodes.
#
# Args:
#   cpython_ctx: list of data frames (as built by Load()) for CPython.
#   opy_ctx: list of data frames (as built by Load()) for OPy.
Compare <- function(cpython_ctx, opy_ctx) {
  Banner('CPYTHON vs. OPY')

  overview <- tibble(
    table_name = c('frames', 'names', 'consts', 'flags', 'ops'),
    cpython = CompareCol(cpython_ctx),
    opy = CompareCol(opy_ctx)
  )
  ShowFrame('Overview', overview)

  Banner('Cell Variables')

  cpython_cells <- filter(cpython_ctx$names, kind == 'cell')
  opy_cells <- filter(opy_ctx$names, kind == 'cell')
  ShowFrame('CPython', cpython_cells)
  ShowFrame('OPy', opy_cells)

  Banner('CLOSURE bytecodes')

  cpython_closures <- filter(cpython_ctx$ops,
                             op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  opy_closures <- filter(opy_ctx$ops,
                         op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  ShowFrame('CPython', cpython_closures)
  ShowFrame('OPy', opy_closures)

  Banner('Rare bytecodes')

  cpython_delete_fast <- filter(cpython_ctx$ops, op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in CPython', cpython_delete_fast)

  opy_delete_fast <- filter(opy_ctx$ops, op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in OPy', opy_delete_fast)

  # These are all for the global util.GetResourceLoader().
  opy_store_global <- filter(opy_ctx$ops, op_name == 'STORE_GLOBAL')
  ShowFrame('STORE_GLOBAL in OPy', opy_store_global)

  # In asdl/unpickle.py.
  opy_store_slice <- filter(opy_ctx$ops, op_name == 'STORE_SLICE+1')
  ShowFrame('STORE_SLICE+1 in OPy', opy_store_slice)
}
359
# Dispatch on the ACTION given as the first command line argument.
#
# Args:
#   argv: character vector of command line arguments:
#     metrics IN_DIR                          -- print the full report
#     compare CPYTHON_DIR OPY_DIR             -- compare two data sets
#     src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR  -- .py vs .pyc file sizes
#
# Exits with status 1 on a missing argument or an unknown action.
main <- function(argv) {
  # Fail with a usage message rather than an opaque "subscript out of bounds"
  # error when too few arguments were given.
  need_args <- function(n, usage) {
    if (length(argv) < n) {
      Log('Usage: bytecode.R %s', usage)
      quit(status = 1)
    }
  }

  need_args(1, 'ACTION ARGS...')
  action <- argv[[1]]

  if (action == 'metrics') {
    need_args(2, 'metrics IN_DIR')
    ctx <- Load(argv[[2]])
    Report(ctx)

  } else if (action == 'compare') {
    need_args(3, 'compare CPYTHON_DIR OPY_DIR')
    cpython_ctx <- Load(argv[[2]])
    opy_ctx <- Load(argv[[3]])
    Compare(cpython_ctx, opy_ctx)

  } else if (action == 'src-bin-ratio') {  # This takes different inputs
    need_args(3, 'src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR')
    # invisible() keeps the returned table from being auto-printed when main()
    # is called at top level; the report has already been printed.
    invisible(FileSizes(argv[[2]], argv[[3]]))

  } else {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }
}
383
# Script entry point: call main() only when no function call frames are
# active, i.e. this code is being evaluated at top level (presumably when run
# via Rscript rather than source()d from another file).
if (length(sys.frames()) == 0L) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))
  main(commandArgs(trailingOnly = TRUE))
}