1 | #!/usr/bin/env Rscript
|
2 | #
|
3 | # bytecode.R -- Analyze output of opyc dis-tables.
|
4 | #
|
# Usage:
#   bytecode.R metrics IN_DIR
#   bytecode.R compare CPYTHON_DIR OPY_DIR
#   bytecode.R src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR
|
7 |
|
8 | library(dplyr)
|
9 | library(tidyr) # spread()
|
10 | library(stringr)
|
11 |
|
12 | source('benchmarks/common.R')
|
13 |
|
# Global settings for the whole report.
options(
  stringsAsFactors = FALSE,
  # Make the report wide.  tibble.width doesn't appear to do this?
  width = 200,
  tibble.print_max = Inf
)
|
19 |
|
# Print headline numbers: file count, total bytecode bytes, total instruction
# count, and the most frequently repeated (path, code_name) pairs.
#
# Args:
#   ctx: list of tables; this function reads ctx$frames and ctx$ops.
Basic = function(ctx) {
  Banner('BASIC METRICS')

  frames = ctx$frames

  # Number of files: one row per distinct path.
  files = frames %>% count(path)
  ShowValue('Number of files: %d', nrow(files))

  # 216K
  total_bytes = sum(frames$bytecode_bytes)
  ShowValue('Total bytecode bytes: %d', total_bytes)

  # One row of ctx$ops per instruction.
  ShowValue('Total instructions: %d', nrow(ctx$ops))

  # Hm this isn't reliable because the code name isn't unique!  I think we need
  # firstlineno
  duplicates = frames %>%
    count(path, code_name) %>%
    arrange(desc(n)) %>%
    head()
  ShowFrame('Duplicate path/name', duplicates)
}
|
39 |
|
# Show the 20 largest string constants, the total size of all string
# constants, and a plot of the cumulative size (largest strings first).
#
# Args:
#   consts: table of constants; rows with type == 'str' are used, and
#     len_or_val is summed as their size.
BigStrings = function(consts) {
  Banner('BIG STRINGS')

  # String constants only, biggest first.
  by_size = consts %>%
    filter(type == 'str') %>%
    arrange(desc(len_or_val))
  print(head(by_size, 20))

  sizes = by_size$len_or_val

  # 184 KB of strings!  That's just the payload; the header is probably more.
  ShowValue('total string bytes: %d', sum(sizes))

  # This plot says:
  #
  # total bytes is 184 KB
  # - the top 10 strings sum to 20K bytes
  # - the top 100 strings sum to 30K bytes

  # The variable is deliberately named 'cum': plot() derives its axis label
  # from the expression it is given.
  cum = cumsum(sizes)
  plot(cum)

  #plot(ecdf(strs$len_or_val))
}
|
61 |
|
# Show how many constants there are of each type, most common type first
# (at most 20 rows).
#
# Args:
#   consts: table of constants with a 'type' column.
Consts = function(consts) {
  Banner('CONSTS')

  # Count of types of constants.  Strings dominate of course.
  # But there are only 7 or so immutable types!
  #
  # - only 2 float constants.
  # - get rid of the unicode constants in posixpath.

  type_freq = consts %>%
    count(type) %>%
    arrange(desc(n)) %>%
    head(20)
  ShowFrame('Types of constants', type_freq)
}
|
74 |
|
# Rank frames by number of consts, number of ops, stack size, and number of
# locals.
#
# Args:
#   ctx: list of tables; this function reads ctx$consts, ctx$ops and
#     ctx$frames.
Frames = function(ctx) {
  Banner('FRAMES')

  # count(..., sort = TRUE) puts the largest groups first.
  most_consts = ctx$consts %>%
    count(path, code_name, sort = TRUE) %>%
    head(20)
  ShowFrame('Frames with many consts', most_consts)

  most_ops = ctx$ops %>%
    count(path, code_name, sort = TRUE) %>%
    head(20)
  ShowFrame('Frames with many ops', most_ops)

  deepest_stack = ctx$frames %>%
    arrange(desc(stacksize)) %>%
    head(10)
  ShowFrame('Frames with large stacksize', deepest_stack)

  most_locals = ctx$frames %>%
    arrange(desc(nlocals)) %>%
    head(10)
  ShowFrame('Frames with many locals', most_locals)
}
|
91 |
|
# OpKind is the trailing upper-case word of an opcode name: FAST for
# LOAD_FAST, or SLICE for STORE_SLICE+1.
#
# str_match() returns a matrix: column 1 is the whole match and column 2 is
# the first capture group.  Like $0 and $1 in normal regexes.
OpKind = function(op_name) {
  # The non-capturing group permits an optional +N suffix (a single digit).
  matched = str_match(op_name, '([A-Z]+)(?:\\+[0-9])?$')
  matched[, 2]
}
|
100 |
|
# Report on opcode usage: frequency of each opcode, opcodes that are defined
# but never used, memory-related and import-related opcodes, and the
# instructions with the largest arguments.
#
# Args:
#   ops: table of instructions; op_name and op_arg columns are used.
#   ops_defined: path of a file read with read.table() whose single column is
#     taken as the list of defined opcode names.  Pass '' to skip the
#     'Unused opcodes' section.
Ops = function(ops, ops_defined = '_tmp/opcodes-defined.txt') {
  Banner('OPS')

  op_freq = ops %>%
    count(op_name) %>%
    arrange(desc(n))
  ShowFrame('Ops Used by Frequency', op_freq)

  used = ops %>% distinct(op_name)
  ShowValue('Total unique opcodes: %d', nrow(used))

  if (ops_defined != '') {
    defined = read.table(ops_defined, header = FALSE)
    colnames(defined) = 'op_name'

    # Rows of 'defined' that never appear in 'used'.
    unused = setdiff(defined, used)
    ShowFrame('Unused opcodes:', unused)
  }

  mem_ops = op_freq %>%
    filter(str_detect(op_name, 'LOAD|STORE|FAST')) %>%
    mutate(kind = OpKind(op_name)) %>%
    arrange(kind) %>%
    select(kind, op_name, n)
  ShowFrame('Memory Operations:', mem_ops)

  # NOTE: got rid of IMPORT_STAR!
  imports = ops %>%
    filter(str_detect(op_name, 'IMPORT')) %>%
    count(op_name)
  ShowFrame('Imports:', imports)

  # These are all the big jump targets!  Max is 3,852, which is a lot less than
  # 65,536.  We don't need EXTENDED_ARG!
  largest_args = ops %>%
    arrange(desc(op_arg)) %>%
    head(10)
  ShowFrame('Large op_arg (jump targets):', largest_args)
}
|
135 |
|
# Show how often each flag occurs, most common first.
#
# Args:
#   flags: table with a 'flag' column.
Flags = function(flags) {
  Banner('FLAGS')

  flag_freq = arrange(count(flags, flag), desc(n))
  ShowFrame('Common flags', flag_freq)
}
|
142 |
|
# Report on names: the most common kinds, the most common names, and the
# total character length of all names versus distinct names.
#
# Args:
#   names: table with 'kind' and 'name' columns.
Names = function(names) {
  Banner('NAMES')

  # Common types: free, cell, etc.
  kind_freq = names %>%
    count(kind) %>%
    arrange(desc(n)) %>%
    head(20)
  ShowFrame('Common types', kind_freq)

  # One row per distinct name; reused for the top-20 list and the totals.
  name_counts = names %>% count(name)

  # Common names:
  # self, None, True, False, append, len
  top_names = name_counts %>%
    arrange(desc(n)) %>%
    head(20)
  ShowFrame('Common names', top_names)

  # Locals avoid the identifiers 'all' and 'unique', which are base functions.
  every_name = names %>% mutate(len = nchar(name))
  distinct_names = name_counts %>% mutate(len = nchar(name))

  ShowValue('Total length of all %d names: %d',
            nrow(every_name), sum(every_name$len))
  ShowValue('Total length of %d unique names: %d',
            nrow(distinct_names), sum(distinct_names$len))
}
|
163 |
|
164 | # Hm max unique ops is 58
|
165 | # _build/oil/bytecode-opy/core/cmd_exec.pyc 54
|
166 | # _build/oil/bytecode-opy/warnings.pyc 55
|
167 | # _build/oil/bytecode-opy/_abcoll.pyc 58
|
168 | #
|
169 | # But there are 119 total opcodes. A lot of the math ones are uncommon.
|
170 |
|
171 | # Written by opy/metrics.sh. Could get rid of that file.
|
# Count the distinct opcodes used by each file, and by the files whose path
# matches 'reader|lex|parse'.
#
# Args:
#   ops: table of instructions with 'path' and 'op_name' columns.
UniqueOpsByFile = function(ops) {
  Banner('UNIQUE OPS')

  # This is a row for every path/op_name
  path_ops = ops %>%
    group_by(path) %>%
    distinct(op_name)

  # Ascending, so the files with the fewest distinct opcodes come first.
  ops_by_file = path_ops %>%
    count(path) %>%
    arrange(n)

  fewest = head(ops_by_file, 20)
  ShowFrame('Files with few ops:', fewest)

  most = tail(ops_by_file, 10)
  ShowFrame('Files with many ops:', most)

  parse_files = ops_by_file %>% filter(grepl('reader|lex|parse', path))
  ShowFrame('Unique ops for files that just parse:', parse_files) # 17, 23, 34, 34, 46

  string_ops = ops %>%
    filter(grepl('reader|lex|parse', path)) %>%
    distinct(op_name)
  ShowValue('Unique opcodes for parsing: %d', nrow(string_ops))
}
|
192 |
|
193 | # OPy emits 88 distinct opcodes out of 119. Interesting.
|
194 | # CPython emits 94 distinct opcodes.
|
195 | # STORE_MAP and SETUP_WITH are the only differences. Is this for dict literals?
|
196 | #
|
197 | #
|
198 | # setdiff(cpy$ops %>% distinct(op_name), opy$ops %>% distinct(op_name))
|
199 | # op_name
|
200 | # 1 STORE_MAP
|
201 | # 2 SETUP_WITH
|
202 | # 3 PRINT_ITEM
|
203 | # 4 PRINT_NEWLINE
|
204 | # 5 PRINT_ITEM_TO
|
205 | # 6 PRINT_NEWLINE_TO
|
206 |
|
207 | # Unused opcodes:
|
208 | # op_name
|
209 | # 1 BINARY_TRUE_DIVIDE
|
210 | # 2 BUILD_SET
|
211 | # 3 BUILD_SLICE
|
212 | # 4 CONTINUE_LOOP
|
213 | # 5 DELETE_ATTR
|
214 | # 6 DELETE_GLOBAL
|
215 | # 7 DELETE_SLICE+2
|
216 | # 8 DELETE_SLICE+3
|
217 | # 9 EXTENDED_ARG
|
218 | # 10 INPLACE_DIVIDE
|
219 | # 11 INPLACE_FLOOR_DIVIDE
|
220 | # 12 INPLACE_LSHIFT
|
221 | # 13 INPLACE_MODULO
|
222 | # 14 INPLACE_OR
|
223 | # 15 INPLACE_POWER
|
224 | # 16 INPLACE_TRUE_DIVIDE
|
225 | # 17 NOP
|
226 | # 18 PRINT_EXPR
|
227 | # 19 PRINT_ITEM
|
228 | # 20 PRINT_ITEM_TO
|
229 | # 21 PRINT_NEWLINE
|
230 | # 22 PRINT_NEWLINE_TO
|
231 | # 23 ROT_FOUR
|
232 | # 24 SETUP_WITH
|
233 | # 25 SET_ADD
|
234 | # 26 STOP_CODE
|
235 | # 27 STORE_MAP
|
236 | # 28 STORE_SLICE+2
|
237 | # 29 STORE_SLICE+3
|
238 | # 30 UNARY_CONVERT
|
239 | # 31 UNARY_POSITIVE
|
240 |
|
241 |
|
# Print every section of the metrics report, in a fixed order.
#
# Args:
#   ctx: list of tables, as returned by Load().
Report = function(ctx) {
  Basic(ctx)

  consts = ctx$consts
  BigStrings(consts)

  Frames(ctx)
  Names(ctx$names)
  Consts(consts)
  Flags(ctx$flags)

  ops = ctx$ops
  Ops(ops)
  UniqueOpsByFile(ops)
}
|
254 |
|
# Read the five .tsv2 tables found in a directory.
#
# Args:
#   in_dir: directory containing frames.tsv2, names.tsv2, consts.tsv2,
#     flags.tsv2 and ops.tsv2.
#
# Returns:
#   A list with elements frames, names, consts, flags and ops, each a data
#   frame read with read.table(header = TRUE).
Load = function(in_dir) {
  ReadTable = function(table_name) {
    tsv_path = file.path(in_dir, paste0(table_name, '.tsv2'))
    read.table(tsv_path, header = TRUE)
  }

  list(
    frames = ReadTable('frames'),
    names = ReadTable('names'),
    consts = ReadTable('consts'),
    flags = ReadTable('flags'),
    ops = ReadTable('ops')
  )
}
|
264 |
|
265 | # This takes a table of (py_path, pyc_path) and calls file.info()$size on both.
|
266 | # Then it computes the ratio.
|
267 |
|
# Compare the on-disk size of each .py file with that of its .pyc file.
#
# Args:
#   all_deps_py: path of a headerless table whose two columns are taken as
#     (py_path, pyc_path).
#   pyc_base_dir: directory that each pyc_path is joined onto.
#
# Returns:
#   The table with py_bytes, pyc_bytes and ratio columns added, restricted to
#   rows where py_bytes != 0, sorted by ascending ratio.
FileSizes = function(all_deps_py, pyc_base_dir) {
  py_pyc = read.table(all_deps_py, header = FALSE)
  colnames(py_pyc) = c('py_path', 'pyc_path')

  # py_path is used as given; pyc_path is relative to pyc_base_dir.
  py_pyc$py_bytes = file.info(py_pyc$py_path)$size
  py_pyc$pyc_bytes = file.info(file.path(pyc_base_dir, py_pyc$pyc_path))$size

  # Dropping py_bytes == 0 rows avoids dividing by zero.
  py_pyc = py_pyc %>%
    filter(py_bytes != 0) %>%
    mutate(ratio = pyc_bytes / py_bytes) %>%
    arrange(ratio)

  Banner('RATIO')

  smallest = head(py_pyc, 10)
  ShowFrame('small .pyc files:', smallest)

  biggest = tail(py_pyc, 10)
  ShowFrame('big .pyc files:', biggest)

  # This ratio is a little misleading because it counts comments.
  py_total = sum(py_pyc$py_bytes)
  pyc_total = sum(py_pyc$pyc_bytes)

  ShowValue('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
  ShowValue('Ratio: %f', pyc_total / py_total)

  Banner('FULL LISTING')

  listing = py_pyc %>%
    select(pyc_bytes, pyc_path) %>%
    arrange(desc(pyc_bytes))
  ShowFrame('bytecode', listing)
  ShowValue('total (again): %d', pyc_total)

  py_pyc
}
|
303 |
|
304 |
|
# Row counts of the tables in a context, in the order
# frames, names, consts, flags, ops.
#
# Args:
#   ctx: list of tables, as returned by Load().
#
# Returns:
#   An unnamed integer vector of row counts.
CompareCol = function(ctx) {
  tables = list(ctx$frames, ctx$names, ctx$consts, ctx$flags, ctx$ops)
  unlist(lapply(tables, nrow))
}
|
313 |
|
# Print a side-by-side comparison of two contexts: table sizes, cell
# variables, closure opcodes, and a few rarely used opcodes.
#
# Args:
#   cpython_ctx: list of tables, as returned by Load().
#   opy_ctx: list of tables, as returned by Load().
Compare = function(cpython_ctx, opy_ctx) {
  Banner('CPYTHON vs. OPY')

  # table_name must list the tables in the same order as CompareCol().
  overview = tibble(
    table_name = c('frames', 'names', 'consts', 'flags', 'ops'),
    cpython = CompareCol(cpython_ctx),
    opy = CompareCol(opy_ctx)
  )
  ShowFrame('Overview', overview)

  Banner('Cell Variables')

  cpython_cells = cpython_ctx$names %>% filter(kind == 'cell')
  opy_cells = opy_ctx$names %>% filter(kind == 'cell')

  ShowFrame('CPython', cpython_cells)
  ShowFrame('OPy', opy_cells)

  Banner('CLOSURE bytecodes')

  cpython_closures = cpython_ctx$ops %>%
    filter(op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  opy_closures = opy_ctx$ops %>%
    filter(op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))

  ShowFrame('CPython', cpython_closures)
  ShowFrame('OPy', opy_closures)

  Banner('Rare bytecodes')

  cpython_delete_fast = cpython_ctx$ops %>% filter(op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in CPython', cpython_delete_fast)

  opy_delete_fast = opy_ctx$ops %>% filter(op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in OPy', opy_delete_fast)

  # These are all for the global util.GetResourceLoader().
  opy_store_global = opy_ctx$ops %>% filter(op_name == 'STORE_GLOBAL')
  ShowFrame('STORE_GLOBAL in OPy', opy_store_global)

  # In asdl/unpickle.py.
  opy_store_slice = opy_ctx$ops %>% filter(op_name == 'STORE_SLICE+1')
  ShowFrame('STORE_SLICE+1 in OPy', opy_store_slice)
}
|
359 |
|
# Entry point: run the action named by the first command line argument.
#
# Args:
#   argv: character vector of command line arguments, one of:
#     metrics IN_DIR
#     compare CPYTHON_DIR OPY_DIR
#     src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR
#
# Logs a message and exits with status 1 when the action is missing or
# unknown, or when it is given too few arguments.  (Previously a short argv
# failed with a bare 'subscript out of bounds' error.)
main = function(argv) {
  if (length(argv) < 1) {
    Log('%s', 'Usage: bytecode.R ACTION ARGS...')
    quit(status = 1)
  }
  action = argv[[1]]

  # Number of arguments each action needs after ACTION; NULL when unknown.
  num_required = switch(action,
    'metrics' = 1L,
    'compare' = 2L,
    'src-bin-ratio' = 2L,
    NULL
  )
  if (is.null(num_required)) {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }

  num_given = length(argv) - 1L
  if (num_given < num_required) {
    Log("Action '%s' expects %s argument(s), got %s",
        action, num_required, num_given)
    quit(status = 1)
  }

  if (action == 'metrics') {
    in_dir = argv[[2]]
    ctx = Load(in_dir)
    Report(ctx)

  } else if (action == 'compare') {
    cpython_ctx = Load(argv[[2]])
    opy_ctx = Load(argv[[3]])
    Compare(cpython_ctx, opy_ctx)

  } else {  # 'src-bin-ratio'.  This takes different inputs
    all_deps_py = argv[[2]]
    pyc_base_dir = argv[[3]]
    # invisible() keeps the returned table from being auto-printed when main()
    # is called at top level; FileSizes() has already printed its report.
    invisible(FileSizes(all_deps_py, pyc_base_dir))
  }
}
|
383 |
|
# Run main() only at top level, i.e. when no function call frames are active
# (as when the file is run with Rscript), and not when it is source()d.
if (sys.nframe() == 0L) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))
  main(commandArgs(trailingOnly = TRUE))
}
|