| 1 | #!/usr/bin/env Rscript
|
| 2 | #
|
| 3 | # bytecode.R -- Analyze output of opyc dis-tables.
|
| 4 | #
|
| 5 | # Usage:
|
| 6 | # bytecode.R ACTION IN_DIR OUT_DIR
|
| 7 |
|
| 8 | library(dplyr)
|
| 9 | library(tidyr) # spread()
|
| 10 | library(stringr)
|
| 11 |
|
| 12 | source('benchmarks/common.R')
|
| 13 |
|
# Global settings for the report output.
options(
  # Keep strings as character vectors in data frames.  Spelled FALSE rather
  # than F, because F is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE,
  # Make the report wide.  tibble.width doesn't appear to do this?
  width = 200,
  # Never truncate the rows of a printed tibble.
  tibble.print_max = Inf
)
|
| 19 |
|
Basic = function(ctx) {
  # Show headline totals for the loaded tables: number of files, bytecode
  # bytes, instruction count, and the most repeated (path, code_name) pairs.
  Banner('BASIC METRICS')

  # One row per distinct path, so the row count is the number of files.
  files = count(ctx$frames, path)
  ShowValue('Number of files: %d', nrow(files))

  # 216K
  total_bytes = sum(ctx$frames$bytecode_bytes)
  ShowValue('Total bytecode bytes: %d', total_bytes)

  # Every row of the ops table is one instruction.
  ShowValue('Total instructions: %d', nrow(ctx$ops))

  # Hm this isn't reliable because the code name isn't unique!  I think we
  # need firstlineno.
  repeated = head(arrange(count(ctx$frames, path, code_name), desc(n)))
  ShowFrame('Duplicate path/name', repeated)
}
|
| 39 |
|
BigStrings = function(consts) {
  # Show the largest string constants, their total size, and a plot of the
  # cumulative size when strings are ordered largest first.
  Banner('BIG STRINGS')

  # String constants only, largest first.
  str_consts = arrange(filter(consts, type == 'str'), desc(len_or_val))
  print(head(str_consts, 20))

  # 184 KB of strings!  That's just the payload; the header is probably more.
  ShowValue('total string bytes: %d', sum(str_consts$len_or_val))

  # This plot says:
  #
  # total bytes is 184 KB
  # - the top 10 strings sum to 20K bytes
  # - the top 100 strings sum to 30K bytes
  #
  # NOTE: the variable name doubles as the plot's axis label.
  cum = cumsum(str_consts$len_or_val)
  plot(cum)

  # Alternative view: plot(ecdf(str_consts$len_or_val))
}
|
| 61 |
|
Consts = function(consts) {
  # Show how many constants there are of each type, most common first.
  Banner('CONSTS')

  # Strings dominate of course.  But there are only 7 or so immutable types!
  #
  # - only 2 float constants.
  # - get rid of the unicode constants in posixpath.
  type_counts = head(arrange(count(consts, type), desc(n)), 20)
  ShowFrame('Types of constants', type_counts)
}
|
| 74 |
|
# Frames ranked by number of consts, number of ops, stack size, and number of
# locals.
Frames = function(ctx) {
  Banner('FRAMES')

  # Rows of the consts table per (path, code_name), largest count first.
  by_consts = head(count(ctx$consts, path, code_name, sort = TRUE), 20)
  ShowFrame('Frames with many consts', by_consts)

  # Same ranking over the ops table.
  by_ops = head(count(ctx$ops, path, code_name, sort = TRUE), 20)
  ShowFrame('Frames with many ops', by_ops)

  by_stacksize = head(arrange(ctx$frames, desc(stacksize)), 10)
  ShowFrame('Frames with large stacksize', by_stacksize)

  by_nlocals = head(arrange(ctx$frames, desc(nlocals)), 10)
  ShowFrame('Frames with many locals', by_nlocals)
}
|
| 91 |
|
# OpKind is the trailing word of an opcode name: FAST for LOAD_FAST, or SLICE
# for STORE_SLICE+1.
OpKind = function(op_name) {
  # Last run of capital letters, followed by an optional one-digit +N suffix.
  kind_pattern = '([A-Z]+)(?:\\+[0-9])?$'

  # str_match() gives a matrix: [,1] is the whole match and [,2] is the first
  # capture group.  Like $0 and $1 in normal regexes.
  matched = str_match(op_name, kind_pattern)
  matched[, 2]
}
|
| 100 |
|
Ops = function(ops, ops_defined = '_tmp/opcodes-defined.txt') {
  # Report on opcode usage: frequency, unique count, unused opcodes, memory
  # operations, imports, and the largest op_arg values.
  #
  # Args:
  #   ops: table with one row per instruction (op_name and op_arg are read).
  #   ops_defined: path to a headerless file listing every defined opcode
  #     name; pass '' to skip the unused-opcode section.
  Banner('OPS')

  op_freq = arrange(count(ops, op_name), desc(n))
  ShowFrame('Ops Used by Frequency', op_freq)

  used = distinct(ops, op_name)
  ShowValue('Total unique opcodes: %d', nrow(used))

  if (ops_defined != '') {
    defined = read.table(ops_defined, header = FALSE)
    colnames(defined) = c('op_name')

    # Defined opcodes that never appear in the ops table.
    unused = setdiff(defined, used)
    ShowFrame('Unused opcodes:', unused)
  }

  # Loads and stores, grouped by the kind of storage they touch.
  is_mem_op = str_detect(op_freq$op_name, 'LOAD|STORE|FAST')
  mem_ops = op_freq %>%
    filter(is_mem_op) %>%
    mutate(kind = OpKind(op_name)) %>%
    arrange(kind) %>%
    select(kind, op_name, n)
  ShowFrame('Memory Operations:', mem_ops)

  # NOTE: got rid of IMPORT_STAR!
  imports = count(filter(ops, str_detect(op_name, 'IMPORT')), op_name)
  ShowFrame('Imports:', imports)

  # These are all the big jump targets!  Max is 3,852, which is a lot less
  # than 65,536.  We don't need EXTENDED_ARG!
  biggest_args = head(arrange(ops, desc(op_arg)), 10)
  ShowFrame('Large op_arg (jump targets):', biggest_args)
}
|
| 135 |
|
Flags = function(flags) {
  # Show how often each flag value occurs, most common first.
  Banner('FLAGS')

  flag_counts = arrange(count(flags, flag), desc(n))
  ShowFrame('Common flags', flag_counts)
}
|
| 142 |
|
Names = function(names) {
  # Show the most common name kinds and names, then the total character
  # length of all names and of the unique names.
  Banner('NAMES')

  # Common types: free, cell, etc.
  kind_counts = head(arrange(count(names, kind), desc(n)), 20)
  ShowFrame('Common types', kind_counts)

  # Common names:
  # self, None, True, False, append, len
  name_counts = head(arrange(count(names, name), desc(n)), 20)
  ShowFrame('Common names', name_counts)

  # Length of every occurrence vs. length of each distinct name.
  every_name = mutate(names, len = nchar(name))
  distinct_names = mutate(count(names, name), len = nchar(name))

  ShowValue('Total length of all %d names: %d',
            nrow(every_name), sum(every_name$len))
  ShowValue('Total length of %d unique names: %d',
            nrow(distinct_names), sum(distinct_names$len))
}
|
| 163 |
|
| 164 | # Hm max unique ops is 58
|
| 165 | # _build/oil/bytecode-opy/core/cmd_exec.pyc 54
|
| 166 | # _build/oil/bytecode-opy/warnings.pyc 55
|
| 167 | # _build/oil/bytecode-opy/_abcoll.pyc 58
|
| 168 | #
|
| 169 | # But there are 119 total opcodes. A lot of the math ones are uncommon.
|
| 170 |
|
| 171 | # Written by opy/metrics.sh. Could get rid of that file.
|
UniqueOpsByFile = function(ops) {
  # Show how many distinct opcodes each file uses, with a closer look at the
  # files whose path matches reader/lex/parse.
  Banner('UNIQUE OPS')

  # This is a row for every path/op_name.
  path_ops = ops %>% group_by(path) %>% distinct(op_name)
  # Distinct opcodes per path, fewest first.
  ops_by_file = path_ops %>% count(path) %>% arrange(n)

  fewest = head(ops_by_file, 20)
  ShowFrame('Files with few ops:', fewest)

  most = tail(ops_by_file, 10)
  ShowFrame('Files with many ops:', most)

  parse_files = filter(ops_by_file, grepl('reader|lex|parse', path))
  # 17, 23, 34, 34, 46
  ShowFrame('Unique ops for files that just parse:', parse_files)

  # Distinct opcodes across all of those files taken together.
  parse_ops = distinct(filter(ops, grepl('reader|lex|parse', path)), op_name)
  ShowValue('Unique opcodes for parsing: %d', nrow(parse_ops))
}
|
| 192 |
|
| 193 | # OPy emits 88 distinct opcodes out of 119. Interesting.
|
| 194 | # CPython emits 94 distinct opcodes.
|
| 195 | # STORE_MAP and SETUP_WITH are the only differences. Is this for dict literals?
|
| 196 | #
|
| 197 | #
|
| 198 | # setdiff(cpy$ops %>% distinct(op_name), opy$ops %>% distinct(op_name))
|
| 199 | # op_name
|
| 200 | # 1 STORE_MAP
|
| 201 | # 2 SETUP_WITH
|
| 202 | # 3 PRINT_ITEM
|
| 203 | # 4 PRINT_NEWLINE
|
| 204 | # 5 PRINT_ITEM_TO
|
| 205 | # 6 PRINT_NEWLINE_TO
|
| 206 |
|
| 207 | # Unused opcodes:
|
| 208 | # op_name
|
| 209 | # 1 BINARY_TRUE_DIVIDE
|
| 210 | # 2 BUILD_SET
|
| 211 | # 3 BUILD_SLICE
|
| 212 | # 4 CONTINUE_LOOP
|
| 213 | # 5 DELETE_ATTR
|
| 214 | # 6 DELETE_GLOBAL
|
| 215 | # 7 DELETE_SLICE+2
|
| 216 | # 8 DELETE_SLICE+3
|
| 217 | # 9 EXTENDED_ARG
|
| 218 | # 10 INPLACE_DIVIDE
|
| 219 | # 11 INPLACE_FLOOR_DIVIDE
|
| 220 | # 12 INPLACE_LSHIFT
|
| 221 | # 13 INPLACE_MODULO
|
| 222 | # 14 INPLACE_OR
|
| 223 | # 15 INPLACE_POWER
|
| 224 | # 16 INPLACE_TRUE_DIVIDE
|
| 225 | # 17 NOP
|
| 226 | # 18 PRINT_EXPR
|
| 227 | # 19 PRINT_ITEM
|
| 228 | # 20 PRINT_ITEM_TO
|
| 229 | # 21 PRINT_NEWLINE
|
| 230 | # 22 PRINT_NEWLINE_TO
|
| 231 | # 23 ROT_FOUR
|
| 232 | # 24 SETUP_WITH
|
| 233 | # 25 SET_ADD
|
| 234 | # 26 STOP_CODE
|
| 235 | # 27 STORE_MAP
|
| 236 | # 28 STORE_SLICE+2
|
| 237 | # 29 STORE_SLICE+3
|
| 238 | # 30 UNARY_CONVERT
|
| 239 | # 31 UNARY_POSITIVE
|
| 240 |
|
| 241 |
|
Report = function(ctx) {
  # Print every section of the metrics report, in a fixed order, from the
  # tables in ctx.
  consts = ctx$consts
  ops = ctx$ops

  # Overall totals and string payload.
  Basic(ctx)
  BigStrings(consts)

  # Per-table breakdowns.
  Frames(ctx)
  Names(ctx$names)
  Consts(consts)
  Flags(ctx$flags)

  # Opcode usage.
  Ops(ops)
  UniqueOpsByFile(ops)
}
|
| 254 |
|
Load = function(in_dir) {
  # Read the five dis-tables files found in in_dir.
  #
  # Returns a list of data frames named frames, names, consts, flags and ops.
  # Each one is read with read.table(), treating the first line as a header.
  table_files = c(
    frames = 'frames.tsv2',
    names = 'names.tsv2',
    consts = 'consts.tsv2',
    flags = 'flags.tsv2',
    ops = 'ops.tsv2'
  )

  # lapply() keeps the element names, so the result is keyed by table name.
  lapply(table_files, function(file_name) {
    read.table(file.path(in_dir, file_name), header = TRUE)
  })
}
|
| 264 |
|
# This takes a table of (py_path, pyc_path) and calls file.info()$size on both.
# Then it computes the ratio of .pyc bytes to .py bytes.
#
# Args:
#   all_deps_py: path to a headerless two-column file of (py_path, pyc_path).
#   pyc_base_dir: directory that each pyc_path is joined onto.
#
# Returns the table with py_bytes, pyc_bytes and ratio columns added, sorted
# by ratio.
FileSizes = function(all_deps_py, pyc_base_dir) {
  py_pyc = read.table(all_deps_py, header = FALSE)
  colnames(py_pyc) = c('py_path', 'pyc_path')

  py_pyc$py_bytes = file.info(py_pyc$py_path)$size
  py_pyc$pyc_bytes = file.info(file.path(pyc_base_dir, py_pyc$pyc_path))$size

  # Skip zero-byte sources so the ratio never divides by zero.
  py_pyc = py_pyc %>%
    filter(py_bytes != 0) %>%
    mutate(ratio = pyc_bytes / py_bytes) %>%
    arrange(ratio)

  Banner('RATIO')

  lowest = head(py_pyc, 10)
  ShowFrame('small .pyc files:', lowest)

  highest = tail(py_pyc, 10)
  ShowFrame('big .pyc files:', highest)

  # This ratio is a little misleading because it counts comments.
  py_total = sum(py_pyc$py_bytes)
  pyc_total = sum(py_pyc$pyc_bytes)

  ShowValue('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
  ShowValue('Ratio: %f', pyc_total / py_total)

  Banner('FULL LISTING')

  # Every .pyc, largest first.
  listing = arrange(select(py_pyc, pyc_bytes, pyc_path), desc(pyc_bytes))
  ShowFrame('bytecode', listing)
  ShowValue('total (again): %d', pyc_total)

  py_pyc
}
|
| 303 |
|
| 304 |
|
CompareCol = function(ctx) {
  # Row counts of the five tables in ctx, in the order frames, names, consts,
  # flags, ops.
  num_frames = nrow(ctx$frames)
  num_names = nrow(ctx$names)
  num_consts = nrow(ctx$consts)
  num_flags = nrow(ctx$flags)
  num_ops = nrow(ctx$ops)

  c(num_frames, num_names, num_consts, num_flags, num_ops)
}
|
| 313 |
|
Compare = function(cpython_ctx, opy_ctx) {
  # Side-by-side report on the tables produced from CPython and OPy bytecode:
  # table sizes, cell variables, closure opcodes, and a few rare opcodes.
  cpython_ops = cpython_ctx$ops
  opy_ops = opy_ctx$ops

  Banner('CPYTHON vs. OPY')

  overview = tibble(
    table_name = c('frames', 'names', 'consts', 'flags', 'ops'),
    cpython = CompareCol(cpython_ctx),
    opy = CompareCol(opy_ctx)
  )
  ShowFrame('Overview', overview)

  Banner('Cell Variables')

  cpython_cells = filter(cpython_ctx$names, kind == 'cell')
  opy_cells = filter(opy_ctx$names, kind == 'cell')

  ShowFrame('CPython', cpython_cells)
  ShowFrame('OPy', opy_cells)

  Banner('CLOSURE bytecodes')

  cpython_closures =
    filter(cpython_ops, op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  opy_closures =
    filter(opy_ops, op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))

  ShowFrame('CPython', cpython_closures)
  ShowFrame('OPy', opy_closures)

  Banner('Rare bytecodes')

  cpython_delete_fast = filter(cpython_ops, op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in CPython', cpython_delete_fast)

  opy_delete_fast = filter(opy_ops, op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in OPy', opy_delete_fast)

  # These are all for the global util.GetResourceLoader().
  opy_store_global = filter(opy_ops, op_name == 'STORE_GLOBAL')
  ShowFrame('STORE_GLOBAL in OPy', opy_store_global)

  # In asdl/unpickle.py.
  opy_store_slice = filter(opy_ops, op_name == 'STORE_SLICE+1')
  ShowFrame('STORE_SLICE+1 in OPy', opy_store_slice)
}
|
| 359 |
|
main = function(argv) {
  # Run the action named by the first command line argument.
  #
  # Args:
  #   argv: command line arguments.  argv[[1]] is the action; the following
  #     positional arguments depend on it:
  #       metrics        IN_DIR
  #       compare        CPYTHON_DIR OPY_DIR
  #       src-bin-ratio  ALL_DEPS_PY PYC_BASE_DIR
  #     Extra trailing arguments are ignored.
  #
  # Logs a message and exits with status 1 when the action is missing or
  # unknown, or when the action is given too few arguments.

  # Fail with a usage message instead of a "subscript out of bounds" error.
  if (length(argv) == 0) {
    Log('%s', 'Usage: bytecode.R ACTION ARGS...')
    quit(status = 1)
  }
  action = argv[[1]]

  # Number of positional arguments each action reads after the action name.
  num_required = c('metrics' = 1L, 'compare' = 2L, 'src-bin-ratio' = 2L)

  if (action %in% names(num_required)) {
    num_given = length(argv) - 1L
    if (num_given < num_required[[action]]) {
      Log("Action '%s' needs %d argument(s), got %d",
          action, num_required[[action]], num_given)
      quit(status = 1)
    }
  }

  if (action == 'metrics') {
    in_dir = argv[[2]]
    ctx = Load(in_dir)
    Report(ctx)

  } else if (action == 'compare') {
    cpython_ctx = Load(argv[[2]])
    opy_ctx = Load(argv[[3]])
    Compare(cpython_ctx, opy_ctx)

  } else if (action == 'src-bin-ratio') {  # This takes different inputs
    all_deps_py = argv[[2]]
    pyc_base_dir = argv[[3]]
    # Assigning the result keeps the returned table from being auto-printed.
    ctx = FileSizes(all_deps_py, pyc_base_dir)

  } else {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }
}
|
| 383 |
|
# Call main() only when there are no active call frames, i.e. when this file
# is run at top level rather than evaluated from inside another call.
if (length(sys.frames()) == 0) {
  # To increase the ggplot font size globally:
  #   theme_set(theme_grey(base_size = 20))
  main(commandArgs(trailingOnly = TRUE))
}
|