| 1 | #!/usr/bin/env Rscript
 | 
| 2 | #
 | 
| 3 | # bytecode.R -- Analyze output of opyc dis-tables.
 | 
| 4 | #
 | 
# Usage:
#   bytecode.R metrics IN_DIR
#   bytecode.R compare CPYTHON_DIR OPY_DIR
#   bytecode.R src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR
 | 
| 7 | 
 | 
| 8 | library(dplyr)
 | 
| 9 | library(tidyr)  # spread()
 | 
| 10 | library(stringr)
 | 
| 11 | 
 | 
| 12 | source('benchmarks/common.R')
 | 
| 13 | 
 | 
# Global options for the report.  FALSE is spelled out because F is an
# ordinary variable that other code (e.g. a sourced file) could reassign.
options(stringsAsFactors = FALSE,
        # Make the report wide.  tibble.width doesn't appear to do this?
        width = 200,
        tibble.print_max = Inf
)
 | 
| 19 | 
 | 
# Show headline totals: number of files, total bytecode bytes, total
# instructions, and the most repeated (path, code_name) pairs.
#
# Args:
#   ctx: list holding at least the `frames` and `ops` data frames.
Basic <- function(ctx) {
  Banner('BASIC METRICS')

  # One row per distinct path, so the row count is the number of files.
  files <- ctx$frames %>% count(path)
  ShowValue('Number of files: %d', nrow(files))

  # 216K
  total_bytecode <- sum(ctx$frames$bytecode_bytes)
  ShowValue('Total bytecode bytes: %d', total_bytecode)

  # Each row of ops is counted as one instruction.
  instruction_count <- nrow(ctx$ops)
  ShowValue('Total instructions: %d', instruction_count)

  # Hm this isn't reliable because the code name isn't unique!  I think we need
  # firstlineno
  dupes <- ctx$frames %>%
    count(path, code_name) %>%
    arrange(desc(n)) %>%
    head()
  ShowFrame('Duplicate path/name', dupes)
}
 | 
| 39 | 
 | 
# Print the 20 largest string constants, show the total string payload, and
# plot the cumulative size of strings ordered from largest to smallest.
#
# Args:
#   consts: data frame with at least `type` and `len_or_val` columns.
BigStrings <- function(consts) {
  Banner('BIG STRINGS')

  # String constants only, largest first.
  str_consts <- consts %>%
    filter(type == 'str') %>%
    arrange(desc(len_or_val))
  print(head(str_consts, 20))

  # 184 KB of strings!  That's just the payload; the header is probably more.
  payload_bytes <- sum(str_consts$len_or_val)
  ShowValue('total string bytes: %d', payload_bytes)

  # This plot says:
  #
  # total bytes is 184 KB
  # - the top 10 strings sum to 20K bytes
  # - the top 100 strings sum to 30K bytes

  # NOTE: the variable name is kept as `cum` because plot() derives its axis
  # label from the expression it is given.
  cum <- cumsum(str_consts$len_or_val)
  plot(cum)

  #plot(ecdf(strs$len_or_val))
}
 | 
| 61 | 
 | 
# Show how many constants there are of each type, most common first (top 20).
#
# Args:
#   consts: data frame with at least a `type` column.
Consts <- function(consts) {
  Banner('CONSTS')

  # count of types of constants.  Strings dominate of course.
  # But there are only 7 or so immutable types!

  # - only 2 float constants.
  # - get rid of the unicode constants in posixpath.

  type_counts <- consts %>% count(type)
  top_types <- type_counts %>% arrange(desc(n)) %>% head(20)
  ShowFrame('Types of constants', top_types)
}
 | 
| 74 | 
 | 
# Frames by number of consts, number of ops, etc.
#
# Args:
#   ctx: list holding the `consts`, `ops`, and `frames` data frames.
Frames <- function(ctx) {
  Banner('FRAMES')

  # sort = TRUE orders by descending count.  TRUE is spelled out because T is
  # an ordinary variable that can be reassigned.
  many_consts <- ctx$consts %>%
    count(path, code_name, sort = TRUE) %>%
    head(20)
  ShowFrame('Frames with many consts', many_consts)

  many_ops <- ctx$ops %>%
    count(path, code_name, sort = TRUE) %>%
    head(20)
  ShowFrame('Frames with many ops', many_ops)

  big_stack <- ctx$frames %>% arrange(desc(stacksize)) %>% head(10)
  ShowFrame('Frames with large stacksize', big_stack)

  many_locals <- ctx$frames %>% arrange(desc(nlocals)) %>% head(10)
  ShowFrame('Frames with many locals', many_locals)
}
 | 
| 91 | 
 | 
# OpKind is FAST for LOAD_FAST, or SLICE for STORE_SLICE+1
#
# str_match() returns a matrix: column 1 is the whole match and column 2 is
# the first capture group.  Like $0 and $1 in normal regexes.
OpKind <- function(op_name) {
  # Last run of capital letters, followed by an optional +N suffix (one digit).
  matched <- str_match(op_name, '([A-Z]+)(?:\\+[0-9])?$')
  matched[, 2]
}
 | 
| 100 | 
 | 
# Report on opcode usage: frequency of each opcode, defined-but-unused
# opcodes, load/store opcodes grouped by kind, import opcodes, and the rows
# with the largest op_arg.
#
# Args:
#   ops: data frame with at least `op_name` and `op_arg` columns.
#   ops_defined: path to a headerless table of defined opcode names, read into
#     a single `op_name` column.  Pass '' to skip the unused-opcode report.
Ops <- function(ops, ops_defined = '_tmp/opcodes-defined.txt') {
  Banner('OPS')

  op_freq <- ops %>% count(op_name) %>% arrange(desc(n))

  ShowFrame('Ops Used by Frequency', op_freq)

  used <- ops %>% distinct(op_name)
  ShowValue('Total unique opcodes: %d', nrow(used))

  if (ops_defined != '') {
    # FALSE is spelled out because F is an ordinary, reassignable variable.
    defined <- read.table(ops_defined, header = FALSE)
    colnames(defined) <- c('op_name')

    # Rows of `defined` that never appear in the ops actually used.
    unused <- setdiff(defined, used)
    ShowFrame('Unused opcodes:', unused)
  }

  mem_ops <- op_freq %>%
    filter(str_detect(op_name, 'LOAD|STORE|FAST')) %>%
    mutate(kind = OpKind(op_name)) %>%
    arrange(kind) %>%
    select(kind, op_name, n)
  ShowFrame('Memory Operations:', mem_ops)

  # NOTE: got rid of IMPORT_STAR!
  imports <- ops %>%
    filter(str_detect(op_name, 'IMPORT')) %>%
    count(op_name)
  ShowFrame('Imports:', imports)

  # These are all the big jump targets!  Max is 3,852, which is a lot less than
  # 65,536.  We don't need EXTENDED_ARG!
  largest_args <- ops %>% arrange(desc(op_arg)) %>% head(10)
  ShowFrame('Large op_arg (jump targets):', largest_args)
}
 | 
| 135 | 
 | 
# Show how often each flag value occurs, most common first.
#
# Args:
#   flags: data frame with at least a `flag` column.
Flags <- function(flags) {
  Banner('FLAGS')

  flag_counts <- flags %>% count(flag)
  by_frequency <- flag_counts %>% arrange(desc(n))
  ShowFrame('Common flags', by_frequency)
}
 | 
| 142 | 
 | 
# Report on the names table: most common kinds, most common names, and the
# total character length of all names versus distinct names.
#
# Args:
#   names: data frame with at least `kind` and `name` columns.
Names <- function(names) {
  Banner('NAMES')

  # Common types: free, cell, etc.
  top_kinds <- names %>% count(kind) %>% arrange(desc(n)) %>% head(20)
  ShowFrame('Common types', top_kinds)

  # Common names:
  # self, None, True, False, append, len
  top_names <- names %>% count(name) %>% arrange(desc(n)) %>% head(20)
  ShowFrame('Common names', top_names)

  # Every occurrence, with its length in characters.
  every_name <- names %>% mutate(len = nchar(name))
  # count() collapses to one row per distinct name.
  distinct_names <- names %>% count(name) %>% mutate(len = nchar(name))

  ShowValue('Total length of all %d names: %d',
            nrow(every_name), sum(every_name$len))
  ShowValue('Total length of %d unique names: %d',
            nrow(distinct_names), sum(distinct_names$len))
}
 | 
| 163 | 
 | 
| 164 | # Hm max unique ops is 58
 | 
| 165 | # _build/oil/bytecode-opy/core/cmd_exec.pyc     54
 | 
| 166 | # _build/oil/bytecode-opy/warnings.pyc          55
 | 
| 167 | # _build/oil/bytecode-opy/_abcoll.pyc           58
 | 
| 168 | #
 | 
| 169 | # But there are 119 total opcodes.  A lot of the math ones are uncommon.
 | 
| 170 | 
 | 
| 171 | # Written by opy/metrics.sh.  Could get rid of that file.
 | 
# Report how many distinct opcodes each file uses: the files with the fewest,
# the files with the most, and the files whose path matches
# 'reader|lex|parse'.
#
# Args:
#   ops: data frame with at least `path` and `op_name` columns.
UniqueOpsByFile <- function(ops) {
  Banner('UNIQUE OPS')

  # This is a row for every path/op_name
  per_file_ops <- ops %>% group_by(path) %>% distinct(op_name)
  # Ascending, so head() is the fewest ops and tail() is the most.
  ops_by_file <- per_file_ops %>% count(path) %>% arrange(n)

  fewest <- ops_by_file %>% head(20)
  ShowFrame('Files with few ops:', fewest)

  most <- ops_by_file %>% tail(10)
  ShowFrame('Files with many ops:', most)

  parser_files <- ops_by_file %>% filter(grepl('reader|lex|parse', path))
  # 17, 23, 34, 34, 46
  ShowFrame('Unique ops for files that just parse:', parser_files)

  # Distinct opcodes across all of those files combined.
  parser_ops <- ops %>%
    filter(grepl('reader|lex|parse', path)) %>%
    distinct(op_name)
  ShowValue('Unique opcodes for parsing: %d', nrow(parser_ops))
}
 | 
| 192 | 
 | 
| 193 | # OPy emits 88 distinct opcodes out of 119.  Interesting.
 | 
| 194 | # CPython emits 94 distinct opcodes.
 | 
| 195 | # STORE_MAP and SETUP_WITH are the only differences.  Is this for dict literals?
 | 
| 196 | #
 | 
| 197 | #
 | 
| 198 | # setdiff(cpy$ops %>% distinct(op_name), opy$ops %>% distinct(op_name))
 | 
| 199 | #            op_name
 | 
| 200 | # 1        STORE_MAP
 | 
| 201 | # 2       SETUP_WITH
 | 
| 202 | # 3       PRINT_ITEM
 | 
| 203 | # 4    PRINT_NEWLINE
 | 
| 204 | # 5    PRINT_ITEM_TO
 | 
| 205 | # 6 PRINT_NEWLINE_TO
 | 
| 206 | 
 | 
| 207 | # Unused opcodes:
 | 
| 208 | #                 op_name
 | 
| 209 | # 1    BINARY_TRUE_DIVIDE
 | 
| 210 | # 2             BUILD_SET
 | 
| 211 | # 3           BUILD_SLICE
 | 
| 212 | # 4         CONTINUE_LOOP
 | 
| 213 | # 5           DELETE_ATTR
 | 
| 214 | # 6         DELETE_GLOBAL
 | 
| 215 | # 7        DELETE_SLICE+2
 | 
| 216 | # 8        DELETE_SLICE+3
 | 
| 217 | # 9          EXTENDED_ARG
 | 
| 218 | # 10       INPLACE_DIVIDE
 | 
| 219 | # 11 INPLACE_FLOOR_DIVIDE
 | 
| 220 | # 12       INPLACE_LSHIFT
 | 
| 221 | # 13       INPLACE_MODULO
 | 
| 222 | # 14           INPLACE_OR
 | 
| 223 | # 15        INPLACE_POWER
 | 
| 224 | # 16  INPLACE_TRUE_DIVIDE
 | 
| 225 | # 17                  NOP
 | 
| 226 | # 18           PRINT_EXPR
 | 
| 227 | # 19           PRINT_ITEM
 | 
| 228 | # 20        PRINT_ITEM_TO
 | 
| 229 | # 21        PRINT_NEWLINE
 | 
| 230 | # 22     PRINT_NEWLINE_TO
 | 
| 231 | # 23             ROT_FOUR
 | 
| 232 | # 24           SETUP_WITH
 | 
| 233 | # 25              SET_ADD
 | 
| 234 | # 26            STOP_CODE
 | 
| 235 | # 27            STORE_MAP
 | 
| 236 | # 28        STORE_SLICE+2
 | 
| 237 | # 29        STORE_SLICE+3
 | 
| 238 | # 30        UNARY_CONVERT
 | 
| 239 | # 31       UNARY_POSITIVE
 | 
| 240 | 
 | 
| 241 | 
 | 
# Run every report section, in a fixed order, over one set of loaded tables.
#
# Args:
#   ctx: list holding the `frames`, `names`, `consts`, `flags`, and `ops`
#     data frames.
Report <- function(ctx) {
  consts <- ctx$consts
  ops <- ctx$ops

  Basic(ctx)
  BigStrings(consts)

  Frames(ctx)
  Names(ctx$names)
  Consts(consts)
  Flags(ctx$flags)

  Ops(ops)
  UniqueOpsByFile(ops)
}
 | 
| 254 | 
 | 
# Read the five tables from in_dir into a named list.
#
# Args:
#   in_dir: directory containing frames.tsv2, names.tsv2, consts.tsv2,
#     flags.tsv2, and ops.tsv2, each with a header row.
#
# Returns:
#   A list of data frames named frames, names, consts, flags, ops (in that
#   order).
Load <- function(in_dir) {
  tables <- c('frames', 'names', 'consts', 'flags', 'ops')

  ctx <- lapply(tables, function(table_name) {
    tsv_path <- file.path(in_dir, paste0(table_name, '.tsv2'))
    # TRUE is spelled out because T is an ordinary, reassignable variable.
    read.table(tsv_path, header = TRUE)
  })
  setNames(ctx, tables)
}
 | 
| 264 | 
 | 
# This takes a table of (py_path, pyc_path) and calls file.info()$size on both.
# Then it computes the ratio of .pyc bytes to .py bytes and reports on it.
#
# Args:
#   all_deps_py: path to a headerless two-column table: py_path, pyc_path.
#   pyc_base_dir: directory that each pyc_path is joined onto.
#
# Returns:
#   The table with py_bytes, pyc_bytes, and ratio columns added, restricted to
#   rows with nonzero py_bytes and sorted by ascending ratio.
FileSizes <- function(all_deps_py, pyc_base_dir) {
  # FALSE is spelled out because F is an ordinary, reassignable variable.
  py_pyc <- read.table(all_deps_py, header = FALSE)
  colnames(py_pyc) <- c('py_path', 'pyc_path')

  py_pyc$py_bytes <- file.info(py_pyc$py_path)$size

  pyc_paths <- file.path(pyc_base_dir, py_pyc$pyc_path)
  py_pyc$pyc_bytes <- file.info(pyc_paths)$size

  # Drop empty .py files so the ratio never divides by zero.
  py_pyc <- py_pyc %>%
    filter(py_bytes != 0) %>%
    mutate(ratio = pyc_bytes / py_bytes) %>%
    arrange(ratio)

  Banner('RATIO')

  small <- py_pyc %>% head(10)
  ShowFrame('small .pyc files:', small)

  big <- py_pyc %>% tail(10)
  ShowFrame('big .pyc files:', big)

  # This ratio is a little misleading because it counts comments.
  py_total <- sum(py_pyc$py_bytes)
  pyc_total <- sum(py_pyc$pyc_bytes)

  ShowValue('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
  ShowValue('Ratio: %f', pyc_total / py_total)

  Banner('FULL LISTING')

  listing <- py_pyc %>%
    select(c(pyc_bytes, pyc_path)) %>%
    arrange(desc(pyc_bytes))
  ShowFrame('bytecode', listing)
  ShowValue('total (again): %d', pyc_total)

  py_pyc
}
 | 
| 303 | 
 | 
| 304 | 
 | 
# Row counts of the five tables in ctx, in the order
# frames, names, consts, flags, ops.
#
# Args:
#   ctx: list holding the five data frames.
#
# Returns:
#   An unnamed vector of row counts.
CompareCol <- function(ctx) {
  tables <- list(ctx$frames, ctx$names, ctx$consts, ctx$flags, ctx$ops)
  unlist(lapply(tables, nrow))
}
 | 
| 313 | 
 | 
# Side-by-side report of the CPython tables and the OPy tables: table sizes,
# cell variables, closure opcodes, and a few rarely used opcodes.
#
# Args:
#   cpython_ctx: list of data frames for the CPython tables.
#   opy_ctx: list of data frames for the OPy tables.
Compare <- function(cpython_ctx, opy_ctx) {
  Banner('CPYTHON vs. OPY')

  overview <- tibble(
    table_name = c('frames', 'names', 'consts', 'flags', 'ops'),
    cpython = CompareCol(cpython_ctx),
    opy = CompareCol(opy_ctx)
  )

  ShowFrame('Overview', overview)

  Banner('Cell Variables')

  cpython_cells <- cpython_ctx$names %>% filter(kind == 'cell')
  opy_cells <- opy_ctx$names %>% filter(kind == 'cell')

  ShowFrame('CPython', cpython_cells)
  ShowFrame('OPy', opy_cells)

  Banner('CLOSURE bytecodes')

  cpython_closures <- cpython_ctx$ops %>%
    filter(op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  opy_closures <- opy_ctx$ops %>%
    filter(op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))

  ShowFrame('CPython', cpython_closures)
  ShowFrame('OPy', opy_closures)

  Banner('Rare bytecodes')

  cpython_delete_fast <- cpython_ctx$ops %>% filter(op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in CPython', cpython_delete_fast)

  opy_delete_fast <- opy_ctx$ops %>% filter(op_name == 'DELETE_FAST')
  ShowFrame('DELETE_FAST in OPy', opy_delete_fast)

  # These are all for the global util.GetResourceLoader().
  opy_store_global <- opy_ctx$ops %>% filter(op_name == 'STORE_GLOBAL')
  ShowFrame('STORE_GLOBAL in OPy', opy_store_global)

  # In asdl/unpickle.py.
  opy_store_slice <- opy_ctx$ops %>% filter(op_name == 'STORE_SLICE+1')
  ShowFrame('STORE_SLICE+1 in OPy', opy_store_slice)
}
 | 
| 359 | 
 | 
# Dispatch on the action named by the first command line argument.
#
# Args:
#   argv: character vector of arguments:
#     metrics IN_DIR
#     compare CPYTHON_DIR OPY_DIR
#     src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR
#
# A missing action, an unknown action, or too few arguments logs a message and
# exits with status 1.
main <- function(argv) {
  # Total number of arguments (including the action) each action needs.
  num_args <- c('metrics' = 2L, 'compare' = 3L, 'src-bin-ratio' = 3L)

  # Without this guard argv[[1]] fails with "subscript out of bounds".
  if (length(argv) == 0) {
    Log('Usage: bytecode.R ACTION ARGS...')
    quit(status = 1)
  }

  action <- argv[[1]]

  if (!(action %in% names(num_args))) {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }

  if (length(argv) < num_args[[action]]) {
    Log("Action '%s' expects %d arguments", action, num_args[[action]] - 1L)
    quit(status = 1)
  }

  if (action == 'metrics') {
    in_dir <- argv[[2]]
    ctx <- Load(in_dir)
    Report(ctx)

  } else if (action == 'compare') {
    cpython_ctx <- Load(argv[[2]])
    opy_ctx <- Load(argv[[3]])
    Compare(cpython_ctx, opy_ctx)

  } else {  # 'src-bin-ratio'.  This takes different inputs
    all_deps_py <- argv[[2]]
    pyc_base_dir <- argv[[3]]
    # invisible() keeps the returned table from being auto-printed when main()
    # is called at top level.
    invisible(FileSizes(all_deps_py, pyc_base_dir))
  }
}
 | 
| 383 | 
 | 
# Run main() only when no function call is on the stack (sys.frames() is
# empty).  NOTE(review): presumably this distinguishes running the file as a
# script from loading it inside another function -- confirm against callers.
if (length(sys.frames()) < 1) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))
  main(commandArgs(trailingOnly = TRUE))
}
 |