| 1 | from __future__ import print_function
 | 
| 2 | """
 | 
| 3 | callgraph.py
 | 
| 4 | """
 | 
| 5 | 
 | 
| 6 | import collections
 | 
| 7 | import os
 | 
| 8 | import sys
 | 
| 9 | 
 | 
| 10 | import __builtin__  # For looking up names
 | 
| 11 | import types
 | 
| 12 | 
 | 
| 13 | from .lib import dis
 | 
| 14 | from .lib import inspect
 | 
| 15 | 
 | 
| 16 | from .util import log
 | 
| 17 | 
 | 
| 18 | 
 | 
| 19 | def Disassemble(co):
 | 
| 20 |   """Given a code object, yield instructions of interest.
 | 
| 21 | 
 | 
| 22 |   Args:
 | 
| 23 |     co: __code__ attribute
 | 
| 24 | 
 | 
| 25 |   Structure copied from opy/compiler2/dis_tool.py, which was copied from
 | 
| 26 |   dis.disassemble().
 | 
| 27 |   """
 | 
| 28 |   code = co.co_code
 | 
| 29 |   extended_arg = 0  # Not used
 | 
| 30 | 
 | 
| 31 |   i = 0
 | 
| 32 |   n = len(code)
 | 
| 33 |   #log('\tLENGTH OF CODE %s: %d', co.co_name, n)
 | 
| 34 | 
 | 
| 35 |   while i < n:
 | 
| 36 |     #log('\ti = %d, n = %d', i, n)
 | 
| 37 |     op = ord(code[i])
 | 
| 38 |     i += 1
 | 
| 39 | 
 | 
| 40 |     op_name = dis.opname[op]
 | 
| 41 | 
 | 
| 42 |     # operation is 1 byte, argument is 2 bytes.
 | 
| 43 |     if op >= dis.HAVE_ARGUMENT:
 | 
| 44 |       oparg = ord(code[i]) + ord(code[i+1])*256 + extended_arg
 | 
| 45 |       extended_arg = 0
 | 
| 46 | 
 | 
| 47 |       if op == dis.EXTENDED_ARG:
 | 
| 48 |         # Hm not used?
 | 
| 49 |         raise AssertionError
 | 
| 50 |         extended_arg = oparg*65536L
 | 
| 51 | 
 | 
| 52 |       i += 2
 | 
| 53 | 
 | 
| 54 |       const_name = None
 | 
| 55 |       var_name = None
 | 
| 56 | 
 | 
| 57 |       if op in dis.hasconst:
 | 
| 58 |         const_name = co.co_consts[oparg]
 | 
| 59 | 
 | 
| 60 |       elif op in dis.hasname:
 | 
| 61 |         try:
 | 
| 62 |           var_name = co.co_names[oparg]
 | 
| 63 |         except IndexError:
 | 
| 64 |           log('Error: %r cannot index %s with %d', op_name, co.co_names,
 | 
| 65 |               oparg)
 | 
| 66 |           raise
 | 
| 67 | 
 | 
| 68 |       elif op in dis.hasjrel:
 | 
| 69 |         #raise AssertionError(op_name)
 | 
| 70 |         pass
 | 
| 71 | 
 | 
| 72 |       elif op in dis.haslocal:
 | 
| 73 |         #raise AssertionError(op_name)
 | 
| 74 |         pass
 | 
| 75 | 
 | 
| 76 |       elif op in dis.hascompare:
 | 
| 77 |         #raise AssertionError(op_name)
 | 
| 78 |         pass
 | 
| 79 | 
 | 
| 80 |       elif op in dis.hasfree:
 | 
| 81 |         #raise AssertionError(op_name)
 | 
| 82 |         pass
 | 
| 83 | 
 | 
| 84 |     yield op_name, const_name, var_name
 | 
| 85 |     #log('\t==> i = %d, n = %d', i, n)
 | 
| 86 | 
 | 
| 87 | 
 | 
| 88 | import sre_compile
 | 
| 89 | 
 | 
| 90 | def _GetAttr(module, name):
 | 
| 91 |   # Hack for bug in _fixup_range() !  (No longer in Python 3.6 head.)
 | 
| 92 |   if module is sre_compile and name == 'l':
 | 
| 93 |     return None
 | 
| 94 |   # traceback.py has a hasattr() test
 | 
| 95 |   if module is sys and name == 'tracebacklimit':
 | 
| 96 |     return None
 | 
| 97 | 
 | 
| 98 |   try:
 | 
| 99 |     val = getattr(module, name)
 | 
| 100 |   except AttributeError:
 | 
| 101 |     #log('%r not on %r', name, module)
 | 
| 102 |     # This could raise too
 | 
| 103 |     val = getattr(__builtin__, name)
 | 
| 104 |   return val
 | 
| 105 | 
 | 
| 106 | 
 | 
| 107 | def _Walk(obj, cls, ref, syms):
 | 
| 108 |   """
 | 
| 109 |   Discover statically what (globally-accessible) functions and classes are
 | 
| 110 |   used.
 | 
| 111 | 
 | 
| 112 |   Args:
 | 
| 113 |     obj: a callable object
 | 
| 114 |     cls: an optional class that it was attached to, could be None
 | 
| 115 |     ref: the way the object was referenced from another place in the code
 | 
| 116 |     syms: output Symbols()
 | 
| 117 | 
 | 
| 118 |   Something like this is OK:
 | 
| 119 |   # def Adder(x):
 | 
| 120 |   #   def f(y):
 | 
| 121 |   #     return x + y
 | 
| 122 |   #   return f
 | 
| 123 | 
 | 
| 124 |   Because we'll still have access to the inner code object.  We probably won't
 | 
| 125 |   compile it though.
 | 
| 126 |   """
 | 
| 127 |   if syms.Seen(obj):
 | 
| 128 |     return
 | 
| 129 | 
 | 
| 130 |   module_name = getattr(obj, '__module__', None)
 | 
| 131 |   # NOTE: We get duplicate objects like this, due to bound methods on different
 | 
| 132 |   # objects.  Not sure how to fix it.
 | 
| 133 |   #
 | 
| 134 |   # OBJ <function Parse at 0x7fcd270b4de8>
 | 
| 135 |   # OBJ <function Parse at 0x7fcd2709e500>
 | 
| 136 |   # OBJ <built-in method get of dict object at 0x7fcd28c53280>
 | 
| 137 |   # OBJ <built-in method get of dict object at 0x7fcd28c53398>
 | 
| 138 | 
 | 
| 139 |   #if obj.__name__ in ('write', 'get', 'Parse'):
 | 
| 140 |   #  log('OBJ %s %d', obj, id(obj))
 | 
| 141 |   #  pass
 | 
| 142 | 
 | 
| 143 |   # Oh is the namedtuple_ crap because of the Block class byterun/pyobj?
 | 
| 144 | 
 | 
| 145 |   if module_name is None or module_name in (
 | 
| 146 |       'namedtuple_Arguments', 'namedtuple_ArgSpec',
 | 
| 147 |       'namedtuple_Block'):
 | 
| 148 |     syms.Add(obj, None, ref, None, None, None)
 | 
| 149 |     return  # Can't walk anything
 | 
| 150 | 
 | 
| 151 |   #log('OBJ %s %d', obj, id(obj))
 | 
| 152 |   module = sys.modules[obj.__module__]
 | 
| 153 | 
 | 
| 154 |   co = getattr(obj, '__code__', None)
 | 
| 155 |   # For example, Builtins don't have bytecode.
 | 
| 156 |   if isinstance(co, types.CodeType):
 | 
| 157 |     co = obj.__code__
 | 
| 158 |     #log('CO %s', co)
 | 
| 159 |     #path = co.co_filename
 | 
| 160 | 
 | 
| 161 |     mod_name = None
 | 
| 162 |     mod_path = co.co_filename
 | 
| 163 | 
 | 
| 164 |     syms.Add(obj, cls, ref, mod_name, mod_path, co.co_firstlineno)
 | 
| 165 |   else:
 | 
| 166 |     mod_name = module.__name__
 | 
| 167 |     try:
 | 
| 168 |       mod_path = module.__file__
 | 
| 169 |       if mod_path.endswith('.pyc'):
 | 
| 170 |         mod_path = mod_path[:-1]
 | 
| 171 |     except AttributeError:
 | 
| 172 |       mod_path = None
 | 
| 173 | 
 | 
| 174 |     #if obj.__name__ == 'lex_mode_e':
 | 
| 175 |     #  log('!!! %s name %s path %s', obj, mod.__name__, mod.__file__)
 | 
| 176 | 
 | 
| 177 |     #mod_name = None
 | 
| 178 |     #mod_path = None
 | 
| 179 | 
 | 
| 180 |     syms.Add(obj, cls, ref, mod_name, mod_path, None)
 | 
| 181 |     return
 | 
| 182 | 
 | 
| 183 |   #log('\tNAME %s', val.__code__.co_name)
 | 
| 184 |   #log('\tNAMES %s', val.__code__.co_names)
 | 
| 185 | 
 | 
| 186 |   # Most functions and classes we call are globals!
 | 
| 187 |   #log('\t_Walk %s %s', obj, module)
 | 
| 188 |   #log('\t%s', sorted(dir(module)))
 | 
| 189 | 
 | 
| 190 |   # Have to account for foo.Bar(), which gives this sequence:
 | 
| 191 |   # 2           0 LOAD_GLOBAL              0 (foo)
 | 
| 192 |   #             3 LOAD_ATTR                1 (Bar)
 | 
| 193 |   #             6 CALL_FUNCTION            0
 | 
| 194 |   #
 | 
| 195 |   # Also: os.path.join().
 | 
| 196 | 
 | 
| 197 |   try:
 | 
| 198 |     last_val = None  # value from previous LOAD_GLOBAL or LOAD_ATTR
 | 
| 199 |     ref = []
 | 
| 200 |     g = Disassemble(obj.__code__)
 | 
| 201 | 
 | 
| 202 |     while True:
 | 
| 203 |       op, const, var = g.next()
 | 
| 204 | 
 | 
| 205 |       if op == 'LOAD_GLOBAL':
 | 
| 206 |         val = _GetAttr(module, var)
 | 
| 207 |         ref = [var]  # reset it
 | 
| 208 | 
 | 
| 209 |       elif op == 'LOAD_ATTR':
 | 
| 210 |         #if last_val is not None and isinstance(last_val, types.ModuleType):
 | 
| 211 |         if last_val is not None:
 | 
| 212 |           #log('%s %s', op, var)
 | 
| 213 |           val = _GetAttr(last_val, var)
 | 
| 214 |           ref.append(var)
 | 
| 215 | 
 | 
| 216 |           # Crawl the methods below.  Otherwise we get duplicate bound/unbound
 | 
| 217 |           # methods, which have unique addresses.
 | 
| 218 |           # Examples: WAIT_SPEC.Parse, sys.stdout.write
 | 
| 219 | 
 | 
| 220 |           # BUG: os.fork and sys.stdout.write are the same?
 | 
| 221 |           # I thought os.fork is types.BuiltinFunctionType, and
 | 
| 222 |           # sys.stdout.write is types.BuiltinMethodType, but why not?
 | 
| 223 | 
 | 
| 224 |           if isinstance(val, (types.MethodType, types.BuiltinMethodType)):
 | 
| 225 |             val = None
 | 
| 226 |             ref = []
 | 
| 227 | 
 | 
| 228 |         else:
 | 
| 229 |           val = None
 | 
| 230 |           ref = []
 | 
| 231 | 
 | 
| 232 |       else:  # Some other opcode
 | 
| 233 |         val = None
 | 
| 234 |         ref = []
 | 
| 235 | 
 | 
| 236 |       if callable(val):
 | 
| 237 |         #log('VAL %s module %s', val, val.__module__)
 | 
| 238 |         # Recursive call.
 | 
| 239 | 
 | 
| 240 |         # Check for old style:
 | 
| 241 |         #if isinstance(val, types.ClassType):
 | 
| 242 |         #  print('OLD %s' % val)
 | 
| 243 |         _Walk(val, None, ref, syms)
 | 
| 244 | 
 | 
| 245 |       # If the value is a class, walk its methods.  Note that we assume ALL
 | 
| 246 |       # methods are used.  It's possible to narrow this down a bit and detect
 | 
| 247 |       # unused methods.
 | 
| 248 |       if isinstance(val, type):
 | 
| 249 |         cls = val
 | 
| 250 |         #log('type %s', val)
 | 
| 251 |         for name in dir(val):
 | 
| 252 |           # prevent error with __abstractmethods__ attribute
 | 
| 253 |           #if name.startswith('__'):
 | 
| 254 |           if name == '__abstractmethods__':
 | 
| 255 |             continue
 | 
| 256 |           field_val = getattr(val, name)
 | 
| 257 |           #log('field_val %s', field_val)
 | 
| 258 |           if isinstance(field_val, types.MethodType):
 | 
| 259 |             func_obj = field_val.im_func
 | 
| 260 |             _Walk(func_obj, cls, ref, syms)
 | 
| 261 | 
 | 
| 262 |       last_val = val  # Used on next iteration
 | 
| 263 | 
 | 
| 264 |   except StopIteration:
 | 
| 265 |     pass
 | 
| 266 | 
 | 
| 267 |   #log('\tDone _Walk %s %s', obj, module)
 | 
| 268 | 
 | 
| 269 | 
 | 
| 270 | def PrintSig(fmt, func):
 | 
| 271 |   if os.getenv('CALLGRAPH_SIG') != '1':
 | 
| 272 |     return
 | 
| 273 |   try:
 | 
| 274 |     argspec = inspect.getargspec(func)
 | 
| 275 |   except TypeError:
 | 
| 276 |     return
 | 
| 277 |   parts = [':%s' % ' '.join(argspec.args)]
 | 
| 278 |   # These are keyword-only args?
 | 
| 279 |   if argspec.varargs:
 | 
| 280 |     parts.append('varargs=%s' % (argspec.varargs,))
 | 
| 281 |   if argspec.keywords:
 | 
| 282 |     parts.append('kw=%s' % (argspec.keywords,))
 | 
| 283 |   # Hm the default of 'None' is bad -- you can't distinguish a default of None,
 | 
| 284 |   # which is very common!
 | 
| 285 |   # It's better to get this by parsing the AST.
 | 
| 286 |   if argspec.defaults and any(d is not None for d in argspec.defaults):
 | 
| 287 |     parts.append('defaults=%s' % (argspec.defaults,))
 | 
| 288 | 
 | 
| 289 |   print(fmt % ' '.join(parts))
 | 
| 290 | 
 | 
| 291 | 
 | 
| 292 | class Class(object):
 | 
| 293 | 
 | 
| 294 |   def __init__(self, cls, mod_name, mod_path):
 | 
| 295 |     self.cls = cls
 | 
| 296 |     self.mod_name = mod_name
 | 
| 297 |     self.mod_path = mod_path
 | 
| 298 |     self.methods = []
 | 
| 299 | 
 | 
| 300 |   def Name(self):
 | 
| 301 |     return self.cls.__name__
 | 
| 302 | 
 | 
| 303 |   def AddMethod(self, m, mod_name, mod_path, line_num):
 | 
| 304 |     # Just assume the method is in the same file as the class itself.
 | 
| 305 |     self.methods.append((m, mod_name, mod_path, line_num))
 | 
| 306 | 
 | 
| 307 |   def Print(self):
 | 
| 308 |     base_names = ' '.join(c.__name__ for c in self.cls.__bases__)
 | 
| 309 |     print('  %s(%s)' % (self.cls.__name__, base_names))
 | 
| 310 | 
 | 
| 311 |     methods = [(m.__name__, m) for (m, _, _, _) in self.methods]
 | 
| 312 |     methods.sort()
 | 
| 313 |     for name, m in methods:
 | 
| 314 |       print('    %s' % name)
 | 
| 315 |       PrintSig('      %s', m)
 | 
| 316 | 
 | 
| 317 |   def __repr__(self):
 | 
| 318 |     return '<Class %s %s %s %s>' % (
 | 
| 319 |         self.cls, self.mod_name, self.mod_path, self.methods)
 | 
| 320 | 
 | 
| 321 | 
 | 
| 322 | class Symbols(object):
 | 
| 323 |   """A sink for discovered symbols."""
 | 
| 324 | 
 | 
| 325 |   def __init__(self):
 | 
| 326 |     self.seen = set()
 | 
| 327 | 
 | 
| 328 |     self.classes = {}  # integer id() -> Class
 | 
| 329 |     self.functions = []  # list of callables
 | 
| 330 | 
 | 
| 331 |     self.paths = {}  # path -> list of functions
 | 
| 332 | 
 | 
| 333 |   def Seen(self, c):
 | 
| 334 |     """c: a callable."""
 | 
| 335 |     id_ = id(c)
 | 
| 336 |     if id_ in self.seen:
 | 
| 337 |       return True
 | 
| 338 |     self.seen.add(id_)
 | 
| 339 |     return False
 | 
| 340 | 
 | 
| 341 |   def Add(self, obj, cls, ref, mod_name, mod_path, line_num):
 | 
| 342 |     """Could be a function, class Constructor, or method.
 | 
| 343 |     Can also be native (C) or interpreted (Python with __code__ attribute.)
 | 
| 344 | 
 | 
| 345 |     Returns:
 | 
| 346 |       True if we haven't yet seen it.
 | 
| 347 |     """
 | 
| 348 |     if mod_path is not None:
 | 
| 349 |       mod_path = os.path.normpath(mod_path)
 | 
| 350 | 
 | 
| 351 |     if isinstance(obj, type):
 | 
| 352 |       id_ = id(obj)
 | 
| 353 | 
 | 
| 354 |       # NOTE: Python's classes don't have a __code__ object, which appears to
 | 
| 355 |       # be an irregularity.  So we have to get location information from the
 | 
| 356 |       # METHODS.
 | 
| 357 | 
 | 
| 358 |       # Exception: this type has a __code__ object that is not types.CodeType
 | 
| 359 |       if obj is not types.FunctionType:
 | 
| 360 |         assert not hasattr(obj, '__code__'), obj
 | 
| 361 |       #assert path is None
 | 
| 362 |       assert line_num is None
 | 
| 363 | 
 | 
| 364 |       self.classes[id_] = Class(obj, mod_name, mod_path)
 | 
| 365 | 
 | 
| 366 |     elif cls is not None:
 | 
| 367 |       id_ = id(cls)
 | 
| 368 |       descriptor = self.classes[id_]
 | 
| 369 |       descriptor.AddMethod(obj, mod_name, mod_path, line_num)
 | 
| 370 | 
 | 
| 371 |     else:
 | 
| 372 |       self.functions.append((obj, ref, mod_name, mod_path, line_num))
 | 
| 373 | 
 | 
| 374 |     return True
 | 
| 375 | 
 | 
| 376 |   def Report(self, f=sys.stdout):
 | 
| 377 |     # Now categorize into files.  We couldn't do that earlier because classes
 | 
| 378 |     # don't know where they are located!
 | 
| 379 | 
 | 
| 380 |     py_srcs = collections.defaultdict(SourceFile)
 | 
| 381 |     c_srcs = collections.defaultdict(SourceFile)
 | 
| 382 | 
 | 
| 383 |     for func, ref, mod_name, mod_path, line_num in self.functions:
 | 
| 384 |       if mod_path:
 | 
| 385 |         py_srcs[mod_path].functions.append((func, ref, line_num))
 | 
| 386 |       else:
 | 
| 387 |         c_srcs[mod_name].functions.append((func, ref, line_num))
 | 
| 388 | 
 | 
| 389 |     for cls in self.classes.values():
 | 
| 390 |       #if cls.cls.__name__ == 'lex_mode_e':
 | 
| 391 |       #  log('!!! %s', cls)
 | 
| 392 | 
 | 
| 393 |       if cls.mod_path:
 | 
| 394 |         py_srcs[cls.mod_path].classes.append(cls)
 | 
| 395 |       else:
 | 
| 396 |         c_srcs[cls.mod_name].classes.append(cls)
 | 
| 397 | 
 | 
| 398 |     prefix = os.getcwd()
 | 
| 399 |     n = len(prefix) + 1
 | 
| 400 | 
 | 
| 401 |     #for name in sorted(py_srcs):
 | 
| 402 |     #  print(name)
 | 
| 403 | 
 | 
| 404 |     #return
 | 
| 405 | 
 | 
| 406 |     # Still missing: non-enum ASDL types?  Why?  CompoundObj?
 | 
| 407 |     # command_e is there, but command and SimpleCommand aren't.
 | 
| 408 |     # it's because we do
 | 
| 409 |     # ast.command_e vs. ast.SimpleCommand
 | 
| 410 |     # in both cases ast is a core.meta _AsdlModule?
 | 
| 411 | 
 | 
| 412 |     print('PYTHON CODE')
 | 
| 413 |     print()
 | 
| 414 | 
 | 
| 415 |     for path in sorted(py_srcs):
 | 
| 416 |       src = py_srcs[path]
 | 
| 417 | 
 | 
| 418 |       if path is not None and path.startswith(prefix):
 | 
| 419 |         path = path[n:]
 | 
| 420 | 
 | 
| 421 |       print('%s' % path)
 | 
| 422 | 
 | 
| 423 |       for func, ref, _ in src.functions:
 | 
| 424 |         third = func
 | 
| 425 |         #third = ''
 | 
| 426 |         #print('  %s [%s] %s' % (func.__name__, '.'.join(ref), third))
 | 
| 427 |         try:
 | 
| 428 |           print('  %s' % func.__name__)
 | 
| 429 |         except AttributeError: 
 | 
| 430 |           # In the CI, when we don't have fastlex, we get _MatchOshToken_Slow()
 | 
| 431 |           # instances
 | 
| 432 |           print("  NOTE: %s found at top level, but it's not a simple function" % func)
 | 
| 433 | 
 | 
| 434 |         PrintSig('    %s', func)
 | 
| 435 | 
 | 
| 436 |       classes = [(c.Name(), c) for c in src.classes]
 | 
| 437 |       classes.sort()
 | 
| 438 |       for c in src.classes:
 | 
| 439 |         c.Print()
 | 
| 440 | 
 | 
| 441 |       print()
 | 
| 442 | 
 | 
| 443 |     print('NATIVE CODE')
 | 
| 444 |     print()
 | 
| 445 | 
 | 
| 446 |     for mod_name in sorted(c_srcs):
 | 
| 447 |       src = c_srcs[mod_name]
 | 
| 448 | 
 | 
| 449 |       print('%s' % mod_name)
 | 
| 450 | 
 | 
| 451 |       for func, ref, _ in src.functions:
 | 
| 452 |         #third = func
 | 
| 453 |         third = ''
 | 
| 454 |         #print('  %s [%s] %s' % (func.__name__, '.'.join(ref), third))
 | 
| 455 |         print('  %s' % func.__name__)
 | 
| 456 | 
 | 
| 457 |       classes = [(c.Name(), c) for c in src.classes]
 | 
| 458 |       classes.sort()
 | 
| 459 |       for c in src.classes:
 | 
| 460 |         c.Print()
 | 
| 461 | 
 | 
| 462 |       print()
 | 
| 463 | 
 | 
| 464 | 
 | 
| 465 | class SourceFile(object):
 | 
| 466 | 
 | 
| 467 |   def __init__(self):
 | 
| 468 |     self.classes = []
 | 
| 469 |     self.functions = []
 | 
| 470 | 
 | 
| 471 | 
 | 
| 472 | def Walk(main, modules):
 | 
| 473 |   """Given a function main, finds all functions it transitively calls.
 | 
| 474 | 
 | 
| 475 |   Uses heuristic bytecode analysis.  Limitations:
 | 
| 476 | 
 | 
| 477 |   - functions that are local variables might not work?  But this should work:
 | 
| 478 | 
 | 
| 479 |   if x > 0:
 | 
| 480 |     f = GlobalFunc
 | 
| 481 |   else:
 | 
| 482 |     f = OtherGlobalFunc
 | 
| 483 |   f()    # The LOAD_GLOBAL will be found.
 | 
| 484 | 
 | 
| 485 |   Args:
 | 
| 486 |     main: function
 | 
| 487 |     modules: Dict[str, module]
 | 
| 488 |   """
 | 
| 489 |   syms = Symbols()
 | 
| 490 |   _Walk(main, None, ['main'], syms)
 | 
| 491 | 
 | 
| 492 |   # TODO:
 | 
| 493 |   # - co_consts should be unified?  So we know how big the const pool is.
 | 
| 494 | 
 | 
| 495 |   syms.Report()
 | 
| 496 | 
 |