1 | #!/usr/bin/python
|
2 | from __future__ import print_function
|
3 | """
|
4 | callgraph.py
|
5 | """
|
6 |
|
7 | import collections
|
8 | import dis
|
9 | import inspect
|
10 | import os
|
11 | import sys
|
12 |
|
13 | import __builtin__ # For looking up names
|
14 | import types
|
15 |
|
16 | from core import util
|
17 | log = util.log
|
18 |
|
19 |
|
20 | def Disassemble(co):
|
21 | """Given a code object, yield instructions of interest.
|
22 |
|
23 | Args:
|
24 | co: __code__ attribute
|
25 |
|
26 | Structure copied from misc/inspect_pyc.py, which was copied from
|
27 | dis.disassemble().
|
28 | """
|
29 | code = co.co_code
|
30 | extended_arg = 0 # Not used
|
31 |
|
32 | i = 0
|
33 | n = len(code)
|
34 | #log('\tLENGTH OF CODE %s: %d', co.co_name, n)
|
35 |
|
36 | while i < n:
|
37 | #log('\ti = %d, n = %d', i, n)
|
38 | op = ord(code[i])
|
39 | i += 1
|
40 |
|
41 | op_name = dis.opname[op]
|
42 |
|
43 | # operation is 1 byte, argument is 2 bytes.
|
44 | if op >= dis.HAVE_ARGUMENT:
|
45 | oparg = ord(code[i]) + ord(code[i+1])*256 + extended_arg
|
46 | extended_arg = 0
|
47 |
|
48 | if op == dis.EXTENDED_ARG:
|
49 | # Hm not used?
|
50 | raise AssertionError
|
51 | extended_arg = oparg*65536L
|
52 |
|
53 | i += 2
|
54 |
|
55 | const_name = None
|
56 | var_name = None
|
57 |
|
58 | if op in dis.hasconst:
|
59 | const_name = co.co_consts[oparg]
|
60 |
|
61 | elif op in dis.hasname:
|
62 | try:
|
63 | var_name = co.co_names[oparg]
|
64 | except IndexError:
|
65 | log('Error: %r cannot index %s with %d', op_name, co.co_names,
|
66 | oparg)
|
67 | raise
|
68 |
|
69 | elif op in dis.hasjrel:
|
70 | #raise AssertionError(op_name)
|
71 | pass
|
72 |
|
73 | elif op in dis.haslocal:
|
74 | #raise AssertionError(op_name)
|
75 | pass
|
76 |
|
77 | elif op in dis.hascompare:
|
78 | #raise AssertionError(op_name)
|
79 | pass
|
80 |
|
81 | elif op in dis.hasfree:
|
82 | #raise AssertionError(op_name)
|
83 | pass
|
84 |
|
85 | yield op_name, const_name, var_name
|
86 | #log('\t==> i = %d, n = %d', i, n)
|
87 |
|
88 |
|
89 | import sre_compile
|
90 |
|
91 | def _GetAttr(module, name):
|
92 | # Hack for bug in _fixup_range() ! (No longer in Python 3.6 head.)
|
93 | if module is sre_compile and name == 'l':
|
94 | return None
|
95 | # traceback.py has a hasattr() test
|
96 | if module is sys and name == 'tracebacklimit':
|
97 | return None
|
98 |
|
99 | try:
|
100 | val = getattr(module, name)
|
101 | except AttributeError:
|
102 | #log('%r not on %r', name, module)
|
103 | # This could raise too
|
104 | val = getattr(__builtin__, name)
|
105 | return val
|
106 |
|
107 |
|
108 | def _Walk(obj, cls, ref, syms):
|
109 | """
|
110 | Discover statically what (globally-accessible) functions and classes are
|
111 | used.
|
112 |
|
113 | Args:
|
114 | obj: a callable object
|
115 | cls: an optional class that it was attached to, could be None
|
116 | ref: the way the object was referenced from another place in the code
|
117 | syms: output Symbols()
|
118 |
|
119 | Something like this is OK:
|
120 | # def Adder(x):
|
121 | # def f(y):
|
122 | # return x + y
|
123 | # return f
|
124 |
|
125 | Because we'll still have access to the inner code object. We probably won't
|
126 | compile it though.
|
127 | """
|
128 | if syms.Seen(obj):
|
129 | return
|
130 |
|
131 | module_name = getattr(obj, '__module__', None)
|
132 | # NOTE: We get duplicate objects like this, due to bound methods on different
|
133 | # objects. Not sure how to fix it.
|
134 | #
|
135 | # OBJ <function Parse at 0x7fcd270b4de8>
|
136 | # OBJ <function Parse at 0x7fcd2709e500>
|
137 | # OBJ <built-in method get of dict object at 0x7fcd28c53280>
|
138 | # OBJ <built-in method get of dict object at 0x7fcd28c53398>
|
139 |
|
140 | #if obj.__name__ in ('write', 'get', 'Parse'):
|
141 | # log('OBJ %s %d', obj, id(obj))
|
142 | # pass
|
143 |
|
144 | if module_name is None:
|
145 | syms.Add(obj, None, ref, None, None, None)
|
146 | return # Can't walk anything
|
147 |
|
148 | module = sys.modules[obj.__module__]
|
149 |
|
150 | #print(obj)
|
151 | if hasattr(obj, '__code__'): # Builtins don't have bytecode.
|
152 | co = obj.__code__
|
153 | #path = co.co_filename
|
154 |
|
155 | mod_name = None
|
156 | mod_path = co.co_filename
|
157 |
|
158 | syms.Add(obj, cls, ref, mod_name, mod_path, co.co_firstlineno)
|
159 | else:
|
160 | mod_name = module.__name__
|
161 | try:
|
162 | mod_path = module.__file__
|
163 | if mod_path.endswith('.pyc'):
|
164 | mod_path = mod_path[:-1]
|
165 | except AttributeError:
|
166 | mod_path = None
|
167 |
|
168 | #if obj.__name__ == 'lex_mode_e':
|
169 | # log('!!! %s name %s path %s', obj, mod.__name__, mod.__file__)
|
170 |
|
171 | #mod_name = None
|
172 | #mod_path = None
|
173 |
|
174 | syms.Add(obj, cls, ref, mod_name, mod_path, None)
|
175 | return
|
176 |
|
177 | #log('\tNAME %s', val.__code__.co_name)
|
178 | #log('\tNAMES %s', val.__code__.co_names)
|
179 |
|
180 | # Most functions and classes we call are globals!
|
181 | #log('\t_Walk %s %s', obj, module)
|
182 | #log('\t%s', sorted(dir(module)))
|
183 |
|
184 | # Have to account for foo.Bar(), which gives this sequence:
|
185 | # 2 0 LOAD_GLOBAL 0 (foo)
|
186 | # 3 LOAD_ATTR 1 (Bar)
|
187 | # 6 CALL_FUNCTION 0
|
188 | #
|
189 | # Also: os.path.join().
|
190 |
|
191 | try:
|
192 | last_val = None # value from previous LOAD_GLOBAL or LOAD_ATTR
|
193 | ref = []
|
194 | g = Disassemble(obj.__code__)
|
195 |
|
196 | while True:
|
197 | op, const, var = g.next()
|
198 |
|
199 | if op == 'LOAD_GLOBAL':
|
200 | val = _GetAttr(module, var)
|
201 | ref = [var] # reset it
|
202 |
|
203 | elif op == 'LOAD_ATTR':
|
204 | #if last_val is not None and isinstance(last_val, types.ModuleType):
|
205 | if last_val is not None:
|
206 | #log('%s %s', op, var)
|
207 | val = _GetAttr(last_val, var)
|
208 | ref.append(var)
|
209 |
|
210 | # Crawl the methods below. Otherwise we get duplicate bound/unbound
|
211 | # methods, which have unique addresses.
|
212 | # Examples: WAIT_SPEC.Parse, sys.stdout.write
|
213 |
|
214 | # BUG: os.fork and sys.stdout.write are the same?
|
215 | # I thought os.fork is types.BuiltinFunctionType, and
|
216 | # sys.stdout.write is types.BuiltinMethodType, but why not?
|
217 |
|
218 | if isinstance(val, (types.MethodType, types.BuiltinMethodType)):
|
219 | val = None
|
220 | ref = []
|
221 |
|
222 | else:
|
223 | val = None
|
224 | ref = []
|
225 |
|
226 | else: # Some other opcode
|
227 | val = None
|
228 | ref = []
|
229 |
|
230 | if callable(val):
|
231 | #log('VAL %s module %s', val, val.__module__)
|
232 | # Recursive call.
|
233 | _Walk(val, None, ref, syms)
|
234 |
|
235 | # If the value is a class, walk its methods. Note that we assume ALL
|
236 | # methods are used. It's possible to narrow this down a bit and detect
|
237 | # unused methods.
|
238 | if isinstance(val, type):
|
239 | cls = val
|
240 | #log('type %s', val)
|
241 | for name in dir(val):
|
242 | # prevent error with __abstractmethods__ attribute
|
243 | #if name.startswith('__'):
|
244 | if name == '__abstractmethods__':
|
245 | continue
|
246 | field_val = getattr(val, name)
|
247 | #log('field_val %s', field_val)
|
248 | if isinstance(field_val, types.MethodType):
|
249 | func_obj = field_val.im_func
|
250 | _Walk(func_obj, cls, ref, syms)
|
251 |
|
252 | last_val = val # Used on next iteration
|
253 |
|
254 | except StopIteration:
|
255 | pass
|
256 |
|
257 | #log('\tDone _Walk %s %s', obj, module)
|
258 |
|
259 |
|
260 | def PrintSig(fmt, func):
|
261 | if os.getenv('CALLGRAPH_SIG') != '1':
|
262 | return
|
263 | try:
|
264 | argspec = inspect.getargspec(func)
|
265 | except TypeError:
|
266 | return
|
267 | parts = [':%s' % ' '.join(argspec.args)]
|
268 | # These are keyword-only args?
|
269 | if argspec.varargs:
|
270 | parts.append('varargs=%s' % (argspec.varargs,))
|
271 | if argspec.keywords:
|
272 | parts.append('kw=%s' % (argspec.keywords,))
|
273 | # Hm the default of 'None' is bad -- you can't distinguish a default of None,
|
274 | # which is very common!
|
275 | # It's better to get this by parsing the AST.
|
276 | if argspec.defaults and any(d is not None for d in argspec.defaults):
|
277 | parts.append('defaults=%s' % (argspec.defaults,))
|
278 |
|
279 | print(fmt % ' '.join(parts))
|
280 |
|
281 |
|
282 | class Class(object):
|
283 |
|
284 | def __init__(self, cls, mod_name, mod_path):
|
285 | self.cls = cls
|
286 | self.mod_name = mod_name
|
287 | self.mod_path = mod_path
|
288 | self.methods = []
|
289 |
|
290 | def Name(self):
|
291 | return self.cls.__name__
|
292 |
|
293 | def AddMethod(self, m, mod_name, mod_path, line_num):
|
294 | # Just assume the method is in the same file as the class itself.
|
295 | self.methods.append((m, mod_name, mod_path, line_num))
|
296 |
|
297 | def Print(self):
|
298 | base_names = ' '.join(c.__name__ for c in self.cls.__bases__)
|
299 | print(' %s(%s)' % (self.cls.__name__, base_names))
|
300 |
|
301 | methods = [(m.__name__, m) for (m, _, _, _) in self.methods]
|
302 | methods.sort()
|
303 | for name, m in methods:
|
304 | print(' %s' % name)
|
305 | PrintSig(' %s', m)
|
306 |
|
307 | def __repr__(self):
|
308 | return '<Class %s %s %s %s>' % (
|
309 | self.cls, self.mod_name, self.mod_path, self.methods)
|
310 |
|
311 |
|
312 | class Symbols(object):
|
313 | """A sink for discovered symbols."""
|
314 |
|
315 | def __init__(self):
|
316 | self.seen = set()
|
317 |
|
318 | self.classes = {} # integer id() -> Class
|
319 | self.functions = [] # list of callables
|
320 |
|
321 | self.paths = {} # path -> list of functions
|
322 |
|
323 | def Seen(self, c):
|
324 | """c: a callable."""
|
325 | id_ = id(c)
|
326 | if id_ in self.seen:
|
327 | return True
|
328 | self.seen.add(id_)
|
329 | return False
|
330 |
|
331 | def Add(self, obj, cls, ref, mod_name, mod_path, line_num):
|
332 | """Could be a function, class Constructor, or method.
|
333 | Can also be native (C) or interpreted (Python with __code__ attribute.)
|
334 |
|
335 | Returns:
|
336 | True if we haven't yet seen it.
|
337 | """
|
338 | if mod_path is not None:
|
339 | mod_path = os.path.normpath(mod_path)
|
340 |
|
341 | if isinstance(obj, type):
|
342 | id_ = id(obj)
|
343 |
|
344 | # NOTE: Python's classes don't have a __code__ object, which appears to
|
345 | # be an irregularity. So we have to get location information from the
|
346 | # METHODS.
|
347 | assert not hasattr(obj, '__code__'), obj
|
348 | #assert path is None
|
349 | assert line_num is None
|
350 |
|
351 | self.classes[id_] = Class(obj, mod_name, mod_path)
|
352 |
|
353 | elif cls is not None:
|
354 | id_ = id(cls)
|
355 | descriptor = self.classes[id_]
|
356 | descriptor.AddMethod(obj, mod_name, mod_path, line_num)
|
357 |
|
358 | else:
|
359 | self.functions.append((obj, ref, mod_name, mod_path, line_num))
|
360 |
|
361 | return True
|
362 |
|
363 | def Report(self, f=sys.stdout):
|
364 | # Now categorize into files. We couldn't do that earlier because classes
|
365 | # don't know where they are located!
|
366 |
|
367 | py_srcs = collections.defaultdict(SourceFile)
|
368 | c_srcs = collections.defaultdict(SourceFile)
|
369 |
|
370 | for func, ref, mod_name, mod_path, line_num in self.functions:
|
371 | if mod_path:
|
372 | py_srcs[mod_path].functions.append((func, ref, line_num))
|
373 | else:
|
374 | c_srcs[mod_name].functions.append((func, ref, line_num))
|
375 |
|
376 | for cls in self.classes.values():
|
377 | #if cls.cls.__name__ == 'lex_mode_e':
|
378 | # log('!!! %s', cls)
|
379 |
|
380 | if cls.mod_path:
|
381 | py_srcs[cls.mod_path].classes.append(cls)
|
382 | else:
|
383 | c_srcs[cls.mod_name].classes.append(cls)
|
384 |
|
385 | prefix = os.getcwd()
|
386 | n = len(prefix) + 1
|
387 |
|
388 | #for name in sorted(py_srcs):
|
389 | # print(name)
|
390 |
|
391 | #return
|
392 |
|
393 | # Still missing: non-enum ASDL types? Why? CompoundObj?
|
394 | # command_e is there, but command and SimpleCommand aren't.
|
395 | # it's because we do
|
396 | # ast.command_e vs. ast.SimpleCommand
|
397 | # in both cases ast is a osh/meta _AsdlModule?
|
398 |
|
399 | print('PYTHON CODE')
|
400 | print()
|
401 |
|
402 | for path in sorted(py_srcs):
|
403 | src = py_srcs[path]
|
404 |
|
405 | if path is not None and path.startswith(prefix):
|
406 | path = path[n:]
|
407 |
|
408 | print('%s' % path)
|
409 |
|
410 | for func, ref, _ in src.functions:
|
411 | #third = func
|
412 | third = ''
|
413 | #print(' %s [%s] %s' % (func.__name__, '.'.join(ref), third))
|
414 | print(' %s' % func.__name__)
|
415 | PrintSig(' %s', func)
|
416 |
|
417 | classes = [(c.Name(), c) for c in src.classes]
|
418 | classes.sort()
|
419 | for c in src.classes:
|
420 | c.Print()
|
421 |
|
422 | print()
|
423 |
|
424 | print('NATIVE CODE')
|
425 | print()
|
426 |
|
427 | for mod_name in sorted(c_srcs):
|
428 | src = c_srcs[mod_name]
|
429 |
|
430 | print('%s' % mod_name)
|
431 |
|
432 | for func, ref, _ in src.functions:
|
433 | #third = func
|
434 | third = ''
|
435 | #print(' %s [%s] %s' % (func.__name__, '.'.join(ref), third))
|
436 | print(' %s' % func.__name__)
|
437 |
|
438 | classes = [(c.Name(), c) for c in src.classes]
|
439 | classes.sort()
|
440 | for c in src.classes:
|
441 | c.Print()
|
442 |
|
443 | print()
|
444 |
|
445 |
|
446 | class SourceFile(object):
|
447 |
|
448 | def __init__(self):
|
449 | self.classes = []
|
450 | self.functions = []
|
451 |
|
452 |
|
453 | def Walk(main, modules):
|
454 | """Given a function main, finds all functions it transitively calls.
|
455 |
|
456 | Uses heuristic bytecode analysis. Limitations:
|
457 |
|
458 | - functions that are local variables might not work? But this should work:
|
459 |
|
460 | if x > 0:
|
461 | f = GlobalFunc
|
462 | else:
|
463 | f = OtherGlobalFunc
|
464 | f() # The LOAD_GLOBAL will be found.
|
465 |
|
466 | Args:
|
467 | main: function
|
468 | modules: Dict[str, module]
|
469 | """
|
470 | syms = Symbols()
|
471 | _Walk(main, None, ['main'], syms)
|
472 |
|
473 | # TODO:
|
474 | # - co_consts should be unified? So we know how big the const pool is.
|
475 |
|
476 | syms.Report()
|
477 |
|