OILS / build / cpython_defs.py View on Github | oilshell.org

515 lines, 335 significant
1#!/usr/bin/env python2
2"""
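cpython_defs.py

Lex and parse the PyMethodDef tables read from stdin, then list them or
rewrite them with unneeded methods filtered out.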
4"""
5from __future__ import print_function
6
7import errno
8import os
9import re
10import sys
11
12from mycpp.mylib import log
13# TODO: Could move these to a place where they don't depend on Oil
14from frontend.lexer_def import C, R
15
16
# Token table for Lexer: entries are unpacked by _CompileAll() as
# (is_regex, pattern, token id).  Presumably R() builds a regex entry and C()
# a literal one (literals are re.escape()d before compiling) -- confirm in
# frontend/lexer_def.py.
#
# Order matters: Lexer.Tokens() tries the patterns top to bottom and takes the
# FIRST one that matches.  When a pattern has a capture group, only group 1
# becomes the token text (e.g. the name in BeginDef, the contents of a Str).
C_DEF = [
  # Lines starting with '#'; Parser.Next() skips Comment tokens.
  R(r'#.*', 'Comment'),
  R(r'[ \t\n]+', 'Whitespace'),

  # This could be more space-insensitive.
  R(r'static.*PyMethodDef (.*)\[\]\s*=\s*', 'BeginDef'),
  C(r'{', 'LBrace'),
  C(r'}', 'RBrace'),
  C(r',', 'Comma'),
  C(r';', 'Semi'),
  R(r'"([^"]*)"', 'Str'),
  # Header marker; Parser.ParseHeader() expects an Opaque path right after it.
  C(r'FILE', 'FILE'),
  C(r'PyDoc_STR(', 'LDocStr'),
  C(r')', 'RDocStr'),
  # Catch-all: anything up to the next comma, closing brace or newline.
  R(r'[^,}\n]+', 'Opaque'),
]
33
34
35# NOTE: This is copied from osh/match.py because we don't have 're' there.
36def _CompileAll(pat_list):
37 result = []
38 for is_regex, pat, token_id in pat_list:
39 if not is_regex:
40 pat = re.escape(pat) # turn $ into \$
41 result.append((re.compile(pat), token_id))
42 return result
43
44
class Lexer(object):
  """Regex tokenizer that always takes the first pattern that matches."""

  def __init__(self, pat_list):
    """pat_list: (is_regex, pattern, token id) triples, in priority order."""
    self.pat_list = _CompileAll(pat_list)

  def Tokens(self, s):
    """Generate (token id, token text, offset) triples for the string s.

    - Patterns are tried in table order; the first one that matches wins.
    - If the winning pattern has a capture group, the token text is group 1,
      otherwise it is the whole match.
    - The offset is where the match ENDS, i.e. where lexing resumes.
    - 'Whitespace' tokens are swallowed; every other token is yielded.
    - The stream always finishes with ('EOF', '', -1).

    Raises:
      AssertionError: if no pattern matches at the current offset.
    """
    total = len(s)
    cursor = 0
    while cursor < total:
      hit = None
      for regex, kind in self.pat_list:
        hit = regex.match(s, cursor)
        if hit is not None:
          break  # FIRST MATCH wins

      if hit is None:
        raise AssertionError(
            'no token matched at position %r: %r' % ( cursor, s[cursor]))

      group = 1 if hit.groups() else 0
      text = s[hit.start(group):hit.end(group)]
      cursor = hit.end()

      if kind != 'Whitespace':
        yield kind, text, cursor

    yield 'EOF', '', -1
70
71
class Parser(object):
  """Parser for C PyMethodDef initializer lists.

  Consumes an iterator of (token id, token text, byte offset) triples, as
  produced by Lexer.Tokens(), which must end with an 'EOF' token.  All syntax
  errors are reported as RuntimeError.
  """

  def __init__(self, tokens):
    """tokens: any iterator of (tok_id, tok_val, pos) triples."""
    self.tokens = tokens
    self.Next()  # initialize

  def Next(self):
    """Advance to the next significant token, skipping comments/whitespace."""
    while True:
      # The builtin next() works with any iterator on Python 2.6+ and 3,
      # unlike the Python 2-only .next() method.
      self.tok_id, self.tok_val, self.pos = next(self.tokens)
      if self.tok_id not in ('Comment', 'Whitespace'):
        break
    if 0:
      log('%s %r', self.tok_id, self.tok_val)

  def Eat(self, tok_id):
    """Require the current token to be tok_id, then advance past it.

    Raises:
      RuntimeError: if the current token has a different id.
    """
    if self.tok_id != tok_id:
      raise RuntimeError(
          'Expected %r, got %r %r (byte offset %d)' %
          (tok_id, self.tok_id, self.tok_val, self.pos))

    self.Next()

  def ParseName(self):
    """
    Name = Str | Opaque('NULL') | Opaque('0')

    Returns the method name, or None for a NULL / 0 sentinel entry.
    """
    if self.tok_id == 'Str':
      name = self.tok_val
    elif self.tok_id == 'Opaque':
      # An explicit check rather than 'assert': asserts vanish under -O, which
      # would silently turn any opaque name into a null entry.
      if self.tok_val not in ('NULL', '0'):
        raise RuntimeError(
            'Expected NULL or 0 as the entry name, got %r (byte offset %d)' %
            (self.tok_val, self.pos))
      name = None
    else:
      raise RuntimeError('Unexpected token %r' % self.tok_id)
    self.Next()
    return name

  def ParseVal(self):
    """
    Val = Str
        | Opaque
        | LDocStr Str+ RDocStr  # string concatenation happens

    Returns the value text; adjacent strings in a doc string are joined.
    """
    if self.tok_id == 'LDocStr':
      self.Next()

      val = self.tok_val
      self.Eat('Str')
      while self.tok_id == 'Str':
        val += self.tok_val
        self.Next()

      self.Eat('RDocStr')

    elif self.tok_id in ('Opaque', 'Str'):
      val = self.tok_val
      self.Next()

    else:
      raise RuntimeError('Unexpected token %r' % self.tok_id)

    return val

  def ParseItem(self):
    """
    Item = '{' Name (',' Val)+ '}' ','?

    Returns (name, vals), where vals is the list of values after the name.
    """
    self.Eat('LBrace')
    name = self.ParseName()

    vals = []
    while self.tok_id == 'Comma':
      self.Next()
      vals.append(self.ParseVal())

    self.Eat('RBrace')

    if self.tok_id == 'Comma':  # Optional
      self.Next()

    return name, vals

  def ParseDef(self):
    """
    Def = BeginDef '{' Item+ '}' ';'

    Returns (def_name, items), where items is a list of ParseItem() results.
    """
    def_name = self.tok_val
    self.Eat('BeginDef')
    self.Eat('LBrace')

    items = []
    while self.tok_id != 'RBrace':
      items.append(self.ParseItem())

    self.Next()  # the closing '}'
    self.Eat('Semi')

    return (def_name, items)

  def ParseHeader(self):
    """Header = FILE Opaque.  Returns the Opaque text (the file path)."""
    self.Eat('FILE')
    path = self.tok_val
    self.Eat('Opaque')
    return path

  def ParseFile(self):
    """
    File = Header Def*

    Returns (path, defs), where defs is a list of ParseDef() results.
    """
    path = self.ParseHeader()
    defs = []
    while self.tok_id not in ('FILE', 'EOF'):
      defs.append(self.ParseDef())

    return path, defs

  def ParseStream(self):
    """
    Stream = File*

    Returns a list of ParseFile() results.
    """
    files = []
    while self.tok_id != 'EOF':
      files.append(self.ParseFile())

    return files
197
198
def PrettyPrint(rel_path, def_name, entries, predicate, f, stats):
  """Write one PyMethodDef table to f as C source, minus filtered methods.

  Args:
    rel_path: path of the C file the table came from (passed to predicate).
    def_name: name of the C array.
    entries: iterable of (entry name or None, values) pairs.  A None name is
      reprinted as the {0} null initializer.  For named entries, values[0] is
      the C function and values[1] the flags; any docstring is omitted.
    predicate: callable(rel_path, def_name, entry_name); a false result drops
      the entry.
    f: file object to write to.
    stats: dict whose 'num_methods' and 'num_filtered' counters are updated.
  """
  def emit(text):
    print(text, file=f, end='')

  emit('static PyMethodDef %s[] = {\n' % (def_name,))

  for entry_name, vals in entries:
    if entry_name is None:
      emit(' {0},\n')  # null initializer
      continue

    stats['num_methods'] += 1

    if predicate(rel_path, def_name, entry_name):
      # Reprint the definition, but omit the docstring.
      emit(' {"%s", ' % (entry_name,))
      emit(vals[0])  # The C function
      emit(', ')
      emit(vals[1])  # The flags
      emit('},\n')
    else:
      stats['num_filtered'] += 1

  emit('};\n')
223
224
# Basenames of the C files that get the generic filter in
# OilMethodFilter.__call__: a method from one of these files is dropped when
# its name is not in py_names.  The 'filtered' action of main() prints this
# list one name per line, in this order.
MODULES_TO_FILTER = [
  # My Own
  'libc.c',
  'fastlex.c',
  'line_input.c',

  'import.c',
  'marshal.c',  # additional filters below
  #'zipimport.c',  # Cannot filter this because find_module called from C!

  # Types for Builtins
  'enumobject.c',
  'rangeobject.c',

  # Interpreter types
  'descrobject.c',
  'exceptions.c',
  'structseq.c',
  '_warnings.c',

  # Control flow
  'frameobject.c',
  'genobject.c',
  'iterobject.c',

  # GC
  '_weakref.c',
  'weakrefobject.c',
  'gcmodule.c',

  # "Data types"
  #'boolobject.c',  # No defs
  'cStringIO.c',
  'dictobject.c',
  'fileobject.c',
  'floatobject.c',
  'intobject.c',
  'listobject.c',
  'longobject.c',
  #'moduleobject.c',  # No defs
  'setobject.c',
  'stringobject.c',
  'tupleobject.c',
  'sliceobject.c',
  'typeobject.c',

  # Builtins
  'bltinmodule.c',  # additional filters below
  #'sysmodule.c',  # Filtered below
  # (sysmodule.c has its own rule in OilMethodFilter.__call__.)

  # Libraries
  'errnomodule.c',  # has no methods, but include it for completeness
  'fcntlmodule.c',
  'posixmodule.c',
  'pwdmodule.c',
  'readline.c',
  'resource.c',
  'signalmodule.c',
  'timemodule.c',
  'termios.c',
]
286
287
class OilMethodFilter(object):
  """Callable predicate: should this method of a PyMethodDef table be kept?

  Called as filter(rel_path, def_name, method_name); returns True to keep the
  method and False to drop it.  Rules are checked in this order:

    1. names dropped or kept regardless of the file,
    2. per-file drop and keep lists (keyed by the basename of rel_path),
    3. special handling of sysmodule.c and descrobject.c,
    4. the generic rule for files listed in MODULES_TO_FILTER.
  """

  # Dropped no matter which file defines them.
  _DROP_EVERYWHERE = (
      'count',  # False positive for {str,list,tuple}.count()
      'collect',  # False positive: pyannotate and gcmodule.c
  )

  # Kept no matter which file defines them.
  # Notes:
  # - __reduce__ and __setstate__ are for pickle. And I think
  #   __getnewargs__.
  # - Do we need __sizeof__? Is that for sys.getsizeof()?
  _KEEP_EVERYWHERE = (
      # enter/exit needed for 'with open'. __length_hint__ is an optimization.
      '__enter__', '__exit__', '__length_hint__',
      # 5/2022: avoid regression? Not sure why this was getting deleted
      '__getitem__',
  )

  # basename -> method names that are always dropped from that file.
  _DROP_IN_FILE = {
      # NOTE: LoadYshGrammar needs marshal.loads().
      # False positive for yajl.dumps() and load()
      'marshal.c': ('dump', 'dumps', 'load'),

      # Auto-filtering gave false-positives here.
      # We don't need top-level next(). The method should be good enough.
      # iter is a field name
      'bltinmodule.c': (
          'compile', 'format', 'next', 'vars', 'iter', 'eval', 'bin'),

      '_warnings.c': ('warn',),
      'dictobject.c': (
          'iterkeys', 'itervalues', 'copy', 'fromkeys', 'popitem',
          'setdefault'),
      'tupleobject.c': ('index',),

      # false positive from arg.translate
      'stringobject.c': ('translate',),

      # clear/remove: we're using dict.clear() and list.remove()
      'setobject.c': ('pop', 'copy', 'clear', 'remove'),
      'frozensetobject.c': ('copy',),
      'sliceobject.c': ('indices',),

      # Shadowed by fanos.send(), posix.close(), etc.
      'genobject.c': ('send', 'close'),

      # Shadowed: we're using list.remove()
      'posixmodule.c': ('remove',),
  }

  # basename -> method names that are always kept from that file.
  _KEEP_IN_FILE = {
      # Get "bootstrapping error" without this.
      'bltinmodule.c': ('__import__',),
      # This one is called from C.
      'signalmodule.c': ('default_int_handler',),
      # segfault without this
      'typeobject.c': ('__new__',),
  }

  def __init__(self, py_names):
    """py_names: container of names, only ever tested with 'in'."""
    self.py_names = py_names

  def __call__(self, rel_path, def_name, method_name):
    basename = os.path.basename(rel_path)

    if method_name in self._DROP_EVERYWHERE:
      return False
    if method_name in self._KEEP_EVERYWHERE:
      return True

    if method_name in self._DROP_IN_FILE.get(basename, ()):
      return False
    if method_name in self._KEEP_IN_FILE.get(basename, ()):
      return True

    # Do custom filtering here.
    if basename == 'sysmodule.c':
      # These can't be removed or they cause assertions!
      removable = method_name not in ('displayhook', 'excepthook')
      if method_name not in self.py_names and removable:
        return False

    if basename == 'descrobject.c':
      # Apparently used for dir() on class namespace, as in dir(Id).
      return method_name == 'keys'

    # Try just filtering {time,pwd,posix}module.c, etc.
    if basename in MODULES_TO_FILTER and method_name not in self.py_names:
      return False

    #log('= %s %s', def_name, method_name)

    # A stricter rule is intentionally disabled here: if a name doesn't appear
    # in the .py source, it can't be used. (Exception: it could be used in C
    # source with dynamic lookup? But I don't think CPython does that.)  That
    # would be 'if method_name not in self.py_names: return False'.
    return True
400
401
_ACTIONS = ('lex', 'audit', 'filter', 'tsv', 'filtered')


def _ReadPyNames(path):
  """Return the set of whitespace-stripped lines of the file at path."""
  with open(path) as f:
    return set(line.strip() for line in f)


def _PrintAudit(files, method_filter):
  """Print the methods that survive filtering, indented under file and def."""
  for rel_path, defs in files:
    print(rel_path)
    for def_name, entries in defs:
      print('\t' + def_name)
      for method_name, vals in entries:
        if method_name is None:
          continue
        if not method_filter(rel_path, def_name, method_name):
          continue
        print('\t\t%s %s' % (method_name, vals))


def _WriteFilteredDefs(files, method_filter, out_dir):
  """Write out_dir/<rel_path>/<def_name>.def for every def, then log totals."""
  stats = {'num_methods': 0, 'num_defs': 0, 'num_filtered': 0}
  for rel_path, defs in files:
    # Make a directory for each .c file!  Each file is a def.
    c_dir = os.path.join(out_dir, rel_path)
    try:
      os.makedirs(c_dir)
    except OSError as e:
      if e.errno != errno.EEXIST:  # an existing directory is fine
        raise

    for def_name, entries in defs:
      out_path = os.path.join(c_dir, '%s.def' % def_name)

      # TODO: Write a separate file here for each one.  We have to include a
      # different file at each definition.

      with open(out_path, 'w') as f:
        print('// %s' % rel_path, file=f)
        print('', file=f)
        PrettyPrint(rel_path, def_name, entries, method_filter, f, stats)

      stats['num_defs'] += 1
      log('Wrote %s', out_path)

  stats['num_left'] = stats['num_methods'] - stats['num_filtered']
  log('cpython_defs.py: Filtered %(num_filtered)d of %(num_methods)d methods, '
      'leaving %(num_left)d (from %(num_defs)d definitions)' % stats)


def _PrintTsv(files, method_filter):
  """Print one TSV row per named method, with a T/F 'used' column."""
  header = [
      'file', 'def_name', 'py_method_name', 'c_symbol_name', 'flags',
      'used'
  ]
  print('\t'.join(header))
  for rel_path, defs in files:
    for def_name, entries in defs:
      for method_name, vals in entries:
        if method_name is None:
          continue
        b = method_filter(rel_path, def_name, method_name)
        used = 'T' if b else 'F'

        # TODO: The c_symbol_name could be parsed better.  It sometimes has
        # "(PyCFunction)" on the front of it.

        row = [rel_path, def_name, method_name, vals[0], vals[1], used]
        print('\t'.join(row))


def main(argv):
  """Command line entry point.

  Usage: cpython_defs.py ACTION [PY_NAMES [OUT_DIR]]

  Actions:
    lex       print the tokens lexed from stdin (for debugging)
    audit     print the methods left after filtering (for debugging)
    filter    write filtered .def files under OUT_DIR (for slimming the build)
    tsv       print a TSV table of all methods with a 'used' column
    filtered  print MODULES_TO_FILTER; reads neither stdin nor PY_NAMES

  PY_NAMES is a file with one name per line; it is required by audit, filter
  and tsv.  OUT_DIR is required by filter.

  Raises:
    RuntimeError: for a missing or invalid action, or a missing argument.
      These are checked before stdin is read.
  """
  if len(argv) < 2:
    raise RuntimeError(
        'Usage: cpython_defs.py ACTION [PY_NAMES [OUT_DIR]] (ACTION: %s)' %
        ', '.join(_ACTIONS))

  action = argv[1]
  if action not in _ACTIONS:
    raise RuntimeError('Invalid action %r' % action)

  if action == 'filtered':
    for name in MODULES_TO_FILTER:
      print(name)
    return

  method_filter = None
  if len(argv) > 2:
    method_filter = OilMethodFilter(_ReadPyNames(argv[2]))

  if action in ('audit', 'filter', 'tsv') and method_filter is None:
    raise RuntimeError('Action %r requires a py_names file argument' % action)

  out_dir = None
  if action == 'filter':
    if len(argv) < 4:
      raise RuntimeError('Action %r requires an out_dir argument' % action)
    out_dir = argv[3]

  tokens = Lexer(C_DEF).Tokens(sys.stdin.read())

  if action == 'lex':  # for debugging
    # The token stream ends right after its EOF token, so a plain loop prints
    # everything up to and including EOF.
    for id_, value, _ in tokens:
      print('%s\t%r' % (id_, value))
    return

  files = Parser(tokens).ParseStream()

  if action == 'audit':  # show after filtering, for debugging
    _PrintAudit(files, method_filter)
  elif action == 'filter':  # for slimming the build down
    _WriteFilteredDefs(files, method_filter, out_dir)
  else:  # 'tsv'
    _PrintTsv(files, method_filter)
508
509
# Script entry point.
if __name__ == '__main__':
  # RuntimeError is how this script signals fatal problems (parse errors,
  # invalid actions): print a one-line message to stderr and exit with status
  # 1.  Any other exception propagates with a traceback.
  try:
    main(sys.argv)
  except RuntimeError as e:
    print('FATAL: %s' % e, file=sys.stderr)
    sys.exit(1)