devtools/tsv_column_from

OILS / devtools / tsv_column_from_files.py View on Github | oilshell.org

120 lines, 76 significant

1	#!/usr/bin/env python2
2	"""
3	tsv_column_from_files.py
4
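- Read the TSV file named on the command line and print a transformed copy
  to stdout.
- The transformation consists of
  - a source column name (--path-column), whose cells are file system paths,
    taken relative to the directory containing the TSV file
  - a new column name (--new-column), which replaces the source column name
    in the header
  - an optional regex (--extract-group-1) to extract from the files; without
    it, the whole file contents become the new cell value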
11	"""
12
13	from __future__ import print_function
14
15	import csv
16	import optparse
17	import os
18	import re
19	import sys
20
21
def _PrintNewRow(row, path_col_index, new_val):
    """Write one TSV line to stdout, substituting a single cell.

    The cell at position path_col_index is emitted as new_val; every other
    cell is emitted unchanged.  Cells are separated by tabs and the line ends
    with a newline.  If path_col_index is outside the row, nothing is
    substituted.
    """
    cells = [
        new_val if pos == path_col_index else original
        for pos, original in enumerate(row)
    ]
    # print() handles the tab separators and the trailing newline in one call.
    print(*cells, sep='\t')
33
34
def Options():
    """Build the command-line parser for this tool.

    Flags:
      --path-column      name of the column that has file system paths
      --new-column       name of the new column to create
      --extract-group-1  Python regex searched for in each file's contents;
                         its first group becomes the value
      --remove-commas    boolean flag; remove commas from the extracted value

    Returns:
      An optparse.OptionParser instance.  No defaults are set, so any flag
      that is not passed parses to None.
    """
    p = optparse.OptionParser('tsv_column_from_files.py FLAG* FILE')
    p.add_option(
        '--path-column', dest='path_column',
        help='The name of the column that has file system paths')
    p.add_option(
        '--new-column', dest='new_column',
        help='The name of the new column to create')
    p.add_option(
        '--extract-group-1', dest='extract_group_1',
        help="Search the file contents for this Python regex. Then extract the first group")
    # The help text previously named a non-existent flag, --extract-group1.
    p.add_option(
        '--remove-commas', dest='remove_commas', action='store_true',
        help='Remove commas from the value after --extract-group-1')
    return p
51
52
def main(argv):
    """Print the TSV file named on the command line with one column replaced.

    In the header, the cell equal to --path-column is replaced by
    --new-column.  In every later row, the cell in that column is joined onto
    the directory of the TSV file and the resulting file is read.  The cell
    is then replaced by group 1 of the --extract-group-1 regex (with commas
    removed if --remove-commas is given), or by the whole file contents when
    no regex is given.

    Args:
      argv: full argument vector; argv[0] is the program name and is skipped.

    Returns:
      0 on success, 2 (after printing usage) if no TSV path was given.

    Raises:
      RuntimeError: if the path column is missing from the header, a row is
        too short to contain the path column, the regex does not match a
        file's contents, or the new value contains a tab or newline.
    """
    p = Options()
    (opts, argv) = p.parse_args(argv[1:])

    # tsv_column_from_files.py \
    #   --path-column cachegrind_out_path \
    #   --new-column I_refs \
    #   --extract-group-1 'I[ ]*refs:[ ]+([\d,]+)' \  # it extracts the first group
    #   foo.tsv
    #
    # NOTE: QTT can allow commas like 1,000,000.  Like 1_000_000

    try:
        tsv_path = argv[0]
    except IndexError:
        p.print_usage()
        return 2

    # Paths in the TSV are resolved relative to the TSV file's own directory.
    base_dir = os.path.dirname(tsv_path)

    # Compile once instead of once per row.  re.VERBOSE ignores unescaped
    # whitespace in the pattern (presumably why the example above writes a
    # space as [ ]).
    pat = None
    if opts.extract_group_1:
        pat = re.compile(opts.extract_group_1, re.VERBOSE)

    path_col_index = -1

    with open(tsv_path) as f:
        for i, line in enumerate(f):
            # Strip only the line terminator.  A bare rstrip() would also
            # remove trailing tabs, i.e. silently drop empty trailing cells.
            row = line.rstrip('\r\n').split('\t')

            if i == 0:
                try:
                    path_col_index = row.index(opts.path_column)
                except ValueError:
                    raise RuntimeError('Expected %r in header %r' % (opts.path_column, row))
                _PrintNewRow(row, path_col_index, opts.new_column)
                continue  # skip to first row

            # Report a short row in the same style as other data errors
            # rather than letting an IndexError escape.
            if path_col_index >= len(row):
                raise RuntimeError(
                    'Line %d has %d cells, so it has no path column (index %d): %r'
                    % (i + 1, len(row), path_col_index, row))

            rel_path = row[path_col_index]
            cell_path = os.path.join(base_dir, rel_path)
            with open(cell_path) as f2:
                contents = f2.read()

            if pat is not None:
                m = pat.search(contents)
                if not m:
                    raise RuntimeError("Couldn't find %r in %r" % (opts.extract_group_1, contents))
                val = m.group(1)

                if opts.remove_commas:  # annoying hack for cachegrind output
                    val = val.replace(',', '')
            else:
                val = contents  # just use the whole file

            # A tab or newline inside the value would corrupt the output TSV.
            if '\t' in val or '\n' in val:
                raise RuntimeError("Found tab or newline in TSV cell %r" % val)

            _PrintNewRow(row, path_col_index, val)

    return 0
113
114
if __name__ == '__main__':
    # Script entry point: a RuntimeError raised by main() is reported on
    # stderr as a FATAL message and becomes exit status 1; otherwise the
    # process exits with whatever main() returned.
    try:
        status = main(sys.argv)
    except RuntimeError as e:
        print('%s FATAL: %s' % (sys.argv[0], e), file=sys.stderr)
        status = 1
    sys.exit(status)