OILS / devtools / tsv_column_from_files.py View on Github | oilshell.org

120 lines, 76 significant
1#!/usr/bin/env python2
2"""
3tsv_column_from_files.py
4
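- Read the TSV file named on the command line and apply one transformation,
  printing the resulting TSV to stdout.
- A transformation consists of
  - a source column name (--path-column), whose cells contain file system paths
  - a base directory for those paths, taken from the directory of the TSV file
  - a new column name (--new-column), which replaces the source column
  - an optional regex (--extract-group-1) to extract from the files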
11"""
12
13from __future__ import print_function
14
15import csv
16import optparse
17import os
18import re
19import sys
20
21
def _PrintNewRow(row, path_col_index, new_val):
    """Write `row` to stdout as a single tab-separated line.

    The cell at position `path_col_index` is printed as `new_val`; every
    other cell is printed unchanged.  If the index matches no cell, the row
    is printed as is.
    """
    cells = [
        new_val if pos == path_col_index else cell
        for pos, cell in enumerate(row)
    ]
    # print() converts each cell itself, joins them with tabs and ends the line.
    print(*cells, sep='\t')
33
34
def Options():
    """Build the command-line option parser for this script.

    Returns:
      An optparse.OptionParser that accepts --path-column, --new-column,
      --extract-group-1 and --remove-commas.
    """
    p = optparse.OptionParser('tsv_column_from_files.py FLAG* FILE')
    p.add_option(
        '--path-column', dest='path_column',
        help='The name of the column that has file system paths')
    p.add_option(
        '--new-column', dest='new_column',
        help='The name of the new column to create')
    p.add_option(
        '--extract-group-1', dest='extract_group_1',
        help="Search the file contents for this Python regex. Then extract the first group")
    p.add_option(
        '--remove-commas', dest='remove_commas', action='store_true',
        # Refer to the flag by its real spelling, --extract-group-1.
        help='Remove commas from the value after --extract-group-1')
    return p
51
52
def main(argv):
    """Replace a column of file paths in a TSV with values read from those files.

    The TSV named by the FILE argument is read line by line.  The header cell
    equal to --path-column is replaced by --new-column.  In every later row,
    the cell in that column is treated as a path relative to the directory of
    the TSV file; the file is read and the cell is replaced either by group 1
    of the --extract-group-1 regex (optionally with commas removed) or, when
    no regex is given, by the whole file contents.  Rows are printed to stdout.

    Args:
      argv: full argument list, including the program name at argv[0].

    Returns:
      0 on success, 2 if the FILE argument is missing.

    Raises:
      RuntimeError: if --new-column is missing, the regex has no capturing
        group, the path column is not in the header, a row is too short to
        contain the path column, the regex does not match a file, or the new
        value contains a tab or newline.
    """
    p = Options()
    (opts, argv) = p.parse_args(argv[1:])

    # tsv_column_from_files.py \
    #   --path-column cachegrind_out_path \
    #   --new-column I_refs \
    #   --extract-group-1 'I[ ]*refs:[ ]+([\d,]+)' \  # it extracts the first group
    #   foo.tsv
    #
    # NOTE: QTT can allow commas like 1,000,000.  Like 1_000_000

    try:
        tsv_path = argv[0]
    except IndexError:
        p.print_usage()
        return 2

    if opts.new_column is None:
        # Otherwise the header cell would silently be printed as "None".
        raise RuntimeError('--new-column is required')

    # Compile once, outside the row loop.  re.VERBOSE ignores whitespace in
    # the pattern, which is why the example above writes spaces as [ ].
    pat = None
    if opts.extract_group_1:
        pat = re.compile(opts.extract_group_1, re.VERBOSE)
        if pat.groups < 1:
            raise RuntimeError(
                'Expected a capturing group in regex %r' % opts.extract_group_1)

    # Paths in the TSV are resolved relative to the TSV file's own directory.
    base_dir = os.path.dirname(tsv_path)

    path_col_index = -1

    with open(tsv_path) as f:
        for i, line in enumerate(f):
            # Strip only the line terminator.  A bare rstrip() would also eat
            # trailing tabs and drop empty cells at the end of the row.
            line = line.rstrip('\r\n')

            row = line.split('\t')
            if i == 0:
                try:
                    path_col_index = row.index(opts.path_column)
                except ValueError:
                    raise RuntimeError(
                        'Expected %r in header %r' % (opts.path_column, row))
                _PrintNewRow(row, path_col_index, opts.new_column)
                continue  # skip to first row

            if path_col_index >= len(row):
                raise RuntimeError(
                    'Expected at least %d columns in row %r'
                    % (path_col_index + 1, row))

            rel_path = row[path_col_index]
            cell_path = os.path.join(base_dir, rel_path)
            with open(cell_path) as f2:
                contents = f2.read()

            if pat is not None:
                m = pat.search(contents)
                if not m:
                    raise RuntimeError(
                        "Couldn't find %r in %r" % (opts.extract_group_1, contents))
                val = m.group(1)

                if opts.remove_commas:  # annoying hack for cachegrind output
                    val = val.replace(',', '')
            else:
                val = contents  # just use the whole file

            # A tab or newline inside a cell would corrupt the output TSV,
            # whichever way the value was obtained.
            if '\t' in val or '\n' in val:
                raise RuntimeError("Found tab or newline in TSV cell %r" % val)

            _PrintNewRow(row, path_col_index, val)

    return 0
113
114
if __name__ == '__main__':
    # Script entry point: run main() and exit with its return value.  A
    # RuntimeError is reported on stderr as a FATAL message and becomes
    # exit status 1.
    try:
        status = main(sys.argv)
    except RuntimeError as err:
        print('%s FATAL: %s' % (sys.argv[0], err), file=sys.stderr)
        status = 1
    sys.exit(status)