1 | #!/usr/bin/env python2
|
2 | """
|
3 | tsv_column_from_files.py
|
4 |
|
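- Read the TSV file named on the command line and apply one transformation
  to it, printing the resulting TSV on stdout.
- A transformation consists of
  - a source column name (--path-column), which contains file system paths
  - a base directory for those paths; it is derived from the directory of
    the TSV file (no --base-dir flag is implemented)
  - a new column name (--new-column), which replaces the source column
  - an optional regex to extract from the files (--extract-group-1); without
    it, the whole file contents become the cell value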
|
11 | """
|
12 |
|
13 | from __future__ import print_function
|
14 |
|
15 | import csv
|
16 | import optparse
|
17 | import os
|
18 | import re
|
19 | import sys
|
20 |
|
21 |
|
def _PrintNewRow(row, path_col_index, new_val):
    """Write one tab-separated row to stdout with a single cell swapped out.

    Args:
      row: sequence of cells for the row.
      path_col_index: position of the cell to substitute.  An index outside
        the row leaves every cell untouched.
      new_val: value printed in place of the cell at path_col_index.
    """
    # Build the substituted row first, then let print() do the joining so
    # every cell is stringified exactly as a plain print(cell) would.
    cells = [
        new_val if col == path_col_index else value
        for col, value in enumerate(row)
    ]
    print(*cells, sep='\t')
|
33 |
|
34 |
|
def Options():
    """Build the command-line parser for this tool.

    Returns:
      An optparse.OptionParser accepting --path-column, --new-column,
      --extract-group-1 and --remove-commas.  The positional FILE argument is
      not declared as an option; parse_args() leaves it in the remaining
      arguments.
    """
    p = optparse.OptionParser('tsv_column_from_files.py FLAG* FILE')
    p.add_option(
        '--path-column', dest='path_column',
        help='The name of the column that has file system paths')
    p.add_option(
        '--new-column', dest='new_column',
        help='The name of the new column to create')
    p.add_option(
        '--extract-group-1', dest='extract_group_1',
        help="Search the file contents for this Python regex. Then extract the first group")
    p.add_option(
        '--remove-commas', dest='remove_commas', action='store_true',
        # Explicit default so the attribute is a real boolean, not None.
        default=False,
        # The help text used to name a non-existent flag (--extract-group1).
        help='Remove commas from the value after --extract-group-1')
    return p
|
51 |
|
52 |
|
def main(argv):
    """Replace a column of file paths in a TSV file with data from those files.

    The TSV file named by the first positional argument is read.  In the
    header row, the cell equal to --path-column is replaced by --new-column.
    In every following row, that cell is treated as a path relative to the
    directory of the TSV file; the file is read and the cell is replaced by
    group 1 of the --extract-group-1 regex (optionally with commas removed),
    or by the whole file contents when no regex is given.  Each resulting
    row is printed to stdout.

    Example:
      tsv_column_from_files.py \
        --path-column cachegrind_out_path \
        --new-column I_refs \
        --extract-group-1 'I[ ]*refs:[ ]+([\\d,]+)' \
        foo.tsv

    NOTE: QTT can allow commas like 1,000,000.  Like 1_000_000

    Args:
      argv: full argument vector; argv[0] (the program name) is skipped.

    Returns:
      0 on success, 2 if the positional FILE argument is missing.

    Raises:
      RuntimeError: if --path-column or --new-column is missing, the path
        column is not in the header, a row is too short to contain the path
        column, the regex is invalid / has no group / does not match, or the
        new cell value contains a tab or newline.
    """
    p = Options()
    (opts, argv) = p.parse_args(argv[1:])

    try:
        tsv_path = argv[0]
    except IndexError:
        p.print_usage()
        return 2

    # Without these the header would be searched for None, or the new header
    # cell would literally be printed as "None".
    if not opts.path_column:
        raise RuntimeError('--path-column is required')
    if not opts.new_column:
        raise RuntimeError('--new-column is required')

    # Compile once, before the loop, rather than once per row.  The pattern
    # is compiled with re.VERBOSE, so literal spaces must be written as [ ]
    # (see the example in the docstring).
    pat = None
    if opts.extract_group_1:
        try:
            pat = re.compile(opts.extract_group_1, re.VERBOSE)
        except re.error as e:
            raise RuntimeError(
                'Invalid regex %r: %s' % (opts.extract_group_1, e))
        if pat.groups < 1:
            raise RuntimeError(
                'Regex %r has no group to extract' % opts.extract_group_1)

    # Paths in the TSV cells are relative to the TSV file itself.
    base_dir = os.path.dirname(tsv_path)

    with open(tsv_path) as f:
        for i, line in enumerate(f):
            # Strip only the line terminator.  A bare rstrip() would also eat
            # trailing tabs and silently drop empty cells at the end of a row.
            row = line.rstrip('\r\n').split('\t')

            if i == 0:  # header row
                try:
                    path_col_index = row.index(opts.path_column)
                except ValueError:
                    raise RuntimeError(
                        'Expected %r in header %r' % (opts.path_column, row))
                _PrintNewRow(row, path_col_index, opts.new_column)
                continue  # skip to first data row

            try:
                rel_path = row[path_col_index]
            except IndexError:
                raise RuntimeError(
                    'Line %d has no %r column: %r'
                    % (i + 1, opts.path_column, row))

            cell_path = os.path.join(base_dir, rel_path)
            with open(cell_path) as f2:
                contents = f2.read()

            if pat is not None:
                m = pat.search(contents)
                # group(1) is None when the group did not take part in the
                # match; treat that the same as no match at all.
                val = m.group(1) if m else None
                if val is None:
                    raise RuntimeError(
                        "Couldn't find %r in %r"
                        % (opts.extract_group_1, contents))

                if opts.remove_commas:  # annoying hack for cachegrind output
                    val = val.replace(',', '')
            else:
                val = contents  # just use the whole file

            # A tab or newline inside a cell would corrupt the output TSV.
            if '\t' in val or '\n' in val:
                raise RuntimeError(
                    "Found tab or newline in TSV cell %r" % val)

            _PrintNewRow(row, path_col_index, val)

    return 0
|
113 |
|
114 |
|
if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the exit status, and
    # a RuntimeError is reported on stderr with exit status 1.
    try:
        status = main(sys.argv)
    except RuntimeError as err:
        print('%s FATAL: %s' % (sys.argv[0], err), file=sys.stderr)
        status = 1
    sys.exit(status)
|