| 1 | #!/usr/bin/env python2
|
"""
tsv_column_from_files.py

- Read the TSV file named on the command line (not stdin), and apply a
  transformation to it.
- A transformation consists of
  - a source column name, which contains file system paths
  - a base directory for those paths (currently always the directory that
    contains the TSV file; there is no --base-dir flag yet)
  - a new column name
  - a regex to extract from the files
"""
|
| 12 |
|
| 13 | from __future__ import print_function
|
| 14 |
|
| 15 | import csv
|
| 16 | import optparse
|
| 17 | import os
|
| 18 | import re
|
| 19 | import sys
|
| 20 |
|
| 21 |
|
def _PrintNewRow(row, path_col_index, new_val):
    """Write `row` to stdout as one tab-separated line, swapping out one cell.

    Every cell is emitted as-is except the one at position `path_col_index`,
    which is replaced by `new_val`.  The line is always terminated by a
    newline, even when `row` is empty.
    """
    out_cells = [
        new_val if pos == path_col_index else original
        for pos, original in enumerate(row)
    ]
    # print() stringifies each cell and joins them with tabs, exactly as
    # printing them one at a time with an explicit tab separator would.
    print(*out_cells, sep='\t')
|
| 33 |
|
| 34 |
|
def Options():
    """Returns an option parser instance.

    Flags defined:
      --path-column      name of the column that has file system paths
      --new-column       name of the new column to create
      --extract-group-1  Python regex to search each file's contents for; the
                         first group is extracted
      --remove-commas    boolean flag: remove commas from the extracted value

    String flags that are not passed parse to None.  --remove-commas parses to
    None when absent and True when present.
    """
    p = optparse.OptionParser('tsv_column_from_files.py FLAG* FILE')
    p.add_option(
        '--path-column', dest='path_column',
        help='The name of the column that has file system paths')
    p.add_option(
        '--new-column', dest='new_column',
        help='The name of the new column to create')
    p.add_option(
        '--extract-group-1', dest='extract_group_1',
        help="Search the file contents for this Python regex. Then extract the first group")
    p.add_option(
        '--remove-commas', dest='remove_commas', action='store_true',
        # The help text used to say --extract-group1, which is not a real flag.
        help='Remove commas from the value after --extract-group-1')
    return p
|
| 51 |
|
| 52 |
|
def main(argv):
    """Replace one TSV column with values taken from the files it names.

    The first positional argument is the path of a TSV file.  Its first line
    is the header; the column named by --path-column is located there, and
    the header is printed with that cell replaced by --new-column.  For every
    following row, the cell in that column is treated as a path relative to
    the TSV file's directory.  That file is read and the cell is replaced by
    either group 1 of --extract-group-1 (optionally with commas removed), or
    the whole file contents when no regex was given.  Rows go to stdout.

    Args:
      argv: full argument vector, including the program name at index 0.

    Returns:
      0 on success, or 2 (after printing usage) when no TSV path was given.

    Raises:
      RuntimeError: the header lacks the path column, a row is too short to
        contain that column, the regex does not match a file, or the
        resulting cell contains a tab or newline.
    """
    p = Options()
    (opts, argv) = p.parse_args(argv[1:])

    # Example invocation:
    #
    # tsv_column_from_files.py \
    #   --path-column cachegrind_out_path \
    #   --new-column I_refs \
    #   --extract-group-1 'I[ ]*refs:[ ]+([\d,]+)' \  # it extracts the first group
    #   foo.tsv
    #
    # NOTE: QTT can allow commas like 1,000,000.  Like 1_000_000

    try:
        tsv_path = argv[0]
    except IndexError:
        p.print_usage()
        return 2

    # Paths found in the TSV are resolved relative to the TSV's own directory.
    base_dir = os.path.dirname(tsv_path)

    # Compile the pattern once instead of once per data row.  re.VERBOSE
    # ignores unescaped whitespace in the pattern, which is why the example
    # above spells a literal space as [ ].
    pat = None
    if opts.extract_group_1:
        pat = re.compile(opts.extract_group_1, re.VERBOSE)

    path_col_index = -1

    with open(tsv_path) as f:
        for i, line in enumerate(f):
            # Strip only the line terminator.  A bare rstrip() would also eat
            # trailing tabs, silently dropping empty cells at the end of a row.
            row = line.rstrip('\r\n').split('\t')

            if i == 0:
                try:
                    path_col_index = row.index(opts.path_column)
                except ValueError:
                    raise RuntimeError(
                        'Expected %r in header %r' % (opts.path_column, row))
                _PrintNewRow(row, path_col_index, opts.new_column)
                continue  # skip to first row

            # Report a short row as an input error, like the other failures,
            # rather than letting an IndexError escape as a traceback.
            if path_col_index >= len(row):
                raise RuntimeError(
                    'Row %d has no cell at column index %d: %r' %
                    (i, path_col_index, row))
            rel_path = row[path_col_index]

            cell_path = os.path.join(base_dir, rel_path)
            with open(cell_path) as f2:
                contents = f2.read()

            if pat is not None:
                m = pat.search(contents)
                if not m:
                    raise RuntimeError(
                        "Couldn't find %r in %r" % (opts.extract_group_1, contents))
                val = m.group(1)

                if opts.remove_commas:  # annoying hack for cachegrind output
                    val = val.replace(',', '')
            else:
                val = contents  # just use the whole file

            # A tab or newline inside the value would corrupt the output TSV,
            # whether it came from the whole file or from a regex group.
            if '\t' in val or '\n' in val:
                raise RuntimeError("Found tab or newline in TSV cell %r" % val)

            _PrintNewRow(row, path_col_index, val)

    return 0
|
| 113 |
|
| 114 |
|
if __name__ == '__main__':
    # Script entry point: the process exits with main()'s return value.  A
    # RuntimeError is reported as a one-line FATAL message on stderr and
    # turns into exit status 1; any other exception propagates unchanged.
    try:
        exit_status = main(sys.argv)
    except RuntimeError as err:
        print('%s FATAL: %s' % (sys.argv[0], err), file=sys.stderr)
        exit_status = 1
    sys.exit(exit_status)
|