| 1 | #!/usr/bin/env python2
 | 
| 2 | """
 | 
| 3 | tsv_column_from_files.py
 | 
| 4 | 
 | 
| 5 | - Read the TSV file named by the FILE argument (not stdin), and apply a transformation to it.
 | 
| 6 | - A transformation consists of
 | 
| 7 |   - a source column name, which contains file system paths
 | 
| 8 |   - a base directory for those paths (there is no --base-dir flag yet; the directory containing the TSV file is used)
 | 
| 9 |   - a new column name
 | 
| 10 |   - a regex to extract from the files
 | 
| 11 | """
 | 
| 12 | 
 | 
| 13 | from __future__ import print_function
 | 
| 14 | 
 | 
| 15 | import csv
 | 
| 16 | import optparse
 | 
| 17 | import os
 | 
| 18 | import re
 | 
| 19 | import sys
 | 
| 20 | 
 | 
| 21 | 
 | 
def _PrintNewRow(row, path_col_index, new_val):
  """Write one TSV row to stdout, substituting a single cell.

  Args:
    row: List of cells for the row.
    path_col_index: Index of the cell to substitute.  An index that matches no
      cell leaves the row unchanged.
    new_val: Value printed in place of the cell at path_col_index.
  """
  cells = [
      new_val if col == path_col_index else cell
      for col, cell in enumerate(row)
  ]
  # print() joins the cells with tabs and appends the newline.
  print(*cells, sep='\t')
 | 
| 33 | 
 | 
| 34 | 
 | 
def Options():
  """Build the command line parser for tsv_column_from_files.py.

  Returns:
    An optparse.OptionParser that accepts --path-column, --new-column,
    --extract-group-1 and --remove-commas.  The TSV file is a positional
    argument.
  """
  p = optparse.OptionParser('tsv_column_from_files.py FLAG* FILE')
  p.add_option(
      '--path-column', dest='path_column',
      help='The name of the column that has file system paths')
  p.add_option(
      '--new-column', dest='new_column',
      help='The name of the new column to create')
  p.add_option(
      '--extract-group-1', dest='extract_group_1',
      help="Search the file contents for this Python regex.  Then extract the first group")
  # The help text used to name a flag that doesn't exist (--extract-group1).
  p.add_option(
      '--remove-commas', dest='remove_commas', action='store_true',
      help='Remove commas from the value after --extract-group-1')
  return p
 | 
| 51 | 
 | 
| 52 | 
 | 
def main(argv):
  """Replace the path column of a TSV file with values taken from files.

  The first line of the TSV file is a header.  The column named by
  --path-column is renamed to --new-column, and in each following row the path
  in that column is replaced by a value read from the file it names.  The
  result is printed to stdout.

  Args:
    argv: Full argument list, including the program name at argv[0].

  Returns:
    0 on success, or 2 if the TSV file argument is missing.

  Raises:
    RuntimeError: If a required flag is missing, the header lacks the path
      column, a row has too few cells, the regex doesn't match a file, or the
      new value contains a tab or newline.
  """
  p = Options()
  (opts, argv) = p.parse_args(argv[1:])

  # Example:
  #
  #   tsv_column_from_files.py \
  #     --path-column     cachegrind_out_path \
  #     --new-column      I_refs \
  #     --extract-group-1 'I[ ]*refs:[ ]+([\d,]+)' \
  #     foo.tsv
  #
  # NOTE: QTT can allow commas like 1,000,000.  Like 1_000_000

  try:
    tsv_path = argv[0]
  except IndexError:
    p.print_usage()
    return 2

  # Without this check, a missing --new-column printed the header "None".
  if opts.path_column is None or opts.new_column is None:
    raise RuntimeError('--path-column and --new-column are required')

  # Compile once rather than once per row.  re.VERBOSE ignores unescaped
  # whitespace in the pattern, which is why the example spells a space as [ ].
  pat = None
  if opts.extract_group_1:
    pat = re.compile(opts.extract_group_1, re.VERBOSE)

  # Paths in the cells are joined onto the directory of the TSV file.
  base_dir = os.path.dirname(tsv_path)

  path_col_index = -1

  with open(tsv_path) as f:
    for i, line in enumerate(f):
      # Strip only the line terminator.  A bare rstrip() also eats trailing
      # tabs, which silently drops empty cells at the end of a row.
      row = line.rstrip('\r\n').split('\t')

      if i == 0:
        try:
          path_col_index = row.index(opts.path_column)
        except ValueError:
          raise RuntimeError('Expected %r in header %r' % (opts.path_column, row))
        _PrintNewRow(row, path_col_index, opts.new_column)
        continue  # skip to first row

      # Report short rows through the same error path as other bad input.
      if path_col_index >= len(row):
        raise RuntimeError(
            'Line %d: expected at least %d columns, got %r' %
            (i + 1, path_col_index + 1, row))

      cell_path = os.path.join(base_dir, row[path_col_index])
      with open(cell_path) as f2:
        contents = f2.read()

      if pat is not None:
        m = pat.search(contents)
        if not m:
          raise RuntimeError("Couldn't find %r in %r" % (opts.extract_group_1, contents))
        val = m.group(1)

        if opts.remove_commas:  # annoying hack for cachegrind output
          val = val.replace(',', '')
      else:
        val = contents  # just use the whole file

      # A tab or newline in the value would corrupt the TSV output.
      if '\t' in val or '\n' in val:
        raise RuntimeError("Found tab or newline in TSV cell %r" % val)

      _PrintNewRow(row, path_col_index, val)

  return 0
 | 
| 113 | 
 | 
| 114 | 
 | 
if __name__ == '__main__':
  # Script entry point: a RuntimeError from main() is reported on stderr and
  # becomes exit status 1; otherwise main()'s return value is the status.
  try:
    status = main(sys.argv)
  except RuntimeError as e:
    print('%s FATAL: %s' % (sys.argv[0], e), file=sys.stderr)
    status = 1
  sys.exit(status)
 |