| 1 | #!/usr/bin/env python2
 | 
| 2 | from __future__ import print_function
 | 
| 3 | """
 | 
| 4 | Usage:
 | 
| 5 |   csv2html.py foo.csv
 | 
| 6 | 
 | 
| 7 | Note: it's run with python2 AND python3
 | 
| 8 | 
 | 
| 9 | Attempts to read foo_schema.csv.  If not it assumes everything is a string.
 | 
| 10 | 
 | 
| 11 | Things it handles:
 | 
| 12 | 
 | 
| 13 | - table-sort.js integration <colgroup>
 | 
| 14 |   - <table id="foo"> for making columns sortable
 | 
| 15 |   - for choosing the comparator to use!
 | 
| 16 |   - for highlighting on sort
 | 
| 17 | - static / visual
 | 
| 18 |   - Aligning right for number, left for strings.
 | 
| 19 |   - highlighting NA numbers in red (only if it's considered a number)
 | 
| 20 |   - formatting numbers to a certain precision
 | 
| 21 |     - or displaying them as percentages
 | 
| 22 |   - changing CSV headers like 'elapsed_ms' to 'elapsed ms'
 | 
| 23 |   - Accepting a column with a '_HREF' suffix to make an HTML link
 | 
| 24 |     - We could have something like type:
 | 
| 25 |       string/anchor:shell-id
 | 
| 26 |       string/href:shell-id
 | 
| 27 |     - But the simple _HREF suffix is simpler.  Easier to write R code for.
 | 
| 28 | 
 | 
| 29 | Implementation notes:
 | 
| 30 | - To align right: need a class on every cell, e.g. "num".  Can't do it through
 | 
| 31 |   <colgroup>.
 | 
| 32 | - To color, can use <colgroup>.  table-sort.js needs this.
 | 
| 33 | 
 | 
| 34 | TODO:
 | 
| 35 |   Does it make sense to implement <rowspan> and <colspan> ?  It's nice for
 | 
| 36 |   visualization.
 | 
| 37 | """
 | 
| 38 | 
 | 
| 39 | try:
 | 
| 40 |   import html
 | 
| 41 | except ImportError:
 | 
| 42 |   import cgi as html
 | 
| 43 | import csv
 | 
| 44 | import optparse
 | 
| 45 | import os
 | 
| 46 | import re
 | 
| 47 | import sys
 | 
| 48 | 
 | 
| 49 | 
 | 
| 50 | def log(msg, *args):
 | 
| 51 |   if args:
 | 
| 52 |     msg = msg % args
 | 
| 53 |   print(msg, file=sys.stderr)
 | 
| 54 | 
 | 
| 55 | 
 | 
| 56 | class NullSchema:
 | 
| 57 |   def VerifyColumnNames(self, col_names):
 | 
| 58 |     pass
 | 
| 59 | 
 | 
| 60 |   def IsNumeric(self, col_name):
 | 
| 61 |     return False
 | 
| 62 | 
 | 
| 63 |   def ColumnIndexIsNumeric(self, index):
 | 
| 64 |     return False
 | 
| 65 | 
 | 
| 66 |   def ColumnIndexIsInteger(self, index):
 | 
| 67 |     return False
 | 
| 68 | 
 | 
| 69 |   def ColumnIndexHasHref(self, index):
 | 
| 70 |     return False
 | 
| 71 | 
 | 
| 72 |   def HasCssClassColumn(self):
 | 
| 73 |     return False
 | 
| 74 | 
 | 
| 75 | 
 | 
| 76 | INTEGER_TYPES = ('integer',)
 | 
| 77 | 
 | 
| 78 | # for sorting, right-justification
 | 
| 79 | # Note: added 'float' as alias for 'double' to be compatible with TSV8
 | 
| 80 | NUMERIC_TYPES = ('double', 'float', 'number') + INTEGER_TYPES
 | 
| 81 | 
 | 
| 82 | 
 | 
| 83 | class Schema:
 | 
| 84 |   def __init__(self, rows):
 | 
| 85 |     schema_col_names = rows[0]
 | 
| 86 |     assert 'column_name' in schema_col_names, schema_col_names
 | 
| 87 |     assert 'type' in schema_col_names, schema_col_names
 | 
| 88 | 
 | 
| 89 |     # Schema columns
 | 
| 90 |     s_cols = {}
 | 
| 91 |     s_cols['column_name'] = []
 | 
| 92 |     s_cols['type'] = []
 | 
| 93 |     s_cols['precision'] = []
 | 
| 94 |     s_cols['strftime'] = []
 | 
| 95 |     for row in rows[1:]:
 | 
| 96 |       for i, cell in enumerate(row):
 | 
| 97 |         name = schema_col_names[i]
 | 
| 98 |         s_cols[name].append(cell)
 | 
| 99 | 
 | 
| 100 |     self.type_lookup = dict(
 | 
| 101 |         (name, t) for (name, t) in
 | 
| 102 |         zip(s_cols['column_name'], s_cols['type']))
 | 
| 103 | 
 | 
| 104 |     # NOTE: it's OK if precision is missing.
 | 
| 105 |     self.precision_lookup = dict(
 | 
| 106 |         (name, p) for (name, p) in
 | 
| 107 |         zip(s_cols['column_name'], s_cols['precision']))
 | 
| 108 | 
 | 
| 109 |     self.strftime_lookup = dict(
 | 
| 110 |         (name, p) for (name, p) in
 | 
| 111 |         zip(s_cols['column_name'], s_cols['strftime']))
 | 
| 112 | 
 | 
| 113 |     #log('SCHEMA %s', schema_col_names)
 | 
| 114 |     #log('type_lookup %s', self.type_lookup)
 | 
| 115 |     #log('precision_lookup %s', self.precision_lookup)
 | 
| 116 | 
 | 
| 117 |     self.col_names = None
 | 
| 118 |     self.col_has_href = None
 | 
| 119 | 
 | 
| 120 |   def VerifyColumnNames(self, col_names):
 | 
| 121 |     """Assert that the column names we got are all in the schema."""
 | 
| 122 |     if 0:
 | 
| 123 |       for name in col_names:
 | 
| 124 |         log('%s : %s', name, self.type_lookup[name])
 | 
| 125 | 
 | 
| 126 |     n = len(col_names)
 | 
| 127 |     self.col_has_href = [False] * n
 | 
| 128 |     for i in range(n-1):
 | 
| 129 |       this_name, next_name= col_names[i], col_names[i+1]
 | 
| 130 |       if this_name + '_HREF' == next_name:
 | 
| 131 |         self.col_has_href[i] = True
 | 
| 132 | 
 | 
| 133 |     #log('href: %s', self.col_has_href)
 | 
| 134 |     self.col_names = col_names
 | 
| 135 | 
 | 
| 136 |   def IsNumeric(self, col_name):
 | 
| 137 |     return self.type_lookup[col_name] in NUMERIC_TYPES
 | 
| 138 | 
 | 
| 139 |   def ColumnIndexIsNumeric(self, index):
 | 
| 140 |     col_name = self.col_names[index]
 | 
| 141 |     return self.IsNumeric(col_name)
 | 
| 142 | 
 | 
| 143 |   def ColumnIndexIsInteger(self, index):
 | 
| 144 |     col_name = self.col_names[index]
 | 
| 145 |     return self.type_lookup[col_name] in INTEGER_TYPES
 | 
| 146 | 
 | 
| 147 |   def ColumnIndexHasHref(self, index):
 | 
| 148 |     """
 | 
| 149 |     Is the next one?
 | 
| 150 |     """
 | 
| 151 |     return self.col_has_href[index]
 | 
| 152 | 
 | 
| 153 |   def ColumnPrecision(self, index):
 | 
| 154 |     col_name = self.col_names[index]
 | 
| 155 |     return self.precision_lookup.get(col_name, 1)  # default is arbitrary
 | 
| 156 | 
 | 
| 157 |   def HasStrfTime(self, col_name):
 | 
| 158 |     # An explicit - means "no entry"
 | 
| 159 |     return self.strftime_lookup.get(col_name, '-') != '-'
 | 
| 160 | 
 | 
| 161 |   def ColumnStrftime(self, index):
 | 
| 162 |     col_name = self.col_names[index]
 | 
| 163 |     return self.strftime_lookup.get(col_name, '-')
 | 
| 164 | 
 | 
| 165 |   def HasCssClassColumn(self):
 | 
| 166 |     # It has to be the first column
 | 
| 167 |     return self.col_names[0] == 'ROW_CSS_CLASS'
 | 
| 168 | 
 | 
| 169 | 
 | 
| 170 | def PrintRow(row, schema, css_class_pattern):
 | 
| 171 |   """Print a CSV row as HTML, using the given formatting.
 | 
| 172 | 
 | 
| 173 |   Returns:
 | 
| 174 |     An array of booleans indicating whether each cell is a number.
 | 
| 175 |   """
 | 
| 176 |   # TODO: cache this computation
 | 
| 177 |   if css_class_pattern:
 | 
| 178 |     row_class_pat, r = css_class_pattern.split(None, 2)
 | 
| 179 |     cell_regex = re.compile(r)
 | 
| 180 |   else:
 | 
| 181 |     row_class_pat = None
 | 
| 182 |     cell_regex = None
 | 
| 183 | 
 | 
| 184 |   i = 0
 | 
| 185 |   n = len(row)
 | 
| 186 | 
 | 
| 187 |   row_classes = []
 | 
| 188 | 
 | 
| 189 |   if schema.HasCssClassColumn():
 | 
| 190 |     i += 1  # Don't print this row
 | 
| 191 |     # It's a CSS class
 | 
| 192 |     row_classes.append(row[0])
 | 
| 193 | 
 | 
| 194 |   if cell_regex:
 | 
| 195 |     for cell in row:
 | 
| 196 |       if cell_regex.search(cell):
 | 
| 197 |         row_classes.append(row_class_pat)
 | 
| 198 |         break
 | 
| 199 | 
 | 
| 200 |   h = ' class="%s"' % ' '.join(row_classes) if row_classes else ''
 | 
| 201 |   print('    <tr%s>' % h)
 | 
| 202 | 
 | 
| 203 |   while True:
 | 
| 204 |     if i == n:
 | 
| 205 |       break
 | 
| 206 | 
 | 
| 207 |     cell = row[i]
 | 
| 208 |     css_classes = []
 | 
| 209 |     cell_str = cell  # by default, we don't touch it
 | 
| 210 | 
 | 
| 211 |     if schema.ColumnIndexIsInteger(i):
 | 
| 212 |       css_classes.append('num')  # right justify
 | 
| 213 | 
 | 
| 214 |       try:
 | 
| 215 |         cell_int = int(cell)
 | 
| 216 |       except ValueError:
 | 
| 217 |         pass  # NA?
 | 
| 218 |       else:
 | 
| 219 |         # commas AND floating point
 | 
| 220 |         cell_str = '{:,}'.format(cell_int)
 | 
| 221 | 
 | 
| 222 |     # Look up by index now?
 | 
| 223 |     elif schema.ColumnIndexIsNumeric(i):
 | 
| 224 |       css_classes.append('num')  # right justify
 | 
| 225 | 
 | 
| 226 |       try:
 | 
| 227 |         cell_float = float(cell)
 | 
| 228 |       except ValueError:
 | 
| 229 |         pass  # NA
 | 
| 230 |       else:
 | 
| 231 |         # Floats can also be timestamps
 | 
| 232 |         fmt = schema.ColumnStrftime(i)
 | 
| 233 |         if fmt not in ('-', ''):
 | 
| 234 |             from datetime import datetime
 | 
| 235 |             t = datetime.fromtimestamp(cell_float)
 | 
| 236 |             if fmt == 'iso':
 | 
| 237 |                 cell_str = t.isoformat()
 | 
| 238 |             else:
 | 
| 239 |                 cell_str = t.strftime(fmt)
 | 
| 240 |         else:
 | 
| 241 |             # commas AND floating point to a given precision
 | 
| 242 |             # default precision is 1
 | 
| 243 |             precision = schema.ColumnPrecision(i)
 | 
| 244 |             cell_str = '{0:,.{precision}f}'.format(cell_float, precision=precision)
 | 
| 245 | 
 | 
| 246 |       # Percentage
 | 
| 247 |       #cell_str = '{:.1f}%'.format(cell_float * 100)
 | 
| 248 | 
 | 
| 249 |     # Special CSS class for R NA values.
 | 
| 250 |     if cell.strip() == 'NA':
 | 
| 251 |       css_classes.append('na')  # make it red
 | 
| 252 | 
 | 
| 253 |     if css_classes:
 | 
| 254 |       print('      <td class="{}">'.format(' '.join(css_classes)), end=' ')
 | 
| 255 |     else:
 | 
| 256 |       print('      <td>', end=' ')
 | 
| 257 | 
 | 
| 258 |     s = html.escape(cell_str)
 | 
| 259 |     # If it's an _HREF, advance to the next column, and mutate 's'.
 | 
| 260 |     if schema.ColumnIndexHasHref(i):
 | 
| 261 |       i += 1
 | 
| 262 |       href = row[i]
 | 
| 263 |       if href:
 | 
| 264 |         s = '<a href="%s">%s</a>' % (html.escape(href), html.escape(cell_str))
 | 
| 265 | 
 | 
| 266 |     print(s, end=' ')
 | 
| 267 |     print('</td>')
 | 
| 268 | 
 | 
| 269 |     i += 1
 | 
| 270 | 
 | 
| 271 |   print('    </tr>')
 | 
| 272 | 
 | 
| 273 | 
 | 
| 274 | def PrintColGroup(col_names, schema):
 | 
| 275 |   """Print HTML colgroup element, used for JavaScript sorting."""
 | 
| 276 |   print('  <colgroup>')
 | 
| 277 |   for i, col in enumerate(col_names):
 | 
| 278 |     if i == 0 and schema.HasCssClassColumn():
 | 
| 279 |       continue
 | 
| 280 |     if col.endswith('_HREF'):
 | 
| 281 |       continue
 | 
| 282 | 
 | 
| 283 |     # CSS class is used for sorting
 | 
| 284 |     if schema.IsNumeric(col) and not schema.HasStrfTime(col):
 | 
| 285 |       css_class = 'number'
 | 
| 286 |     else:
 | 
| 287 |       css_class = 'case-insensitive'
 | 
| 288 | 
 | 
| 289 |     # NOTE: id is a comment only; not used
 | 
| 290 |     print('    <col id="{}" type="{}" />'.format(col, css_class))
 | 
| 291 |   print('  </colgroup>')
 | 
| 292 | 
 | 
| 293 | 
 | 
| 294 | def PrintTable(css_id, schema, col_names, rows, opts):
 | 
| 295 |   print('<table id="%s">' % css_id)
 | 
| 296 |   print('  <thead>')
 | 
| 297 |   print('    <tr>')
 | 
| 298 |   for i, col in enumerate(col_names):
 | 
| 299 |     if i == 0 and schema.HasCssClassColumn():
 | 
| 300 |       continue
 | 
| 301 |     if col.endswith('_HREF'):
 | 
| 302 |       continue
 | 
| 303 | 
 | 
| 304 |     heading_str = html.escape(col.replace('_', ' '))
 | 
| 305 |     if schema.ColumnIndexIsNumeric(i):
 | 
| 306 |       print('    <td class="num">%s</td>' % heading_str)
 | 
| 307 |     else:
 | 
| 308 |       print('    <td>%s</td>' % heading_str)
 | 
| 309 |   print('    </tr>')
 | 
| 310 | 
 | 
| 311 |   for i in range(opts.thead_offset):
 | 
| 312 |     PrintRow(rows[i], schema, opts.css_class_pattern)
 | 
| 313 | 
 | 
| 314 |   print('  </thead>')
 | 
| 315 | 
 | 
| 316 |   print('  <tbody>')
 | 
| 317 |   for row in rows[opts.thead_offset:]:
 | 
| 318 |     PrintRow(row, schema, opts.css_class_pattern)
 | 
| 319 |   print('  </tbody>')
 | 
| 320 | 
 | 
| 321 |   PrintColGroup(col_names, schema)
 | 
| 322 | 
 | 
| 323 |   print('</table>')
 | 
| 324 | 
 | 
| 325 | 
 | 
| 326 | def ReadFile(f, tsv=False):
 | 
| 327 |   """Read the CSV file, returning the column names and rows."""
 | 
| 328 | 
 | 
| 329 |   if tsv:
 | 
| 330 |     c = csv.reader(f, delimiter='\t', doublequote=False,
 | 
| 331 |                    quoting=csv.QUOTE_NONE)
 | 
| 332 |   else:
 | 
| 333 |     c = csv.reader(f)
 | 
| 334 | 
 | 
| 335 |   # The first row of the CSV is assumed to be a header.  The rest are data.
 | 
| 336 |   col_names = []
 | 
| 337 |   rows = []
 | 
| 338 |   for i, row in enumerate(c):
 | 
| 339 |     if i == 0:
 | 
| 340 |       col_names = row
 | 
| 341 |       continue
 | 
| 342 |     rows.append(row)
 | 
| 343 |   return col_names, rows
 | 
| 344 | 
 | 
| 345 | 
 | 
| 346 | def CreateOptionsParser():
 | 
| 347 |   p = optparse.OptionParser()
 | 
| 348 | 
 | 
| 349 |   # We are taking a path, and not using stdin, because we read it twice.
 | 
| 350 |   p.add_option(
 | 
| 351 |       '--schema', dest='schema', metavar="PATH", type='str',
 | 
| 352 |       help='Path to the schema.')
 | 
| 353 |   p.add_option(
 | 
| 354 |       '--tsv', dest='tsv', default=False, action='store_true',
 | 
| 355 |       help='Read input in TSV format')
 | 
| 356 |   p.add_option(
 | 
| 357 |       '--css-class-pattern', dest='css_class_pattern', type='str',
 | 
| 358 |       help='A string of the form CSS_CLASS:PATTERN.  If the cell contents '
 | 
| 359 |            'matches the pattern, then apply the given CSS class. '
 | 
| 360 |            'Example: osh:^osh')
 | 
| 361 |   # TODO: Might want --tfoot-offset from the bottom too?  Default 0
 | 
| 362 |   p.add_option(
 | 
| 363 |       '--thead-offset', dest='thead_offset', default=0, type='int',
 | 
| 364 |       help='Put more rows in the data in the thead section')
 | 
| 365 |   return p
 | 
| 366 | 
 | 
| 367 | 
 | 
| 368 | def main(argv):
 | 
| 369 |   (opts, argv) = CreateOptionsParser().parse_args(argv[1:])
 | 
| 370 | 
 | 
| 371 |   try:
 | 
| 372 |     csv_path = argv[0]
 | 
| 373 |   except IndexError:
 | 
| 374 |     raise RuntimeError('Expected CSV filename.')
 | 
| 375 | 
 | 
| 376 |   schema = None
 | 
| 377 |   if opts.schema:
 | 
| 378 |     try:
 | 
| 379 |       schema_f = open(opts.schema)
 | 
| 380 |     except IOError as e:
 | 
| 381 |       raise RuntimeError('Error opening schema: %s' %  e)
 | 
| 382 |   else:
 | 
| 383 |     if csv_path.endswith('.csv'):
 | 
| 384 |       schema_path = csv_path.replace('.csv', '.schema.csv')
 | 
| 385 |     elif csv_path.endswith('.tsv'):
 | 
| 386 |       schema_path = csv_path.replace('.tsv', '.schema.tsv')
 | 
| 387 |     else:
 | 
| 388 |       raise AssertionError(csv_path)
 | 
| 389 | 
 | 
| 390 |     #log('schema path %s', schema_path)
 | 
| 391 |     try:
 | 
| 392 |       schema_f = open(schema_path)
 | 
| 393 |     except IOError:
 | 
| 394 |       schema_f = None  # allowed to have no schema
 | 
| 395 | 
 | 
| 396 |   if schema_f:
 | 
| 397 |     if opts.tsv:
 | 
| 398 |       r = csv.reader(schema_f, delimiter='\t', doublequote=False,
 | 
| 399 |                      quoting=csv.QUOTE_NONE)
 | 
| 400 |     else:
 | 
| 401 |       r = csv.reader(schema_f)
 | 
| 402 | 
 | 
| 403 |     schema = Schema(list(r))
 | 
| 404 |   else:
 | 
| 405 |     schema = NullSchema()
 | 
| 406 |     # Default string schema
 | 
| 407 | 
 | 
| 408 |   #log('schema %s', schema)
 | 
| 409 | 
 | 
| 410 |   with open(csv_path) as f:
 | 
| 411 |     col_names, rows = ReadFile(f, opts.tsv)
 | 
| 412 | 
 | 
| 413 |   schema.VerifyColumnNames(col_names)
 | 
| 414 | 
 | 
| 415 |   filename = os.path.basename(csv_path)
 | 
| 416 |   css_id, _ = os.path.splitext(filename)
 | 
| 417 |   PrintTable(css_id, schema, col_names, rows, opts)
 | 
| 418 | 
 | 
| 419 | 
 | 
| 420 | if __name__ == '__main__':
 | 
| 421 |   try:
 | 
| 422 |     main(sys.argv)
 | 
| 423 |   except RuntimeError as e:
 | 
| 424 |     print('FATAL: %s' % e, file=sys.stderr)
 | 
| 425 |     sys.exit(1)
 |