OILS / benchmarks / awk-python.sh View on Github | oilshell.org

97 lines, 28 significant
1#!/usr/bin/env bash
2#
3# Test awk vs Python speed.
4#
# On this hash table benchmark, Python is maybe 10% slower than gawk. mawk is
# twice as fast as gawk (and bwk).
7#
8# Python has much more functionality, so it's not exactly a fair comparison,
9# but it's instructive.
10#
11# Update: simply adding tolower() makes gawk much slower than Python (555 ms
12# vs. 280 ms), and mawk is still much faster at 138 ms.
13#
14# Mawk is known to be fast? Faster than Java on this benchmark.
15# https://brenocon.com/blog/2009/09/dont-mawk-awk-the-fastest-and-most-elegant-big-data-munging-language/
16#
17# Usage:
18# ./awk-python.sh <function name>
19
# Strict mode: fail on unset variables, on a failure anywhere in a pipeline,
# and on any unhandled non-zero exit status.
set -o errexit -o nounset -o pipefail

# Input files for the benchmarks: shell and Python sources matched by globs
# relative to the current directory, expanded once when the script starts.
readonly -a FILES=(
  ../*.sh
  ../*/*.sh
  ../*.py
  ../*/*.py
  ../*/*/*.py
)
25
# Test out hash table implementations.
# mawk is faster: 77ms vs 155ms for 10 iterations.
#
# For each awk implementation (gawk, mawk, and the bwk awk at ~/git/bwk/bwk),
# time 10 runs of a program that lowercases every line of FILES and counts the
# total and unique lines with an associative array.
#
# Globals:  FILES (read) - paths of the input files
# Outputs:  a banner per implementation and the counts of each run on stdout;
#           the `time` report and any "skipping" notices on stderr
test-awk() {
  # Locals, so that neither name leaks into the caller's scope.
  local awk_bin i

  for awk_bin in gawk mawk ~/git/bwk/bwk; do
    # Under errexit a missing interpreter would abort the whole run, so skip
    # implementations that are not installed instead.
    if ! command -v "$awk_bin" >/dev/null 2>&1; then
      echo "skipping $awk_bin: not found" >&2
      continue
    fi

    echo ---
    echo "$awk_bin"
    echo ---
    time for i in {1..10}; do
      "$awk_bin" '
      {
        line = tolower($0)
        num_lines += 1

        # NOTE: gawk has length(); mawk does not
        if (!(line in unique)) {
          num_unique += 1
        }
        unique[line] += 1
      }
      END {
        print "unique lines: " num_unique
        print "total lines: " num_lines
      }
      ' "${FILES[@]}"

    done
  done
}
54
# Python VM is slower: 160-170 ms. Oops.
#
# Well Python has more general dictionaries -- they take more than strings.
#
# Time 10 runs of the Python counterpart of the awk program: lowercase every
# line of FILES and count the total and unique lines with a defaultdict.
#
# Globals:  FILES (read) - paths of the input files
# Outputs:  the counts of each run on stdout; the `time` report on stderr
# Returns:  1 if no Python interpreter is found on PATH
test-python() {
  # Locals, so that neither name leaks into the caller's scope.
  local i py

  # Prefer `python`, as before; fall back to `python3` on systems that no
  # longer ship an unversioned interpreter.
  py=$(command -v python || command -v python3) || {
    echo 'test-python: no python interpreter found' >&2
    return 1
  }

  time for i in {1..10}; do
    "$py" -S -c '
import collections
import sys

num_lines = 0
num_unique = 0
unique = collections.defaultdict(int)

for path in sys.argv[1:]:
  with open(path) as f:
    for line in f:
      line = line.lower()
      num_lines += 1

      if line not in unique:
        num_unique += 1
      unique[line] += 1

# print with one parenthesized argument behaves the same under Python 2 and 3.
# The two spaces reproduce what the Python 2 print statement with a comma
# emitted.
print("unique lines:  %d" % num_unique)
print("total lines:  %d" % num_lines)
' "${FILES[@]}"

  done
}
84
# Only 10-30 ms. We are doing real work.
#
# Time 10 runs of concatenating FILES and counting the bytes, for comparison
# with the awk and Python timings.
#
# Globals:  FILES (read) - paths of the input files
# Outputs:  one byte count per run on stdout; the `time` report on stderr
test-wc() {
  # Local, so that the loop counter does not leak into the caller's scope.
  local i

  time for i in {1..10}; do
    # cat is deliberate: wc then sees a single stream and prints one total,
    # rather than a line per file.
    cat "${FILES[@]}" | wc -c
  done
}
91
# Show the benchmark inputs: all paths in FILES on one space-separated line,
# followed by a line giving how many there are.
#
# Globals:  FILES (read)
files() {
  echo "${FILES[@]}"

  local -r num_files=${#FILES[@]}
  echo "$num_files files"
}
96
97"$@"