benchmarks/awk-python.sh

OILS / benchmarks / awk-python.sh View on Github | oilshell.org

97 lines, 28 significant

1	#!/usr/bin/env bash
2	#
3	# Test awk vs Python speed.
4	#
# On this hash table benchmark, Python is maybe 10% slower than gawk. mawk is
# twice as fast as gawk (and bwk).
7	#
8	# Python has much more functionality, so it's not exactly a fair comparison,
9	# but it's instructive.
10	#
11	# Update: simply adding tolower() makes gawk much slower than Python (555 ms
12	# vs. 280 ms), and mawk is still much faster at 138 ms.
13	#
14	# Mawk is known to be fast? Faster than Java on this benchmark.
15	# https://brenocon.com/blog/2009/09/dont-mawk-awk-the-fastest-and-most-elegant-big-data-munging-language/
16	#
17	# Usage:
18	# ./awk-python.sh <function name>
19
# Strict mode: abort on use of an unset variable, on a failure in any stage of
# a pipeline, and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# Input corpus shared by every benchmark below: shell and Python sources in the
# parent directory and in up to two levels of subdirectories beneath it.  The
# patterns are relative, so they are resolved against the current working
# directory.  nullglob is not enabled, so a pattern that matches nothing is
# kept as a literal word.
readonly FILES=(../*.sh ../*/*.sh ../*.py ../*/*.py ../*/*/*.py)
25
# Test out hash table implementations
# mawk is faster: 77ms vs 155ms for 10 iterations.
#
# For each awk implementation (gawk, mawk, and a bwk build at ~/git/bwk/bwk),
# prints a banner naming it and then times 10 runs of an awk program that
# lower-cases every input line and counts total and unique lines.
#
# Globals: FILES (read) - input files passed to each awk invocation
# Outputs: banners and per-run counts on stdout; the `time` report on stderr
test-awk() {
  # Keep the loop variables out of the global scope.
  local awk_bin i

  for awk_bin in gawk mawk ~/git/bwk/bwk; do
    echo ---
    printf '%s\n' "$awk_bin"
    echo ---
    time for i in {1..10}; do
      # Quoted so a path containing whitespace is still run as one command.
      "$awk_bin" '
      {
        line = tolower($0)
        num_lines += 1

        # NOTE: gawk has length(); mawk does not
        if (!(line in unique)) {
          num_unique += 1
        }
        unique[line] += 1
      }
      END {
        print "unique lines: " num_unique
        print "total lines: " num_lines
      }
      ' "${FILES[@]}"

    done
  done
}
54
# Python VM is slower: 160-170 ms. Oops.
#
# Well Python has more general dictionaries -- they take more than strings.
#
# Times 10 runs of a Python program that lower-cases every input line and
# counts total and unique lines, mirroring the awk program in test-awk.
#
# Globals: FILES (read) - input files passed as sys.argv[1:]
# Outputs: per-run counts on stdout; the `time` report on stderr
test-python() {
  # Keep the loop variable out of the global scope.
  local i

  time for i in {1..10}; do
    # The program text starts at column 0 because Python is
    # indentation-sensitive.  print() with a single %-formatted string gives
    # the same output under Python 2 and Python 3; the two spaces after the
    # colon match what the Python 2 print statement
    # (print "unique lines: ", n) emitted.
    python -S -c '
import collections
import sys

num_lines = 0
num_unique = 0
unique = collections.defaultdict(int)

for path in sys.argv[1:]:
  with open(path) as f:
    for line in f:
      line = line.lower()
      num_lines += 1

      if line not in unique:
        num_unique += 1
      unique[line] += 1

print("unique lines:  %d" % num_unique)
print("total lines:  %d" % num_lines)
' "${FILES[@]}"

  done
}
84
# Only 10-30 ms. We are doing real work.
#
# Baseline for the benchmarks above: times 10 runs of concatenating every
# input file and counting the bytes.
#
# Globals: FILES (read) - input files to concatenate
# Outputs: one byte count per run on stdout; the `time` report on stderr
test-wc() {
  # Keep the loop variable out of the global scope.
  local i

  time for i in {1..10}; do
    # A real pipe: an escaped "\|" would hand "|", "wc" and "-c" to cat as
    # file names instead of piping into wc.
    cat "${FILES[@]}" | wc -c
  done
}
91
# Prints the benchmark input files on one line, separated by single spaces,
# followed by a line giving how many there are.
#
# Globals: FILES (read)
# Outputs: two lines on stdout
files() {
  # "${FILES[*]}" joins on the first character of IFS; pin it to a space.
  local IFS=' '

  # printf rather than echo, so an element such as "-n" or "-e" is printed
  # instead of being consumed as an echo option.
  printf '%s\n' "${FILES[*]}"
  printf '%s files\n' "${#FILES[@]}"
}
96
97	"$@"