osh/string_ops

OILS / osh / string_ops_test.py View on Github | oilshell.org

203 lines, 152 significant

1	#!/usr/bin/env python2
2	"""
3	string_ops_test.py: Tests for string_ops.py
4	"""
5	from __future__ import print_function
6
7	import unittest
8
9	from core import error
10	from osh import string_ops # module under test
11
12
13	class LibStrTest(unittest.TestCase):
14
15	def test_NextUtf8Char(self):
16	CASES = [
17	([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
18	([
19	1, 3,
20	'UTF-8 decode: Bad encoding at offset 3 in string of 6 bytes'
21	], '\x24\xC2\xA2\xE0\xE0\xA4'),
22	([
23	1, 3, 6,
24	'UTF-8 decode: Bad encoding at offset 6 in string of 7 bytes'
25	], '\x24\xC2\xA2\xE0\xA4\xA4\xB9'),
26	([
27	1, 3,
28	'UTF-8 decode: Bad encoding at offset 3 in string of 4 bytes'
29	], '\x24\xC2\xA2\xFF'),
30	([
31	1,
32	'UTF-8 decode: Truncated bytes at offset 1 in string of 4 bytes'
33	], '\x24\xF0\x90\x8D'),
34	]
35	for expected_indexes, input_str in CASES:
36	print()
37	print('NextUtf8Char case %r %r' % (expected_indexes, input_str))
38	i = 0
39	actual_indexes = []
40	while True:
41	try:
42	i = string_ops.NextUtf8Char(input_str, i)
43	actual_indexes.append(i)
44	if i >= len(input_str):
45	break
46	except error.Strict as e:
47	actual_indexes.append(e.msg)
48	break
49	self.assertEqual(expected_indexes, actual_indexes)
50
51	def test_DecodeNextUtf8Char(self):
52	s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
53	codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
54	start = 0
55	for codepoint in codepoints:
56	end = string_ops.NextUtf8Char(s, start)
57	codepoint = string_ops.DecodeUtf8Char(s, start)
58	self.assertEqual(codepoint, codepoint)
59	start = end
60
61	def test_DecodePrevUtf8Char(self):
62	s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
63	codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
64	end = len(s)
65	for codepoint in reversed(codepoints):
66	start = string_ops.PreviousUtf8Char(s, end)
67	codepoint = string_ops.DecodeUtf8Char(s, start)
68	self.assertEqual(codepoint, codepoint)
69	end = start
70
71	def test_DecodeUtf8CharError(self):
72	CASES = [
73	('UTF-8 decode: Truncated bytes at offset 0 in string of 1 bytes',
74	'\xC0'),
75	('UTF-8 decode: Bad encoding at offset 0 in string of 2 bytes',
76	'\xC0\x01'),
77	('UTF-8 decode: Bad encoding at offset 0 in string of 1 bytes',
78	'\xff'),
79	]
80	for msg, input in CASES:
81	with self.assertRaises(error.Expr) as ctx:
82	string_ops.DecodeUtf8Char(input, 0)
83	self.assertEqual(ctx.exception.msg, msg)
84
85	def test_PreviousUtf8Char(self):
86	# The error messages could probably be improved for more consistency
87	# with NextUtf8Char, at the expense of more complexity.
88	CASES = [
89	([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
90	([6, 3, 1, 'Invalid start of UTF-8 sequence'],
91	'\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
92	([10, 'Invalid start of UTF-8 sequence'],
93	'\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'),
94	([3, 'Invalid start of UTF-8 sequence'], '\xF0\x90\x8D\x24'),
95	]
96	for expected_indexes, input_str in CASES:
97	print()
98	print('PreviousUtf8Char case %r %r' %
99	(expected_indexes, input_str))
100	i = len(input_str)
101	actual_indexes = []
102	while True:
103	try:
104	i = string_ops.PreviousUtf8Char(input_str, i)
105	actual_indexes.append(i)
106	if i == 0:
107	break
108	except error.Strict as e:
109	actual_indexes.append(e.msg)
110	break
111	self.assertEqual(expected_indexes, actual_indexes)
112
113	# The UTF-8 encoding of all the characters from string_ops.SPACES.
114	# See comments there about why that set of characters was chosen.
115	#
116	# Generated by evaluating this Python3 fragment:
117	#
118	# ```
119	# print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8'))
120	# ```
121	ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf'
122
123	def test_StartsWithWhitespaceByteRange(self):
124	CASES = [
125	((0, 0), ''),
126	((0, 0), 'x'),
127	((0, 1), ' x'),
128	((0, 1), ' x '),
129	((0, 2), '\t x '),
130	((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
131	]
132	for expected, input_str in CASES:
133	print()
134	print('StartsWithWhitespaceByteRange case %r %r' %
135	(expected, input_str))
136	self.assertEqual(
137	expected, string_ops.StartsWithWhitespaceByteRange(input_str))
138
139	def test_EndsWithWhitespaceByteRange(self):
140	CASES = [
141	((0, 0), ''),
142	((1, 1), 'x'),
143	((2, 2), ' x'),
144	((2, 3), ' x '),
145	((2, 4), ' x \t'),
146	((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
147	]
148
149	for expected, input_str in CASES:
150	print()
151	print('EndsWithWhitespaceByteRange case %r %r' %
152	(expected, input_str))
153	self.assertEqual(expected,
154	string_ops.EndsWithWhitespaceByteRange(input_str))
155
156	def testUnarySuffixOpDemo(self):
157	print(string_ops)
158
159	s = 'abcd'
160	n = len(s)
161
162	# All of these loops test exactly 4.
163	# NOTE: These are manually copied into DoUnarySuffixOp
164
165	print('## shortest prefix')
166	for i in xrange(1, n + 1):
167	print('%d test %06r return %06r' % (i, s[:i], s[i:]))
168	print()
169
170	print('# longest prefix')
171	for i in xrange(n, 0, -1):
172	print('%d test %06r return %06r' % (i, s[:i], s[i:]))
173	print()
174
175	print('% shortest suffix')
176	for i in xrange(n - 1, -1, -1):
177	print('%d test %06r return %06r' % (i, s[i:], s[:i]))
178	print()
179
180	print('%% longest suffix')
181	for i in xrange(0, n):
182	print('%d test %06r return %06r' % (i, s[i:], s[:i]))
183	print()
184
185	def testPatSubAllMatches(self):
186	s = 'oXooXoooX'
187
188	# Match positions
189	self.assertEqual([(1, 3), (4, 6)],
190	string_ops._AllMatchPositions(s, '(X.)'))
191
192	# No match
193	self.assertEqual([], string_ops._AllMatchPositions(s, '(z)'))
194
195	# Replacement
196	self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_'))
197
198	# Replacement with no match
199	self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_'))
200
201
if __name__ == '__main__':
    # Executed as a script: hand control to the unittest runner.
    unittest.main()