OILS / osh / string_ops_test.py View on Github | oilshell.org

203 lines, 152 significant
1#!/usr/bin/env python2
2"""
3string_ops_test.py: Tests for string_ops.py
4"""
5from __future__ import print_function
6
7import unittest
8
9from core import error
10from osh import string_ops # module under test
11
12
class LibStrTest(unittest.TestCase):
    """Unit tests for string_ops: UTF-8 stepping and decoding, whitespace
    byte ranges, and pattern substitution helpers."""

    def test_NextUtf8Char(self):
        """Step forward through each input with NextUtf8Char.

        Each case pairs the expected sequence of returned indexes with an
        input string.  When error.Strict is raised, its message is recorded
        as the final element of the sequence.
        """
        CASES = [
            ([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([
                1, 3,
                'UTF-8 decode: Bad encoding at offset 3 in string of 6 bytes'
            ], '\x24\xC2\xA2\xE0\xE0\xA4'),
            ([
                1, 3, 6,
                'UTF-8 decode: Bad encoding at offset 6 in string of 7 bytes'
            ], '\x24\xC2\xA2\xE0\xA4\xA4\xB9'),
            ([
                1, 3,
                'UTF-8 decode: Bad encoding at offset 3 in string of 4 bytes'
            ], '\x24\xC2\xA2\xFF'),
            ([
                1,
                'UTF-8 decode: Truncated bytes at offset 1 in string of 4 bytes'
            ], '\x24\xF0\x90\x8D'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('NextUtf8Char case %r %r' % (expected_indexes, input_str))
            i = 0
            actual_indexes = []
            while True:
                try:
                    i = string_ops.NextUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    if i >= len(input_str):
                        break
                except error.Strict as e:
                    # Record the message in place of an index, then stop.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    def test_DecodeNextUtf8Char(self):
        """Decode each character while walking the string front to back."""
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        start = 0
        for expected in codepoints:
            end = string_ops.NextUtf8Char(s, start)
            # Keep the decoded value in its own name: overwriting the loop
            # variable would turn the assertion below into a tautology.
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            start = end

    def test_DecodePrevUtf8Char(self):
        """Decode each character while walking the string back to front."""
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        end = len(s)
        for expected in reversed(codepoints):
            start = string_ops.PreviousUtf8Char(s, end)
            # Keep the decoded value in its own name: overwriting the loop
            # variable would turn the assertion below into a tautology.
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            end = start

    def test_DecodeUtf8CharError(self):
        """DecodeUtf8Char raises error.Expr with the expected message."""
        CASES = [
            ('UTF-8 decode: Truncated bytes at offset 0 in string of 1 bytes',
             '\xC0'),
            ('UTF-8 decode: Bad encoding at offset 0 in string of 2 bytes',
             '\xC0\x01'),
            ('UTF-8 decode: Bad encoding at offset 0 in string of 1 bytes',
             '\xff'),
        ]
        for msg, input_str in CASES:
            with self.assertRaises(error.Expr) as ctx:
                string_ops.DecodeUtf8Char(input_str, 0)

            # Checked after the 'with' block: a statement placed after the
            # raising call inside the block would never execute.
            self.assertEqual(ctx.exception.msg, msg)

    def test_PreviousUtf8Char(self):
        """Step backward through each input with PreviousUtf8Char.

        Each case pairs the expected sequence of returned indexes with an
        input string.  When error.Strict is raised, its message is recorded
        as the final element of the sequence.
        """
        # The error messages could probably be improved for more consistency
        # with NextUtf8Char, at the expense of more complexity.
        CASES = [
            ([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([6, 3, 1, 'Invalid start of UTF-8 sequence'],
             '\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([10, 'Invalid start of UTF-8 sequence'],
             '\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'),
            ([3, 'Invalid start of UTF-8 sequence'], '\xF0\x90\x8D\x24'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('PreviousUtf8Char case %r %r' %
                  (expected_indexes, input_str))
            i = len(input_str)
            actual_indexes = []
            while True:
                try:
                    i = string_ops.PreviousUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    if i == 0:
                        break
                except error.Strict as e:
                    # Record the message in place of an index, then stop.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    # The UTF-8 encoding of all the characters from string_ops.SPACES.
    # See comments there about why that set of characters was chosen.
    #
    # Generated by evaluating this Python3 fragment:
    #
    # ```
    # print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8'))
    # ```
    ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf'

    def test_StartsWithWhitespaceByteRange(self):
        """Check the (start, end) pair returned for leading whitespace."""
        CASES = [
            ((0, 0), ''),
            ((0, 0), 'x'),
            ((0, 1), ' x'),
            ((0, 1), ' x '),
            ((0, 2), '\t x '),
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]
        for expected, input_str in CASES:
            print()
            print('StartsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(
                expected, string_ops.StartsWithWhitespaceByteRange(input_str))

    def test_EndsWithWhitespaceByteRange(self):
        """Check the (start, end) pair returned for trailing whitespace."""
        CASES = [
            ((0, 0), ''),
            ((1, 1), 'x'),
            ((2, 2), ' x'),
            ((2, 3), ' x '),
            ((2, 4), ' x \t'),
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]

        for expected, input_str in CASES:
            print()
            print('EndsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(expected,
                             string_ops.EndsWithWhitespaceByteRange(input_str))

    def testUnarySuffixOpDemo(self):
        """Print the prefix/suffix splits visited by four loop orders.

        This is a demo that only prints; it makes no assertions.
        """
        print(string_ops)

        s = 'abcd'
        n = len(s)

        # All of these loops test exactly 4.
        # NOTE: These are manually copied into DoUnarySuffixOp

        print('## shortest prefix')
        for i in xrange(1, n + 1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        print('# longest prefix')
        for i in xrange(n, 0, -1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        print('% shortest suffix')
        for i in xrange(n - 1, -1, -1):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

        print('%% longest suffix')
        for i in xrange(0, n):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

    def testPatSubAllMatches(self):
        """Exercise _AllMatchPositions and _PatSubAll with and without matches."""
        s = 'oXooXoooX'

        # Match positions
        self.assertEqual([(1, 3), (4, 6)],
                         string_ops._AllMatchPositions(s, '(X.)'))

        # No match
        self.assertEqual([], string_ops._AllMatchPositions(s, '(z)'))

        # Replacement
        self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_'))

        # Replacement with no match
        self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_'))
201
if __name__ == '__main__':
    # Executed as a script: discover and run every TestCase in this module.
    unittest.main()