1 | #!/usr/bin/env python2
|
2 | """
|
3 | string_ops_test.py: Tests for string_ops.py
|
4 | """
|
5 | from __future__ import print_function
|
6 |
|
7 | import unittest
|
8 |
|
9 | from core import error
|
10 | from osh import string_ops # module under test
|
11 |
|
12 |
|
class LibStrTest(unittest.TestCase):
    """Unit tests for the UTF-8, whitespace and pattern helpers in string_ops."""

    # NOTE: the test methods are described with comments rather than
    # docstrings, because unittest prints the first line of a test's docstring
    # in place of the test name in verbose output.

    def test_NextUtf8Char(self):
        # Each case is (expected, input_str).  `expected` lists the index
        # returned by each successive NextUtf8Char() call, starting at 0.
        # When a call raises error.Strict, its message is the last element.
        CASES = [
            ([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([
                1, 3,
                'UTF-8 decode: Bad encoding at offset 3 in string of 6 bytes'
            ], '\x24\xC2\xA2\xE0\xE0\xA4'),
            ([
                1, 3, 6,
                'UTF-8 decode: Bad encoding at offset 6 in string of 7 bytes'
            ], '\x24\xC2\xA2\xE0\xA4\xA4\xB9'),
            ([
                1, 3,
                'UTF-8 decode: Bad encoding at offset 3 in string of 4 bytes'
            ], '\x24\xC2\xA2\xFF'),
            ([
                1,
                'UTF-8 decode: Truncated bytes at offset 1 in string of 4 bytes'
            ], '\x24\xF0\x90\x8D'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('NextUtf8Char case %r %r' % (expected_indexes, input_str))
            i = 0
            actual_indexes = []
            while True:
                try:
                    i = string_ops.NextUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    # Stop once the walk has reached the end of the string.
                    if i >= len(input_str):
                        break
                except error.Strict as e:
                    # Record the message so it is compared like an index.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    def test_DecodeNextUtf8Char(self):
        # Walk forward over characters encoded in 1, 2, 3 and 4 bytes and
        # check the code point decoded at each character start.
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        start = 0
        for expected in codepoints:
            end = string_ops.NextUtf8Char(s, start)
            # The decoded value needs its own name: storing it in the loop
            # variable would make the assertion compare a value with itself.
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            start = end
        # The expected list must account for every byte of the string.
        self.assertEqual(len(s), start)

    def test_DecodePrevUtf8Char(self):
        # Same string as the forward test, walked backward from the end.
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        end = len(s)
        for expected in reversed(codepoints):
            start = string_ops.PreviousUtf8Char(s, end)
            # Separate names for expected and decoded values; see the
            # why-comment in test_DecodeNextUtf8Char's loop.
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            end = start
        # The backward walk must finish at the first byte.
        self.assertEqual(0, end)

    def test_DecodeUtf8CharError(self):
        # Each case is (expected error message, invalid input).
        CASES = [
            ('UTF-8 decode: Truncated bytes at offset 0 in string of 1 bytes',
             '\xC0'),
            ('UTF-8 decode: Bad encoding at offset 0 in string of 2 bytes',
             '\xC0\x01'),
            ('UTF-8 decode: Bad encoding at offset 0 in string of 1 bytes',
             '\xff'),
        ]
        # `input_str` rather than `input`, which would shadow the builtin.
        for msg, input_str in CASES:
            with self.assertRaises(error.Expr) as ctx:
                string_ops.DecodeUtf8Char(input_str, 0)
            # Checked after the `with` block: a statement placed after the
            # raising call inside the block would never execute.
            self.assertEqual(ctx.exception.msg, msg)

    def test_PreviousUtf8Char(self):
        # The error messages could probably be improved for more consistency
        # with NextUtf8Char, at the expense of more complexity.
        #
        # Each case is (expected, input_str).  `expected` lists the index
        # returned by each successive PreviousUtf8Char() call, starting from
        # len(input_str).  When a call raises error.Strict, its message is the
        # last element.
        CASES = [
            ([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([6, 3, 1, 'Invalid start of UTF-8 sequence'],
             '\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([10, 'Invalid start of UTF-8 sequence'],
             '\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'),
            ([3, 'Invalid start of UTF-8 sequence'], '\xF0\x90\x8D\x24'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('PreviousUtf8Char case %r %r' %
                  (expected_indexes, input_str))
            i = len(input_str)
            actual_indexes = []
            while True:
                try:
                    i = string_ops.PreviousUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    # Stop once the walk has reached the start of the string.
                    if i == 0:
                        break
                except error.Strict as e:
                    # Record the message so it is compared like an index.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    # The UTF-8 encoding of all the characters from string_ops.SPACES.
    # See comments there about why that set of characters was chosen.
    #
    # Generated by evaluating this Python3 fragment:
    #
    # ```
    # print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8'))
    # ```
    ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf'

    def test_StartsWithWhitespaceByteRange(self):
        # Each case is (expected (start, end) result, input_str).
        CASES = [
            ((0, 0), ''),
            ((0, 0), 'x'),
            ((0, 1), ' x'),
            ((0, 1), ' x '),
            ((0, 2), '\t x '),
            # 6 one-byte characters, one 2-byte and one 3-byte: 11 bytes.
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]
        for expected, input_str in CASES:
            print()
            print('StartsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(
                expected, string_ops.StartsWithWhitespaceByteRange(input_str))

    def test_EndsWithWhitespaceByteRange(self):
        # Each case is (expected (start, end) result, input_str).
        CASES = [
            ((0, 0), ''),
            ((1, 1), 'x'),
            ((2, 2), ' x'),
            ((2, 3), ' x '),
            ((2, 4), ' x \t'),
            # 6 one-byte characters, one 2-byte and one 3-byte: 11 bytes.
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]

        for expected, input_str in CASES:
            print()
            print('EndsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(expected,
                             string_ops.EndsWithWhitespaceByteRange(input_str))

    def testUnarySuffixOpDemo(self):
        # Demo only: prints the slices visited by four loop orders and makes
        # no assertions.
        print(string_ops)

        s = 'abcd'
        n = len(s)

        # All of these loops test exactly 4.
        # NOTE: These are manually copied into DoUnarySuffixOp

        # xrange is the Python 2 builtin; this file targets python2 (shebang).
        print('## shortest prefix')
        for i in xrange(1, n + 1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        print('# longest prefix')
        for i in xrange(n, 0, -1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        print('% shortest suffix')
        for i in xrange(n - 1, -1, -1):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

        print('%% longest suffix')
        for i in xrange(0, n):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

    def testPatSubAllMatches(self):
        s = 'oXooXoooX'

        # Match positions
        self.assertEqual([(1, 3), (4, 6)],
                         string_ops._AllMatchPositions(s, '(X.)'))

        # No match
        self.assertEqual([], string_ops._AllMatchPositions(s, '(z)'))

        # Replacement
        self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_'))

        # Replacement with no match
        self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_'))
|
201 |
|
# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|