| 1 | #!/usr/bin/env python2 | 
| 2 | """ | 
| 3 | string_ops_test.py: Tests for string_ops.py | 
| 4 | """ | 
| 5 | from __future__ import print_function | 
| 6 |  | 
| 7 | import unittest | 
| 8 |  | 
| 9 | from core import error | 
| 10 | from osh import string_ops  # module under test | 
| 11 |  | 
| 12 |  | 
| 13 | class LibStrTest(unittest.TestCase): | 
| 14 |  | 
| 15 | def test_NextUtf8Char(self): | 
| 16 | CASES = [ | 
| 17 | ([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'), | 
| 18 | ([ | 
| 19 | 1, 3, | 
| 20 | 'UTF-8 decode: Bad encoding at offset 3 in string of 6 bytes' | 
| 21 | ], '\x24\xC2\xA2\xE0\xE0\xA4'), | 
| 22 | ([ | 
| 23 | 1, 3, 6, | 
| 24 | 'UTF-8 decode: Bad encoding at offset 6 in string of 7 bytes' | 
| 25 | ], '\x24\xC2\xA2\xE0\xA4\xA4\xB9'), | 
| 26 | ([ | 
| 27 | 1, 3, | 
| 28 | 'UTF-8 decode: Bad encoding at offset 3 in string of 4 bytes' | 
| 29 | ], '\x24\xC2\xA2\xFF'), | 
| 30 | ([ | 
| 31 | 1, | 
| 32 | 'UTF-8 decode: Truncated bytes at offset 1 in string of 4 bytes' | 
| 33 | ], '\x24\xF0\x90\x8D'), | 
| 34 | ] | 
| 35 | for expected_indexes, input_str in CASES: | 
| 36 | print() | 
| 37 | print('NextUtf8Char case %r %r' % (expected_indexes, input_str)) | 
| 38 | i = 0 | 
| 39 | actual_indexes = [] | 
| 40 | while True: | 
| 41 | try: | 
| 42 | i = string_ops.NextUtf8Char(input_str, i) | 
| 43 | actual_indexes.append(i) | 
| 44 | if i >= len(input_str): | 
| 45 | break | 
| 46 | except error.Strict as e: | 
| 47 | actual_indexes.append(e.msg) | 
| 48 | break | 
| 49 | self.assertEqual(expected_indexes, actual_indexes) | 
| 50 |  | 
| 51 | def test_DecodeNextUtf8Char(self): | 
| 52 | s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80' | 
| 53 | codepoints = [0x61, 0xCA, 0x10A0, 0x13000] | 
| 54 | start = 0 | 
| 55 | for codepoint in codepoints: | 
| 56 | end = string_ops.NextUtf8Char(s, start) | 
| 57 | codepoint = string_ops.DecodeUtf8Char(s, start) | 
| 58 | self.assertEqual(codepoint, codepoint) | 
| 59 | start = end | 
| 60 |  | 
| 61 | def test_DecodePrevUtf8Char(self): | 
| 62 | s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80' | 
| 63 | codepoints = [0x61, 0xCA, 0x10A0, 0x13000] | 
| 64 | end = len(s) | 
| 65 | for codepoint in reversed(codepoints): | 
| 66 | start = string_ops.PreviousUtf8Char(s, end) | 
| 67 | codepoint = string_ops.DecodeUtf8Char(s, start) | 
| 68 | self.assertEqual(codepoint, codepoint) | 
| 69 | end = start | 
| 70 |  | 
| 71 | def test_DecodeUtf8CharError(self): | 
| 72 | CASES = [ | 
| 73 | ('UTF-8 decode: Truncated bytes at offset 0 in string of 1 bytes', | 
| 74 | '\xC0'), | 
| 75 | ('UTF-8 decode: Bad encoding at offset 0 in string of 2 bytes', | 
| 76 | '\xC0\x01'), | 
| 77 | ('UTF-8 decode: Bad encoding at offset 0 in string of 1 bytes', | 
| 78 | '\xff'), | 
| 79 | ] | 
| 80 | for msg, input in CASES: | 
| 81 | with self.assertRaises(error.Expr) as ctx: | 
| 82 | string_ops.DecodeUtf8Char(input, 0) | 
| 83 | self.assertEqual(ctx.exception.msg, msg) | 
| 84 |  | 
| 85 | def test_PreviousUtf8Char(self): | 
| 86 | # The error messages could probably be improved for more consistency | 
| 87 | # with NextUtf8Char, at the expense of more complexity. | 
| 88 | CASES = [ | 
| 89 | ([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'), | 
| 90 | ([6, 3, 1, 'Invalid start of UTF-8 sequence'], | 
| 91 | '\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'), | 
| 92 | ([10, 'Invalid start of UTF-8 sequence'], | 
| 93 | '\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'), | 
| 94 | ([3, 'Invalid start of UTF-8 sequence'], '\xF0\x90\x8D\x24'), | 
| 95 | ] | 
| 96 | for expected_indexes, input_str in CASES: | 
| 97 | print() | 
| 98 | print('PreviousUtf8Char case %r %r' % | 
| 99 | (expected_indexes, input_str)) | 
| 100 | i = len(input_str) | 
| 101 | actual_indexes = [] | 
| 102 | while True: | 
| 103 | try: | 
| 104 | i = string_ops.PreviousUtf8Char(input_str, i) | 
| 105 | actual_indexes.append(i) | 
| 106 | if i == 0: | 
| 107 | break | 
| 108 | except error.Strict as e: | 
| 109 | actual_indexes.append(e.msg) | 
| 110 | break | 
| 111 | self.assertEqual(expected_indexes, actual_indexes) | 
| 112 |  | 
| 113 | # The UTF-8 encoding of all the characters from string_ops.SPACES. | 
| 114 | # See comments there about why that set of characters was chosen. | 
| 115 | # | 
| 116 | # Generated by evaluating this Python3 fragment: | 
| 117 | # | 
| 118 | # ``` | 
| 119 | # print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8')) | 
| 120 | # ``` | 
| 121 | ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf' | 
| 122 |  | 
| 123 | def test_StartsWithWhitespaceByteRange(self): | 
| 124 | CASES = [ | 
| 125 | ((0, 0), ''), | 
| 126 | ((0, 0), 'x'), | 
| 127 | ((0, 1), ' x'), | 
| 128 | ((0, 1), ' x '), | 
| 129 | ((0, 2), '\t x '), | 
| 130 | ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8), | 
| 131 | ] | 
| 132 | for expected, input_str in CASES: | 
| 133 | print() | 
| 134 | print('StartsWithWhitespaceByteRange case %r %r' % | 
| 135 | (expected, input_str)) | 
| 136 | self.assertEqual( | 
| 137 | expected, string_ops.StartsWithWhitespaceByteRange(input_str)) | 
| 138 |  | 
| 139 | def test_EndsWithWhitespaceByteRange(self): | 
| 140 | CASES = [ | 
| 141 | ((0, 0), ''), | 
| 142 | ((1, 1), 'x'), | 
| 143 | ((2, 2), ' x'), | 
| 144 | ((2, 3), ' x '), | 
| 145 | ((2, 4), ' x \t'), | 
| 146 | ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8), | 
| 147 | ] | 
| 148 |  | 
| 149 | for expected, input_str in CASES: | 
| 150 | print() | 
| 151 | print('EndsWithWhitespaceByteRange case %r %r' % | 
| 152 | (expected, input_str)) | 
| 153 | self.assertEqual(expected, | 
| 154 | string_ops.EndsWithWhitespaceByteRange(input_str)) | 
| 155 |  | 
| 156 | def testUnarySuffixOpDemo(self): | 
| 157 | print(string_ops) | 
| 158 |  | 
| 159 | s = 'abcd' | 
| 160 | n = len(s) | 
| 161 |  | 
| 162 | # All of these loops test exactly 4. | 
| 163 | # NOTE: These are manually copied into DoUnarySuffixOp | 
| 164 |  | 
| 165 | print('## shortest prefix') | 
| 166 | for i in xrange(1, n + 1): | 
| 167 | print('%d test %06r return %06r' % (i, s[:i], s[i:])) | 
| 168 | print() | 
| 169 |  | 
| 170 | print('# longest prefix') | 
| 171 | for i in xrange(n, 0, -1): | 
| 172 | print('%d test %06r return %06r' % (i, s[:i], s[i:])) | 
| 173 | print() | 
| 174 |  | 
| 175 | print('% shortest suffix') | 
| 176 | for i in xrange(n - 1, -1, -1): | 
| 177 | print('%d test %06r return %06r' % (i, s[i:], s[:i])) | 
| 178 | print() | 
| 179 |  | 
| 180 | print('%% longest suffix') | 
| 181 | for i in xrange(0, n): | 
| 182 | print('%d test %06r return %06r' % (i, s[i:], s[:i])) | 
| 183 | print() | 
| 184 |  | 
| 185 | def testPatSubAllMatches(self): | 
| 186 | s = 'oXooXoooX' | 
| 187 |  | 
| 188 | # Match positions | 
| 189 | self.assertEqual([(1, 3), (4, 6)], | 
| 190 | string_ops._AllMatchPositions(s, '(X.)')) | 
| 191 |  | 
| 192 | # No match | 
| 193 | self.assertEqual([], string_ops._AllMatchPositions(s, '(z)')) | 
| 194 |  | 
| 195 | # Replacement | 
| 196 | self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_')) | 
| 197 |  | 
| 198 | # Replacement with no match | 
| 199 | self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_')) | 
| 200 |  | 
| 201 |  | 
# Run every test case in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()