| 1 | #include "data_lang/utf8.h"
|
| 2 |
|
| 3 | #include <inttypes.h>
|
| 4 |
|
| 5 | //#include "mycpp/runtime.h"
|
| 6 | #include "vendor/greatest.h"
|
| 7 |
|
// Reference UTF-8 encoder used by the tests below.
//
// Copied from utf8proc:
// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
//
// Writes the UTF-8 encoding of `uc` to `dst` and returns the number of bytes
// written (1 to 4).  Returns 0 and leaves `dst` untouched when `uc` is
// 0x110000 or larger.  `dst` must have room for at least 4 bytes.
//
// Note: 0xD800-0xDFFF (surrogates) are encoded like any other 3-byte value so
// as not to change the API, even though they are actually invalid in UTF-8.
int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
  int num_bytes;
  uint8_t lead_marker;

  if (uc < 0x80) {
    // ASCII is stored as-is.
    dst[0] = (uint8_t)uc;
    return 1;
  } else if (uc < 0x800) {
    num_bytes = 2;
    lead_marker = 0xC0;
  } else if (uc < 0x10000) {
    num_bytes = 3;
    lead_marker = 0xE0;
  } else if (uc < 0x110000) {
    num_bytes = 4;
    lead_marker = 0xF0;
  } else {
    return 0;
  }

  // Fill continuation bytes from last to first; each takes the low 6 bits.
  for (int pos = num_bytes - 1; pos > 0; --pos) {
    dst[pos] = (uint8_t)(0x80 | (uc & 0x3F));
    uc >>= 6;
  }

  // Whatever bits remain fit below the lead byte's length marker.
  dst[0] = (uint8_t)(lead_marker | uc);
  return num_bytes;
}
|
| 34 |
|
| 35 | TEST identity_test() {
|
| 36 | // check that decode(encode(x)) = x for all code-points (and surrogates)
|
| 37 | uint8_t buf[5] = {0};
|
| 38 | for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
|
| 39 | int len = utf8proc_encode_char(cp, buf);
|
| 40 | Utf8Result result;
|
| 41 | utf8_decode(buf, &result);
|
| 42 |
|
| 43 | if (cp < 0xD800 || cp > 0xDFFF) {
|
| 44 | ASSERT_EQ(result.error, UTF8_OK);
|
| 45 | } else {
|
| 46 | ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
|
| 47 | }
|
| 48 | ASSERT_EQ(result.codepoint, cp);
|
| 49 | ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
|
| 50 | }
|
| 51 |
|
| 52 | PASS();
|
| 53 | }
|
| 54 |
|
| 55 | TEST overlong_test() {
|
| 56 | // All encode U+41 ('A')
|
| 57 | Utf8Result ok, overlong2, overlong3, overlong4;
|
| 58 | utf8_decode((unsigned char*)"\x41", &ok);
|
| 59 | utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
|
| 60 | utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
|
| 61 | utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
|
| 62 |
|
| 63 | ASSERT_EQ(ok.error, UTF8_OK);
|
| 64 | ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
|
| 65 | ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
|
| 66 | ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
|
| 67 |
|
| 68 | ASSERT_EQ(ok.codepoint, 0x41);
|
| 69 | ASSERT_EQ(overlong2.codepoint, 0x41);
|
| 70 | ASSERT_EQ(overlong3.codepoint, 0x41);
|
| 71 | ASSERT_EQ(overlong4.codepoint, 0x41);
|
| 72 |
|
| 73 | ASSERT_EQ(ok.bytes_read, 1);
|
| 74 | ASSERT_EQ(overlong2.bytes_read, 2);
|
| 75 | ASSERT_EQ(overlong3.bytes_read, 3);
|
| 76 | ASSERT_EQ(overlong4.bytes_read, 4);
|
| 77 |
|
| 78 | PASS();
|
| 79 | }
|
| 80 |
|
| 81 | TEST too_large_test() {
|
| 82 | // Encoding of 0x111111 (via Table 3-6)
|
| 83 | // = 00010001 00010001 00010001
|
| 84 | // uuuuu -> 10001
|
| 85 | // zzzz -> 00001
|
| 86 | // yyyyyy -> 000100
|
| 87 | // xxxxxx -> 010001
|
| 88 | //
|
| 89 | // -> 11110100 10010001 10000100 10010001
|
| 90 | // = F4 91 84 91
|
| 91 | Utf8Result result;
|
| 92 | utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
|
| 93 |
|
| 94 | ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
|
| 95 | ASSERT_EQ(result.codepoint, 0x111111);
|
| 96 | ASSERT_EQ(result.bytes_read, 4);
|
| 97 |
|
| 98 | PASS();
|
| 99 | }
|
| 100 |
|
| 101 | TEST truncated_test() {
|
| 102 | Utf8Result result;
|
| 103 |
|
| 104 | constexpr const int NUM_INPUTS = 6;
|
| 105 | const char* inputs[NUM_INPUTS] = {
|
| 106 | "\xC5", "\xED", "\xED\x9F", "\xF4", "\xF4\x80", "\xF4\x80\x80",
|
| 107 | };
|
| 108 |
|
| 109 | for (int i = 0; i < NUM_INPUTS; i++) {
|
| 110 | utf8_decode((unsigned char*)inputs[i], &result);
|
| 111 | ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
|
| 112 | ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
|
| 113 | }
|
| 114 |
|
| 115 | PASS();
|
| 116 | }
|
| 117 |
|
// greatest framework macro, invoked once at file scope before main().
// NOTE(review): presumably emits the framework's global definitions --
// confirm against vendor/greatest.h.
GREATEST_MAIN_DEFS();

// Test-runner entry point: runs the four UTF-8 decoder tests in order between
// the greatest BEGIN/END macros.
int main(int argc, char** argv) {
  // argc/argv are not referenced directly here.
  // NOTE(review): presumably consumed by this macro for command-line
  // options -- confirm.
  GREATEST_MAIN_BEGIN();

  RUN_TEST(identity_test);
  RUN_TEST(overlong_test);
  RUN_TEST(too_large_test);
  RUN_TEST(truncated_test);

  // NOTE(review): this macro presumably reports results and may itself
  // return from main, which would make the return below unreachable --
  // confirm.
  GREATEST_MAIN_END();
  return 0;
}
|