| 1 | #include "data_lang/utf8.h"
 | 
| 2 | 
 | 
| 3 | #include <inttypes.h>
 | 
| 4 | 
 | 
| 5 | //#include "mycpp/runtime.h"
 | 
| 6 | #include "vendor/greatest.h"
 | 
| 7 | 
 | 
// Encodes a single code point as UTF-8.
//
// Copied from utf8proc:
// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
//
// The encoded bytes are stored in dst (no NUL terminator is appended), and
// the number of bytes stored is returned: 1 to 4.  For uc >= 0x110000
// nothing is written and 0 is returned.
//
// Note: 0xd800-0xdfff (the surrogates) are encoded like any other 3-byte
// value so as not to change the API, although they are invalid in UTF-8.
int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
  // Out of range: reject before touching dst.
  if (uc >= 0x110000) {
    return 0;
  }

  // ASCII is stored as-is.
  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }

  // Every multi-byte form ends with the low six payload bits.
  const uint8_t last = (uint8_t)(0x80 | (uc & 0x3F));

  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = last;
    return 2;
  }

  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = last;
    return 3;
  }

  dst[0] = (uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  dst[3] = last;
  return 4;
}
 | 
| 34 | 
 | 
| 35 | TEST identity_test() {
 | 
| 36 |   // check that decode(encode(x)) = x for all code-points (and surrogates)
 | 
| 37 |   uint8_t buf[5] = {0};
 | 
| 38 |   for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
 | 
| 39 |     int len = utf8proc_encode_char(cp, buf);
 | 
| 40 |     Utf8Result result;
 | 
| 41 |     utf8_decode(buf, &result);
 | 
| 42 | 
 | 
| 43 |     if (cp < 0xD800 || cp > 0xDFFF) {
 | 
| 44 |       ASSERT_EQ(result.error, UTF8_OK);
 | 
| 45 |     } else {
 | 
| 46 |       ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
 | 
| 47 |     }
 | 
| 48 |     ASSERT_EQ(result.codepoint, cp);
 | 
| 49 |     ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
 | 
| 50 |   }
 | 
| 51 | 
 | 
| 52 |   PASS();
 | 
| 53 | }
 | 
| 54 | 
 | 
| 55 | TEST overlong_test() {
 | 
| 56 |   // All encode U+41 ('A')
 | 
| 57 |   Utf8Result ok, overlong2, overlong3, overlong4;
 | 
| 58 |   utf8_decode((unsigned char*)"\x41", &ok);
 | 
| 59 |   utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
 | 
| 60 |   utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
 | 
| 61 |   utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
 | 
| 62 | 
 | 
| 63 |   ASSERT_EQ(ok.error, UTF8_OK);
 | 
| 64 |   ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
 | 
| 65 |   ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
 | 
| 66 |   ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
 | 
| 67 | 
 | 
| 68 |   ASSERT_EQ(ok.codepoint, 0x41);
 | 
| 69 |   ASSERT_EQ(overlong2.codepoint, 0x41);
 | 
| 70 |   ASSERT_EQ(overlong3.codepoint, 0x41);
 | 
| 71 |   ASSERT_EQ(overlong4.codepoint, 0x41);
 | 
| 72 | 
 | 
| 73 |   ASSERT_EQ(ok.bytes_read, 1);
 | 
| 74 |   ASSERT_EQ(overlong2.bytes_read, 2);
 | 
| 75 |   ASSERT_EQ(overlong3.bytes_read, 3);
 | 
| 76 |   ASSERT_EQ(overlong4.bytes_read, 4);
 | 
| 77 | 
 | 
| 78 |   PASS();
 | 
| 79 | }
 | 
| 80 | 
 | 
| 81 | TEST too_large_test() {
 | 
| 82 |   // Encoding of 0x111111 (via Table 3-6)
 | 
| 83 |   //  = 00010001 00010001 00010001
 | 
| 84 |   //   uuuuu -> 10001
 | 
| 85 |   //    zzzz -> 00001
 | 
| 86 |   //  yyyyyy -> 000100
 | 
| 87 |   //  xxxxxx -> 010001
 | 
| 88 |   //
 | 
| 89 |   //  -> 11110100 10010001 10000100 10010001
 | 
| 90 |   //   = F4 91 84 91
 | 
| 91 |   Utf8Result result;
 | 
| 92 |   utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
 | 
| 93 | 
 | 
| 94 |   ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
 | 
| 95 |   ASSERT_EQ(result.codepoint, 0x111111);
 | 
| 96 |   ASSERT_EQ(result.bytes_read, 4);
 | 
| 97 | 
 | 
| 98 |   PASS();
 | 
| 99 | }
 | 
| 100 | 
 | 
| 101 | TEST truncated_test() {
 | 
| 102 |   Utf8Result result;
 | 
| 103 | 
 | 
| 104 |   constexpr const int NUM_INPUTS = 6;
 | 
| 105 |   const char* inputs[NUM_INPUTS] = {
 | 
| 106 |       "\xC5", "\xED", "\xED\x9F", "\xF4", "\xF4\x80", "\xF4\x80\x80",
 | 
| 107 |   };
 | 
| 108 | 
 | 
| 109 |   for (int i = 0; i < NUM_INPUTS; i++) {
 | 
| 110 |     utf8_decode((unsigned char*)inputs[i], &result);
 | 
| 111 |     ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
 | 
| 112 |     ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
 | 
| 113 |   }
 | 
| 114 | 
 | 
| 115 |   PASS();
 | 
| 116 | }
 | 
| 117 | 
 | 
| 118 | GREATEST_MAIN_DEFS();
 | 
| 119 | 
 | 
| 120 | int main(int argc, char** argv) {
 | 
| 121 |   GREATEST_MAIN_BEGIN();
 | 
| 122 | 
 | 
| 123 |   RUN_TEST(identity_test);
 | 
| 124 |   RUN_TEST(overlong_test);
 | 
| 125 |   RUN_TEST(too_large_test);
 | 
| 126 |   RUN_TEST(truncated_test);
 | 
| 127 | 
 | 
| 128 |   GREATEST_MAIN_END();
 | 
| 129 |   return 0;
 | 
| 130 | }
 |