| 1 | #ifndef DATA_LANG_UTF8_H
 | 
| 2 | #define DATA_LANG_UTF8_H
 | 
| 3 | 
 | 
| 4 | #include <stddef.h>  // size_t
 | 
| 5 | #include <stdint.h>  // uint32_t
 | 
| 6 | #include <stdio.h>
 | 
| 7 | 
 | 
| 8 | /**
 | 
| 9 |  *              ---- Quick reference about the encoding ----
 | 
| 10 |  *
 | 
| 11 |  * First, all valid UTF-8 sequences follow one of a few bit "patterns" (Table 3-6). The
 | 
| 12 |  * first byte determines the length of the sequence and then the next 0-3 bytes
 | 
| 13 |  * are "continuation bytes."
 | 
| 14 |  *
 | 
| 15 |  * +----------------------------+----------+----------+----------+----------+
 | 
| 16 |  * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 | 
| 17 |  * +----------------------------+----------+----------+----------+----------+
 | 
| 18 |  * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
 | 
| 19 |  * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
 | 
| 20 |  * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
 | 
| 21 |  * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
 | 
| 22 |  * +----------------------------+----------+----------+----------+----------+
 | 
| 23 |  *
 | 
| 24 |  *      Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
 | 
| 25 |  *
 | 
| 26 |  * There are 3 further restrictions which make some valid bit patterns
 | 
| 27 |  * *invalid*:
 | 
| 28 |  *  1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second
 | 
| 29 |  *     sequence is longer and thus an error.
 | 
| 30 |  *  2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
 | 
| 31 |  *     surrogate. It is an error to encode surrogates in UTF-8.
 | 
| 32 |  *  3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
 | 
| 33 |  *     and must be rejected as an error.
 | 
| 34 |  *
 | 
| 35 |  * See https://aolsen.ca/writings/everything-about-utf8 for more details about
 | 
| 36 |  * the encoding.
 | 
| 37 |  */
 | 
| 38 | 
 | 
// Status codes reported for a single decode attempt. UTF8_OK is zero, so any
// non-zero value means the decode failed.
enum Utf8Error {
  UTF8_OK = 0,

  // The codepoint was encoded in more bytes than necessary.
  UTF8_ERR_OVERLONG = 1,

  // The decoded value is a surrogate (0xD800 to 0xDFFF, inclusive).
  UTF8_ERR_SURROGATE = 2,

  // The decoded value is greater than the max codepoint U+10FFFF.
  UTF8_ERR_TOO_LARGE = 3,

  // The bytes do not conform to the UTF-8 bit patterns.
  UTF8_ERR_BAD_ENCODING = 4,

  // Another codepoint appears to start here, but it has been truncated.
  UTF8_ERR_TRUNCATED_BYTES = 5,
};

typedef enum Utf8Error Utf8Error_t;
 | 
| 57 | 
 | 
| 58 | typedef struct Utf8Result {
 | 
| 59 |   Utf8Error_t error;
 | 
| 60 |   uint32_t codepoint;
 | 
| 61 |   size_t bytes_read;
 | 
| 62 | } Utf8Result_t;
 | 
| 63 | 
 | 
| 64 | static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
 | 
| 65 |   if (result->error) return;
 | 
| 66 | 
 | 
| 67 |   int byte = input[result->bytes_read];
 | 
| 68 |   if (byte == '\0') {
 | 
| 69 |     result->error = UTF8_ERR_TRUNCATED_BYTES;
 | 
| 70 |     return;
 | 
| 71 |   }
 | 
| 72 |   result->bytes_read += 1;
 | 
| 73 | 
 | 
| 74 |   // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
 | 
| 75 |   // validate the pattern and b) remove the leading '10'.
 | 
| 76 |   if ((byte & 0xC0) == 0x80) {
 | 
| 77 |     result->codepoint <<= 6;
 | 
| 78 |     result->codepoint |= byte & 0x3F;
 | 
| 79 |   } else {
 | 
| 80 |     result->error = UTF8_ERR_BAD_ENCODING;
 | 
| 81 |   }
 | 
| 82 | }
 | 
| 83 | 
 | 
| 84 | /**
 | 
| 85 |  * Given a nul-terminated string `input`, try to decode the next codepoint from
 | 
| 86 |  * that string.
 | 
| 87 |  *
 | 
| 88 |  * It is required that `input` does not point to the nul-terminator. If
 | 
| 89 |  * `*input == '\0'`, then it is assumed that the zero-byte is meant to encode
 | 
| 90 |  * U+00, not a sentinel. The nul-terminator is still necessary because we need
 | 
| 91 |  * it to prevent buffer overrun in the case of a truncated byte sequence, for
 | 
| 92 |  * example '\xC2'. This oddity is to facilitate strings which may contain U+00
 | 
| 93 |  * codepoints.
 | 
| 94 |  *
 | 
| 95 |  * If there was a surrogate, overlong or codepoint to large error then
 | 
| 96 |  * `result.codepoint` will contain the recovered value.
 | 
| 97 |  */
 | 
| 98 | static inline void utf8_decode(const unsigned char *input,
 | 
| 99 |                                Utf8Result_t *result) {
 | 
| 100 |   result->error = UTF8_OK;
 | 
| 101 |   result->codepoint = 0;
 | 
| 102 |   result->bytes_read = 0;
 | 
| 103 | 
 | 
| 104 |   int first = *input;
 | 
| 105 |   result->bytes_read = 1;
 | 
| 106 | 
 | 
| 107 |   if ((first & 0x80) == 0) {
 | 
| 108 |     // 1-byte long (ASCII subset)
 | 
| 109 |     result->codepoint = first;
 | 
| 110 |     return;
 | 
| 111 |   }
 | 
| 112 | 
 | 
| 113 |   if ((first & 0xE0) == 0xC0) {
 | 
| 114 |     // 2-bytes long
 | 
| 115 |     result->codepoint = first & 0x1F;
 | 
| 116 | 
 | 
| 117 |     _cont(input, result);
 | 
| 118 |     if (result->error) return;
 | 
| 119 | 
 | 
| 120 |     if (result->codepoint < 0x80) {
 | 
| 121 |       result->error = UTF8_ERR_OVERLONG;
 | 
| 122 |     }
 | 
| 123 | 
 | 
| 124 |     return;
 | 
| 125 |   }
 | 
| 126 | 
 | 
| 127 |   if ((first & 0xF0) == 0xE0) {
 | 
| 128 |     // 3-bytes long
 | 
| 129 |     result->codepoint = first & 0x0F;
 | 
| 130 | 
 | 
| 131 |     _cont(input, result);
 | 
| 132 |     _cont(input, result);
 | 
| 133 |     if (result->error) return;
 | 
| 134 | 
 | 
| 135 |     if (result->codepoint < 0x800) {
 | 
| 136 |       result->error = UTF8_ERR_OVERLONG;
 | 
| 137 |     }
 | 
| 138 | 
 | 
| 139 |     if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
 | 
| 140 |       result->error = UTF8_ERR_SURROGATE;
 | 
| 141 |     }
 | 
| 142 | 
 | 
| 143 |     return;
 | 
| 144 |   }
 | 
| 145 | 
 | 
| 146 |   if ((first & 0xF8) == 0xF0) {
 | 
| 147 |     // 4-bytes long
 | 
| 148 |     result->codepoint = first & 0x07;
 | 
| 149 | 
 | 
| 150 |     _cont(input, result);
 | 
| 151 |     _cont(input, result);
 | 
| 152 |     _cont(input, result);
 | 
| 153 |     if (result->error) return;
 | 
| 154 | 
 | 
| 155 |     if (result->codepoint < 0x10000) {
 | 
| 156 |       result->error = UTF8_ERR_OVERLONG;
 | 
| 157 |     }
 | 
| 158 | 
 | 
| 159 |     if (result->codepoint > 0x10FFFF) {
 | 
| 160 |       result->error = UTF8_ERR_TOO_LARGE;
 | 
| 161 |     }
 | 
| 162 | 
 | 
| 163 |     return;
 | 
| 164 |   }
 | 
| 165 | 
 | 
| 166 |   result->error = UTF8_ERR_BAD_ENCODING;
 | 
| 167 |   return;
 | 
| 168 | }
 | 
| 169 | 
 | 
| 170 | #endif  // DATA_LANG_UTF8_H
 |