OILS / data_lang / utf8.h View on Github | oilshell.org

170 lines, 71 significant
1#ifndef DATA_LANG_UTF8_H
2#define DATA_LANG_UTF8_H
3
4#include <stddef.h> // size_t
5#include <stdint.h> // uint32_t
6#include <stdio.h>
7
/**
 * ---- Quick reference about the encoding ----
 *
 * First, all valid UTF-8 sequences follow one of four bit "patterns"
 * (Table 3-6). The first byte determines the length of the sequence and then
 * the next 0-3 bytes are "continuation bytes."
 *
 * +----------------------------+----------+----------+----------+----------+
 * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 * +----------------------------+----------+----------+----------+----------+
 * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
 * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
 * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
 * +----------------------------+----------+----------+----------+----------+
 *
 *      Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
 *
 * There are 3 further restrictions which make some valid bit patterns
 * *invalid*:
 *  1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second
 *     sequence is longer and thus an error.
 *  2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
 *     surrogate. It is an error to encode surrogates in UTF-8.
 *  3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
 *     and must be rejected as an error.
 *
 * See https://aolsen.ca/writings/everything-about-utf8 for more details about
 * the encoding.
 */
38
/**
 * Outcome of one decode attempt.
 *
 * UTF8_OK is zero and every failure is non-zero, so a value of this type can
 * be tested directly as a truth value ("was there an error?").
 */
typedef enum Utf8Error {
  UTF8_OK = 0,

  // Encodes a codepoint in more bytes than necessary
  UTF8_ERR_OVERLONG = 1,

  // Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF, inclusive)
  UTF8_ERR_SURROGATE = 2,

  // Encodes a value greater than the max codepoint U+10FFFF
  UTF8_ERR_TOO_LARGE = 3,

  // Encoding doesn't conform to the UTF-8 bit patterns
  UTF8_ERR_BAD_ENCODING = 4,

  // It looks like there is another codepoint, but it has been truncated.
  UTF8_ERR_TRUNCATED_BYTES = 5,
} Utf8Error_t;
57
58typedef struct Utf8Result {
59 Utf8Error_t error;
60 uint32_t codepoint;
61 size_t bytes_read;
62} Utf8Result_t;
63
64static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
65 if (result->error) return;
66
67 int byte = input[result->bytes_read];
68 if (byte == '\0') {
69 result->error = UTF8_ERR_TRUNCATED_BYTES;
70 return;
71 }
72 result->bytes_read += 1;
73
74 // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
75 // validate the pattern and b) remove the leading '10'.
76 if ((byte & 0xC0) == 0x80) {
77 result->codepoint <<= 6;
78 result->codepoint |= byte & 0x3F;
79 } else {
80 result->error = UTF8_ERR_BAD_ENCODING;
81 }
82}
83
84/**
85 * Given a nul-terminated string `input`, try to decode the next codepoint from
86 * that string.
87 *
88 * It is required that `input` does not point to the nul-terminator. If
89 * `*input == '\0'`, then it is assumed that the zero-byte is meant to encode
90 * U+00, not a sentinel. The nul-terminator is still necessary because we need
91 * it to prevent buffer overrun in the case of a truncated byte sequence, for
92 * example '\xC2'. This oddity is to facilitate strings which may contain U+00
93 * codepoints.
94 *
95 * If there was a surrogate, overlong or codepoint to large error then
96 * `result.codepoint` will contain the recovered value.
97 */
98static inline void utf8_decode(const unsigned char *input,
99 Utf8Result_t *result) {
100 result->error = UTF8_OK;
101 result->codepoint = 0;
102 result->bytes_read = 0;
103
104 int first = *input;
105 result->bytes_read = 1;
106
107 if ((first & 0x80) == 0) {
108 // 1-byte long (ASCII subset)
109 result->codepoint = first;
110 return;
111 }
112
113 if ((first & 0xE0) == 0xC0) {
114 // 2-bytes long
115 result->codepoint = first & 0x1F;
116
117 _cont(input, result);
118 if (result->error) return;
119
120 if (result->codepoint < 0x80) {
121 result->error = UTF8_ERR_OVERLONG;
122 }
123
124 return;
125 }
126
127 if ((first & 0xF0) == 0xE0) {
128 // 3-bytes long
129 result->codepoint = first & 0x0F;
130
131 _cont(input, result);
132 _cont(input, result);
133 if (result->error) return;
134
135 if (result->codepoint < 0x800) {
136 result->error = UTF8_ERR_OVERLONG;
137 }
138
139 if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140 result->error = UTF8_ERR_SURROGATE;
141 }
142
143 return;
144 }
145
146 if ((first & 0xF8) == 0xF0) {
147 // 4-bytes long
148 result->codepoint = first & 0x07;
149
150 _cont(input, result);
151 _cont(input, result);
152 _cont(input, result);
153 if (result->error) return;
154
155 if (result->codepoint < 0x10000) {
156 result->error = UTF8_ERR_OVERLONG;
157 }
158
159 if (result->codepoint > 0x10FFFF) {
160 result->error = UTF8_ERR_TOO_LARGE;
161 }
162
163 return;
164 }
165
166 result->error = UTF8_ERR_BAD_ENCODING;
167 return;
168}
169
170#endif // DATA_LANG_UTF8_H