#ifndef DATA_LANG_UTF8_H
#define DATA_LANG_UTF8_H

#include <stddef.h> // size_t
#include <stdint.h> // uint32_t
#include <stdio.h>

/**
 * ---- Quick reference about the encoding ----
 *
 * First, all valid UTF-8 sequences follow one of the bit "patterns" in Table
 * 3-6 below. The first byte determines the length of the sequence, and the
 * next 0-3 bytes are "continuation bytes."
 *
 * +----------------------------+----------+----------+----------+----------+
 * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 * +----------------------------+----------+----------+----------+----------+
 * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
 * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
 * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
 * +----------------------------+----------+----------+----------+----------+
 *
 * Table 3-6 from the Unicode Standard 15.0.0, Chapter 3: UTF-8 bit patterns
 *
 * There are 3 further restrictions which make some valid bit patterns
 * *invalid*:
 *   1. Overlongs: e.g., <0x41> and <0xC1 0x81> both store U+41, but the
 *      second sequence is longer than necessary and is thus an error.
 *   2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
 *      surrogate. It is an error to encode surrogates in UTF-8.
 *   3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
 *      and must be rejected as an error.
 *
 * See https://aolsen.ca/writings/everything-about-utf8 for more details about
 * the encoding.
 */

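/*
 * Worked example of the table above (U+00E9 is an arbitrary illustrative
 * choice, not anything this header refers to): U+00E9 is 0b000_1110_1001 as
 * an 11-bit value, so it uses the 2-byte row with yyyyy = 00011 and
 * xxxxxx = 101001. The encoded bytes are therefore 110_00011 = 0xC3 and
 * 10_101001 = 0xA9, i.e. the sequence <0xC3 0xA9>.
 */
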
typedef enum Utf8Error {
  UTF8_OK = 0,

  // Encodes a codepoint in more bytes than necessary
  UTF8_ERR_OVERLONG = 1,

  // Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF, inclusive)
  UTF8_ERR_SURROGATE = 2,

  // Encodes a value greater than the max codepoint U+10FFFF
  UTF8_ERR_TOO_LARGE = 3,

  // Encoding doesn't conform to the UTF-8 bit patterns
  UTF8_ERR_BAD_ENCODING = 4,

  // It looks like there is another codepoint, but it has been truncated.
  UTF8_ERR_TRUNCATED_BYTES = 5,
} Utf8Error_t;

typedef struct Utf8Result {
  Utf8Error_t error;
  uint32_t codepoint;
  size_t bytes_read;
} Utf8Result_t;

static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
  if (result->error) return;

  int byte = input[result->bytes_read];
  if (byte == '\0') {
    result->error = UTF8_ERR_TRUNCATED_BYTES;
    return;
  }
  result->bytes_read += 1;

  // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
  // validate the pattern and b) remove the leading '10'.
  if ((byte & 0xC0) == 0x80) {
    result->codepoint <<= 6;
    result->codepoint |= byte & 0x3F;
  } else {
    result->error = UTF8_ERR_BAD_ENCODING;
  }
}

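/*
 * Illustrative trace of _cont() (the byte values are assumptions chosen for
 * this comment, not anything the code requires): with
 * result->codepoint == 0x03 (from a lead byte 0xC3 masked with 0x1F) and the
 * next input byte 0xA9, the check (0xA9 & 0xC0) == 0x80 passes and the
 * accumulator becomes (0x03 << 6) | (0xA9 & 0x3F) = 0xC0 | 0x29 = 0xE9,
 * i.e. U+00E9.
 */
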
/**
 * Given a nul-terminated string `input`, try to decode the next codepoint
 * from that string.
 *
 * It is required that `input` does not point to the nul-terminator. If
 * `*input == '\0'`, then the zero byte is decoded as the codepoint U+00, not
 * treated as a sentinel. The nul-terminator is still necessary because we
 * need it to prevent buffer overrun in the case of a truncated byte sequence,
 * for example '\xC2'. This oddity is to facilitate strings which may contain
 * U+00 codepoints.
 *
 * If there was a surrogate, overlong, or too-large codepoint error then
 * `result.codepoint` will contain the recovered value.
 */
static inline void utf8_decode(const unsigned char *input,
                               Utf8Result_t *result) {
  result->error = UTF8_OK;
  result->codepoint = 0;
  result->bytes_read = 0;

  int first = *input;
  result->bytes_read = 1;

  if ((first & 0x80) == 0) {
    // 1-byte long (ASCII subset)
    result->codepoint = first;
    return;
  }

  if ((first & 0xE0) == 0xC0) {
    // 2-bytes long
    result->codepoint = first & 0x1F;

    _cont(input, result);
    if (result->error) return;

    if (result->codepoint < 0x80) {
      result->error = UTF8_ERR_OVERLONG;
    }

    return;
  }

  if ((first & 0xF0) == 0xE0) {
    // 3-bytes long
    result->codepoint = first & 0x0F;

    _cont(input, result);
    _cont(input, result);
    if (result->error) return;

    if (result->codepoint < 0x800) {
      result->error = UTF8_ERR_OVERLONG;
    }

    if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
      result->error = UTF8_ERR_SURROGATE;
    }

    return;
  }

  if ((first & 0xF8) == 0xF0) {
    // 4-bytes long
    result->codepoint = first & 0x07;

    _cont(input, result);
    _cont(input, result);
    _cont(input, result);
    if (result->error) return;

    if (result->codepoint < 0x10000) {
      result->error = UTF8_ERR_OVERLONG;
    }

    if (result->codepoint > 0x10FFFF) {
      result->error = UTF8_ERR_TOO_LARGE;
    }

    return;
  }

  result->error = UTF8_ERR_BAD_ENCODING;
  return;
}
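
/*
 * Example usage (a minimal sketch; `print_codepoints` is a hypothetical
 * caller, not part of this header): walk a nul-terminated string, printing
 * each decoded codepoint and stopping at the terminator or on the first
 * error. Note that this sketch treats '\0' as end-of-input, so it does not
 * exercise the embedded-U+00 case described above.
 *
 *   static void print_codepoints(const unsigned char *s) {
 *     Utf8Result_t res;
 *     while (*s != '\0') {
 *       utf8_decode(s, &res);
 *       if (res.error != UTF8_OK) {
 *         fprintf(stderr, "decode error %d\n", (int)res.error);
 *         return;
 *       }
 *       printf("U+%04lX\n", (unsigned long)res.codepoint);
 *       s += res.bytes_read;
 *     }
 *   }
 */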

#endif // DATA_LANG_UTF8_H