1 | #include "cpp/libc.h"
|
2 |
|
3 | #include <locale.h> // setlocale()
|
4 | #include <regex.h> // regcomp()
|
5 | #include <unistd.h> // gethostname()
|
6 | #include <wctype.h> // towupper()
|
7 |
|
8 | #include "mycpp/runtime.h"
|
9 | #include "vendor/greatest.h"
|
10 |
|
11 | TEST hostname_test() {
|
12 | BigStr* s0 = libc::gethostname();
|
13 | ASSERT(s0 != nullptr);
|
14 |
|
15 | char buf[1024];
|
16 | ASSERT(gethostname(buf, HOST_NAME_MAX) == 0);
|
17 | ASSERT(str_equals(s0, StrFromC(buf)));
|
18 |
|
19 | PASS();
|
20 | }
|
21 |
|
22 | TEST realpath_test() {
|
23 | BigStr* result = libc::realpath(StrFromC("/"));
|
24 | ASSERT(str_equals(StrFromC("/"), result));
|
25 |
|
26 | bool caught = false;
|
27 | try {
|
28 | libc::realpath(StrFromC("/nonexistent_ZZZ"));
|
29 | } catch (IOError_OSError* e) {
|
30 | caught = true;
|
31 | }
|
32 | ASSERT(caught);
|
33 |
|
34 | PASS();
|
35 | }
|
36 |
|
37 | TEST libc_test() {
|
38 | log("sizeof(wchar_t) = %d", sizeof(wchar_t));
|
39 |
|
40 | int width = 0;
|
41 |
|
42 | // TODO: enable this test. Is it not picking LC_CTYPE?
|
43 | // Do we have to do some initialization like libc.cpython_reset_locale() ?
|
44 | #if 0
|
45 | try {
|
46 | // mu character \u{03bc} in utf-8
|
47 | width = libc::wcswidth(StrFromC("\xce\xbc"));
|
48 | } catch (UnicodeError* e) {
|
49 | log("UnicodeError %s", e->message->data_);
|
50 | }
|
51 | ASSERT_EQ_FMT(2, width, "%d");
|
52 | #endif
|
53 |
|
54 | BigStr* h = libc::gethostname();
|
55 | log("gethostname() = %s %d", h->data_, len(h));
|
56 |
|
57 | width = libc::wcswidth(StrFromC("foo"));
|
58 | ASSERT_EQ(3, width);
|
59 |
|
60 | libc::print_time(0.1, 0.2, 0.3);
|
61 |
|
62 | PASS();
|
63 | }
|
64 |
|
65 | static List<BigStr*>* Groups(BigStr* s, List<int>* indices) {
|
66 | List<BigStr*>* groups = NewList<BigStr*>();
|
67 | int n = len(indices) / 2;
|
68 | for (int i = 0; i < n; ++i) {
|
69 | int start = indices->at(2 * i);
|
70 | int end = indices->at(2 * i + 1);
|
71 | if (start == -1) {
|
72 | groups->append(nullptr);
|
73 | } else {
|
74 | groups->append(s->slice(start, end));
|
75 | }
|
76 | }
|
77 | return groups;
|
78 | }
|
79 |
|
80 | TEST regex_wrapper_test() {
|
81 | BigStr* s1 = StrFromC("-abaacaaa");
|
82 | List<int>* indices = libc::regex_search(StrFromC("(a+).(a+)"), 0, s1, 0);
|
83 | List<BigStr*>* results = Groups(s1, indices);
|
84 | ASSERT_EQ_FMT(3, len(results), "%d");
|
85 | ASSERT(str_equals(StrFromC("abaa"), results->at(0))); // whole match
|
86 | ASSERT(str_equals(StrFromC("a"), results->at(1)));
|
87 | ASSERT(str_equals(StrFromC("aa"), results->at(2)));
|
88 |
|
89 | indices = libc::regex_search(StrFromC("z+"), 0, StrFromC("abaacaaa"), 0);
|
90 | ASSERT_EQ(nullptr, indices);
|
91 |
|
92 | // Alternation gives unmatched group
|
93 | BigStr* s2 = StrFromC("b");
|
94 | indices = libc::regex_search(StrFromC("(a)|(b)"), 0, s2, 0);
|
95 | results = Groups(s2, indices);
|
96 | ASSERT_EQ_FMT(3, len(results), "%d");
|
97 | ASSERT(str_equals(StrFromC("b"), results->at(0))); // whole match
|
98 | ASSERT_EQ(nullptr, results->at(1));
|
99 | ASSERT(str_equals(StrFromC("b"), results->at(2)));
|
100 |
|
101 | // Like Unicode test below
|
102 | indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_x_"), 0);
|
103 | ASSERT(indices != nullptr);
|
104 | ASSERT_EQ_FMT(2, len(indices), "%d");
|
105 | ASSERT_EQ_FMT(0, indices->at(0), "%d");
|
106 | ASSERT_EQ_FMT(3, indices->at(1), "%d");
|
107 |
|
108 | // TODO(unicode)
|
109 | #if 0
|
110 | //indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_\u03bc_"), 0);
|
111 | indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_μ_"), 0);
|
112 | ASSERT(indices != nullptr);
|
113 | ASSERT_EQ_FMT(2, len(indices), "%d");
|
114 | ASSERT_EQ_FMT(0, indices->at(0), "%d");
|
115 | ASSERT_EQ_FMT(0, indices->at(0), "%d");
|
116 | #endif
|
117 |
|
118 | Tuple2<int, int>* result;
|
119 | BigStr* s = StrFromC("oXooXoooXoX");
|
120 | result = libc::regex_first_group_match(StrFromC("(X.)"), s, 0);
|
121 | ASSERT_EQ_FMT(1, result->at0(), "%d");
|
122 | ASSERT_EQ_FMT(3, result->at1(), "%d");
|
123 |
|
124 | result = libc::regex_first_group_match(StrFromC("(X.)"), s, 3);
|
125 | ASSERT_EQ_FMT(4, result->at0(), "%d");
|
126 | ASSERT_EQ_FMT(6, result->at1(), "%d");
|
127 |
|
128 | result = libc::regex_first_group_match(StrFromC("(X.)"), s, 6);
|
129 | ASSERT_EQ_FMT(8, result->at0(), "%d");
|
130 | ASSERT_EQ_FMT(10, result->at1(), "%d");
|
131 |
|
132 | PASS();
|
133 | }
|
134 |
|
135 | TEST glob_test() {
|
136 | // This depends on the file system
|
137 | auto files = libc::glob(StrFromC("*.testdata"));
|
138 | // 3 files are made by the shell wrapper
|
139 | ASSERT_EQ_FMT(3, len(files), "%d");
|
140 |
|
141 | print(files->at(0));
|
142 |
|
143 | auto files2 = libc::glob(StrFromC("*.pyzzz"));
|
144 | ASSERT_EQ_FMT(0, len(files2), "%d");
|
145 |
|
146 | PASS();
|
147 | }
|
148 |
|
149 | TEST fnmatch_test() {
|
150 | BigStr* s1 = (StrFromC("foo.py "))->strip();
|
151 | ASSERT(libc::fnmatch(StrFromC("*.py"), s1));
|
152 | ASSERT(!libc::fnmatch(StrFromC("*.py"), StrFromC("foo.p")));
|
153 |
|
154 | // Unicode - ? is byte or code point?
|
155 | ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_x_")));
|
156 |
|
157 | // TODO(unicode)
|
158 | // ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_\u03bc_")));
|
159 | // ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_μ_")));
|
160 |
|
161 | // extended glob
|
162 | ASSERT(libc::fnmatch(StrFromC("*(foo|bar).py"), StrFromC("foo.py")));
|
163 | ASSERT(!libc::fnmatch(StrFromC("*(foo|bar).py"), StrFromC("foo.p")));
|
164 |
|
165 | PASS();
|
166 | }
|
167 |
|
168 | TEST for_test_coverage() {
|
169 | // Sometimes we're not connected to a terminal
|
170 | try {
|
171 | libc::get_terminal_width();
|
172 | } catch (IOError_OSError* e) {
|
173 | }
|
174 |
|
175 | PASS();
|
176 | }
|
177 |
|
178 | void FindAll(const char* p, const char* s) {
|
179 | regex_t pat;
|
180 |
|
181 | int cflags = REG_EXTENDED;
|
182 | if (regcomp(&pat, p, cflags) != 0) {
|
183 | FAIL();
|
184 | }
|
185 | int outlen = pat.re_nsub + 1; // number of captures
|
186 |
|
187 | // TODO: Could statically allocate 99, and assert that re_nsub is less than
|
188 | // 99. Would speed up loops.
|
189 | regmatch_t* pmatch =
|
190 | static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
|
191 |
|
192 | int cur_pos = 0;
|
193 | // int n = strlen(s);
|
194 | while (true) {
|
195 | // Necessary so ^ doesn't match in the middle!
|
196 | int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
|
197 | bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
|
198 |
|
199 | if (!match) {
|
200 | break;
|
201 | }
|
202 | int i;
|
203 | for (i = 0; i < outlen; i++) {
|
204 | int start = pmatch[i].rm_so;
|
205 | int end = pmatch[i].rm_eo;
|
206 | int len = end - start;
|
207 | BigStr* m = StrFromC(s + cur_pos + start, len);
|
208 | log("%d GROUP %d (%d .. %d) = [%s]", cur_pos, i, start, end, m->data_);
|
209 | }
|
210 | log("");
|
211 | int match_len = pmatch[0].rm_eo;
|
212 | if (match_len == 0) {
|
213 | break;
|
214 | }
|
215 | cur_pos += match_len;
|
216 | }
|
217 |
|
218 | free(pmatch);
|
219 | regfree(&pat);
|
220 | }
|
221 |
|
// Input string shared by the regex_unanchored, regex_caret and regex_lexer
// tests below; it contains adjacent matches.
const char* s = "a345y-axy- there b789y- cy-";
|
224 |
|
225 | TEST regex_unanchored() {
|
226 | const char* unanchored = "[abc]([0-9]*)(x?)(y)-";
|
227 | FindAll(unanchored, s);
|
228 |
|
229 | PASS();
|
230 | }
|
231 |
|
232 | TEST regex_caret() {
|
233 | const char* anchored = "^[abc]([0-9]*)(x?)(y)-";
|
234 | FindAll(anchored, s);
|
235 |
|
236 | PASS();
|
237 | }
|
238 |
|
239 | TEST regex_lexer() {
|
240 | // like the Yaks / Make-a-Lisp pattern
|
241 | const char* lexer = "([a-z]+)|([0-9]+)|([ ]+)|([+-])";
|
242 | FindAll(lexer, s);
|
243 |
|
244 | PASS();
|
245 | }
|
246 |
|
247 | TEST regex_repeat_with_capture() {
|
248 | const char* lexer = "(([a-z]+)([0-9]+)-)*((A+)|(Z+))*";
|
249 | FindAll(lexer, "a0-b1-c2-AAZZZA");
|
250 | // Groups are weird
|
251 | // whole match 0: a0-b1-c2-
|
252 | // 1: c2- # last repetition
|
253 | // 2: c # last one
|
254 | // 3: 2 # last one
|
255 | //
|
256 | // And then there's an empty match
|
257 | //
|
258 | // Ideas:
|
259 | // - disallow nested groups in Eggex?
|
260 | // - I really care about the inner ones -- groups 2 and 3
|
261 | // - I want flat groups
|
262 |
|
263 | PASS();
|
264 | }
|
265 |
|
266 | // Disallow this in eggex, as well as the above
|
267 | TEST regex_nested_capture() {
|
268 | const char* lexer = "(([a-z]+)([0-9]+))";
|
269 | FindAll(lexer, "a0");
|
270 | PASS();
|
271 | }
|
272 |
|
273 | // I think we allow this in eggex
|
274 | TEST regex_alt_with_capture() {
|
275 | const char* lexer = "([a-z]+)|([0-9]+)(-)";
|
276 | FindAll(lexer, "x-");
|
277 | FindAll(lexer, "7-");
|
278 | PASS();
|
279 | }
|
280 |
|
281 | TEST regex_unicode() {
|
282 | regex_t pat;
|
283 |
|
284 | // 1 or 2 bytes
|
285 | // const char* p = "_..?_";
|
286 | // const char* p = "_[^a]_";
|
287 | const char* p = "_._"; // 1 byte, not code point?
|
288 |
|
289 | if (regcomp(&pat, p, REG_EXTENDED) != 0) {
|
290 | FAIL();
|
291 | }
|
292 | int outlen = pat.re_nsub + 1; // number of captures
|
293 | regmatch_t* pmatch =
|
294 | static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
|
295 |
|
296 | int result;
|
297 |
|
298 | const char* bad = "_xyz_";
|
299 | result = regexec(&pat, bad, outlen, pmatch, 0);
|
300 | ASSERT_EQ_FMT(1, result, "%d"); // does not match
|
301 |
|
302 | const char* a = "_x_";
|
303 | result = regexec(&pat, a, outlen, pmatch, 0);
|
304 | ASSERT_EQ_FMT(0, result, "%d");
|
305 |
|
306 | // Doesn't change anything
|
307 | // int lc_what = LC_ALL;
|
308 | int lc_what = LC_CTYPE;
|
309 |
|
310 | // char* saved_locale = setlocale(LC_ALL, "");
|
311 | // char* saved_locale = setlocale(LC_ALL, NULL);
|
312 |
|
313 | // char* saved_locale = setlocale(lc_what, NULL);
|
314 |
|
315 | #if 0
|
316 | // Doesn't change anything?
|
317 | //if (setlocale(LC_CTYPE, "C.utf8") == NULL) {
|
318 | if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL) {
|
319 | log("Couldn't set locale to C.utf8");
|
320 | FAIL();
|
321 | }
|
322 | #endif
|
323 |
|
324 | // const char* u = "_μ_";
|
325 | const char* u = "_\u03bc_";
|
326 | log("a = %d bytes", strlen(a));
|
327 | log("u = %d bytes", strlen(u));
|
328 | result = regexec(&pat, u, outlen, pmatch, 0);
|
329 |
|
330 | #if 0
|
331 | if (setlocale(lc_what, saved_locale) == NULL) {
|
332 | log("Couldn't restore locale");
|
333 | FAIL();
|
334 | }
|
335 | #endif
|
336 |
|
337 | free(pmatch); // Clean up before test failures
|
338 | regfree(&pat);
|
339 |
|
340 | // TODO(unicode)
|
341 | // ASSERT_EQ_FMT(0, result, "%d");
|
342 |
|
343 | PASS();
|
344 | }
|
345 |
|
346 | TEST casefold_test() {
|
347 | #if 0
|
348 | // Turkish
|
349 | if (setlocale(LC_CTYPE, "tr_TR.utf8") == NULL) {
|
350 | log("Couldn't set locale to tr_TR.utf8");
|
351 | FAIL();
|
352 | }
|
353 | #endif
|
354 |
|
355 | // LC_CTYPE_MASK instead of LC_CTYPE
|
356 | locale_t turkish = newlocale(LC_CTYPE_MASK, "tr_TR.utf8", NULL);
|
357 |
|
358 | int u = toupper('i');
|
359 | int wu = towupper('i');
|
360 | int wul = towupper_l('i', turkish);
|
361 |
|
362 | // Regular: upper case i is I, 73
|
363 | // Turkish: upper case is 304
|
364 | log("upper = %d", u);
|
365 | log("wide upper = %d", wu);
|
366 | log("wide upper locale = %d", wul);
|
367 |
|
368 | freelocale(turkish);
|
369 |
|
370 | PASS();
|
371 | }
|
372 |
|
// Definitions the greatest test framework requires in the file that holds
// main().
GREATEST_MAIN_DEFS();

// Test runner entry point: runs every enabled test in this file, in the
// order listed.
int main(int argc, char** argv) {
  // Done before any test runs; presumably sets up the heap that StrFromC()
  // and friends allocate from -- confirm against mycpp/runtime.h.
  gHeap.Init();

  GREATEST_MAIN_BEGIN();

  // Tests of the libc:: wrappers
  RUN_TEST(hostname_test);
  RUN_TEST(realpath_test);
  RUN_TEST(libc_test);
  RUN_TEST(regex_wrapper_test);
  RUN_TEST(glob_test);
  RUN_TEST(fnmatch_test);
  RUN_TEST(for_test_coverage);

  // Tests that call regcomp() / regexec() directly
  RUN_TEST(regex_unanchored);
  RUN_TEST(regex_caret);
  RUN_TEST(regex_lexer);
  RUN_TEST(regex_repeat_with_capture);
  RUN_TEST(regex_alt_with_capture);
  RUN_TEST(regex_nested_capture);
  RUN_TEST(regex_unicode);

  // Crashes in CI? Because of Turkish locale?
  // RUN_TEST(casefold_test);

  gHeap.CleanProcessExit();

  GREATEST_MAIN_END();
  return 0;
}
|