| 1 | #include "cpp/libc.h"
 | 
| 2 | 
 | 
| 3 | #include <locale.h>  // setlocale()
 | 
| 4 | #include <regex.h>   // regcomp()
 | 
| 5 | #include <unistd.h>  // gethostname()
 | 
| 6 | #include <wctype.h>  // towupper()
 | 
| 7 | 
 | 
| 8 | #include "mycpp/runtime.h"
 | 
| 9 | #include "vendor/greatest.h"
 | 
| 10 | 
 | 
| 11 | TEST hostname_test() {
 | 
| 12 |   BigStr* s0 = libc::gethostname();
 | 
| 13 |   ASSERT(s0 != nullptr);
 | 
| 14 | 
 | 
| 15 |   char buf[1024];
 | 
| 16 |   ASSERT(gethostname(buf, HOST_NAME_MAX) == 0);
 | 
| 17 |   ASSERT(str_equals(s0, StrFromC(buf)));
 | 
| 18 | 
 | 
| 19 |   PASS();
 | 
| 20 | }
 | 
| 21 | 
 | 
| 22 | TEST realpath_test() {
 | 
| 23 |   BigStr* result = libc::realpath(StrFromC("/"));
 | 
| 24 |   ASSERT(str_equals(StrFromC("/"), result));
 | 
| 25 | 
 | 
| 26 |   bool caught = false;
 | 
| 27 |   try {
 | 
| 28 |     libc::realpath(StrFromC("/nonexistent_ZZZ"));
 | 
| 29 |   } catch (IOError_OSError* e) {
 | 
| 30 |     caught = true;
 | 
| 31 |   }
 | 
| 32 |   ASSERT(caught);
 | 
| 33 | 
 | 
| 34 |   PASS();
 | 
| 35 | }
 | 
| 36 | 
 | 
| 37 | TEST libc_test() {
 | 
| 38 |   log("sizeof(wchar_t) = %d", sizeof(wchar_t));
 | 
| 39 | 
 | 
| 40 |   int width = 0;
 | 
| 41 | 
 | 
| 42 |   // TODO: enable this test.  Is it not picking LC_CTYPE?
 | 
| 43 |   // Do we have to do some initialization like libc.cpython_reset_locale() ?
 | 
| 44 | #if 0
 | 
| 45 |   try {
 | 
| 46 |     // mu character \u{03bc} in utf-8
 | 
| 47 |     width = libc::wcswidth(StrFromC("\xce\xbc"));
 | 
| 48 |   } catch (UnicodeError* e) {
 | 
| 49 |     log("UnicodeError %s", e->message->data_);
 | 
| 50 |   }
 | 
| 51 |   ASSERT_EQ_FMT(2, width, "%d");
 | 
| 52 | #endif
 | 
| 53 | 
 | 
| 54 |   BigStr* h = libc::gethostname();
 | 
| 55 |   log("gethostname() = %s %d", h->data_, len(h));
 | 
| 56 | 
 | 
| 57 |   width = libc::wcswidth(StrFromC("foo"));
 | 
| 58 |   ASSERT_EQ(3, width);
 | 
| 59 | 
 | 
| 60 |   libc::print_time(0.1, 0.2, 0.3);
 | 
| 61 | 
 | 
| 62 |   PASS();
 | 
| 63 | }
 | 
| 64 | 
 | 
| 65 | static List<BigStr*>* Groups(BigStr* s, List<int>* indices) {
 | 
| 66 |   List<BigStr*>* groups = NewList<BigStr*>();
 | 
| 67 |   int n = len(indices) / 2;
 | 
| 68 |   for (int i = 0; i < n; ++i) {
 | 
| 69 |     int start = indices->at(2 * i);
 | 
| 70 |     int end = indices->at(2 * i + 1);
 | 
| 71 |     if (start == -1) {
 | 
| 72 |       groups->append(nullptr);
 | 
| 73 |     } else {
 | 
| 74 |       groups->append(s->slice(start, end));
 | 
| 75 |     }
 | 
| 76 |   }
 | 
| 77 |   return groups;
 | 
| 78 | }
 | 
| 79 | 
 | 
| 80 | TEST regex_wrapper_test() {
 | 
| 81 |   BigStr* s1 = StrFromC("-abaacaaa");
 | 
| 82 |   List<int>* indices = libc::regex_search(StrFromC("(a+).(a+)"), 0, s1, 0);
 | 
| 83 |   List<BigStr*>* results = Groups(s1, indices);
 | 
| 84 |   ASSERT_EQ_FMT(3, len(results), "%d");
 | 
| 85 |   ASSERT(str_equals(StrFromC("abaa"), results->at(0)));  // whole match
 | 
| 86 |   ASSERT(str_equals(StrFromC("a"), results->at(1)));
 | 
| 87 |   ASSERT(str_equals(StrFromC("aa"), results->at(2)));
 | 
| 88 | 
 | 
| 89 |   indices = libc::regex_search(StrFromC("z+"), 0, StrFromC("abaacaaa"), 0);
 | 
| 90 |   ASSERT_EQ(nullptr, indices);
 | 
| 91 | 
 | 
| 92 |   // Alternation gives unmatched group
 | 
| 93 |   BigStr* s2 = StrFromC("b");
 | 
| 94 |   indices = libc::regex_search(StrFromC("(a)|(b)"), 0, s2, 0);
 | 
| 95 |   results = Groups(s2, indices);
 | 
| 96 |   ASSERT_EQ_FMT(3, len(results), "%d");
 | 
| 97 |   ASSERT(str_equals(StrFromC("b"), results->at(0)));  // whole match
 | 
| 98 |   ASSERT_EQ(nullptr, results->at(1));
 | 
| 99 |   ASSERT(str_equals(StrFromC("b"), results->at(2)));
 | 
| 100 | 
 | 
| 101 |   // Like Unicode test below
 | 
| 102 |   indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_x_"), 0);
 | 
| 103 |   ASSERT(indices != nullptr);
 | 
| 104 |   ASSERT_EQ_FMT(2, len(indices), "%d");
 | 
| 105 |   ASSERT_EQ_FMT(0, indices->at(0), "%d");
 | 
| 106 |   ASSERT_EQ_FMT(3, indices->at(1), "%d");
 | 
| 107 | 
 | 
| 108 |   // TODO(unicode)
 | 
| 109 | #if 0
 | 
| 110 |   //indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_\u03bc_"), 0);
 | 
| 111 |   indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_μ_"), 0);
 | 
| 112 |   ASSERT(indices != nullptr);
 | 
| 113 |   ASSERT_EQ_FMT(2, len(indices), "%d");
 | 
| 114 |   ASSERT_EQ_FMT(0, indices->at(0), "%d");
 | 
| 115 |   ASSERT_EQ_FMT(0, indices->at(0), "%d");
 | 
| 116 | #endif
 | 
| 117 | 
 | 
| 118 |   Tuple2<int, int>* result;
 | 
| 119 |   BigStr* s = StrFromC("oXooXoooXoX");
 | 
| 120 |   result = libc::regex_first_group_match(StrFromC("(X.)"), s, 0);
 | 
| 121 |   ASSERT_EQ_FMT(1, result->at0(), "%d");
 | 
| 122 |   ASSERT_EQ_FMT(3, result->at1(), "%d");
 | 
| 123 | 
 | 
| 124 |   result = libc::regex_first_group_match(StrFromC("(X.)"), s, 3);
 | 
| 125 |   ASSERT_EQ_FMT(4, result->at0(), "%d");
 | 
| 126 |   ASSERT_EQ_FMT(6, result->at1(), "%d");
 | 
| 127 | 
 | 
| 128 |   result = libc::regex_first_group_match(StrFromC("(X.)"), s, 6);
 | 
| 129 |   ASSERT_EQ_FMT(8, result->at0(), "%d");
 | 
| 130 |   ASSERT_EQ_FMT(10, result->at1(), "%d");
 | 
| 131 | 
 | 
| 132 |   PASS();
 | 
| 133 | }
 | 
| 134 | 
 | 
| 135 | TEST glob_test() {
 | 
| 136 |   // This depends on the file system
 | 
| 137 |   auto files = libc::glob(StrFromC("*.testdata"));
 | 
| 138 |   // 3 files are made by the shell wrapper
 | 
| 139 |   ASSERT_EQ_FMT(3, len(files), "%d");
 | 
| 140 | 
 | 
| 141 |   print(files->at(0));
 | 
| 142 | 
 | 
| 143 |   auto files2 = libc::glob(StrFromC("*.pyzzz"));
 | 
| 144 |   ASSERT_EQ_FMT(0, len(files2), "%d");
 | 
| 145 | 
 | 
| 146 |   PASS();
 | 
| 147 | }
 | 
| 148 | 
 | 
| 149 | TEST fnmatch_test() {
 | 
| 150 |   BigStr* s1 = (StrFromC("foo.py "))->strip();
 | 
| 151 |   ASSERT(libc::fnmatch(StrFromC("*.py"), s1));
 | 
| 152 |   ASSERT(!libc::fnmatch(StrFromC("*.py"), StrFromC("foo.p")));
 | 
| 153 | 
 | 
| 154 |   // Unicode - ? is byte or code point?
 | 
| 155 |   ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_x_")));
 | 
| 156 | 
 | 
| 157 |   // TODO(unicode)
 | 
| 158 |   // ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_\u03bc_")));
 | 
| 159 |   // ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_μ_")));
 | 
| 160 | 
 | 
| 161 |   // extended glob
 | 
| 162 |   ASSERT(libc::fnmatch(StrFromC("*(foo|bar).py"), StrFromC("foo.py")));
 | 
| 163 |   ASSERT(!libc::fnmatch(StrFromC("*(foo|bar).py"), StrFromC("foo.p")));
 | 
| 164 | 
 | 
| 165 |   PASS();
 | 
| 166 | }
 | 
| 167 | 
 | 
| 168 | TEST for_test_coverage() {
 | 
| 169 |   // Sometimes we're not connected to a terminal
 | 
| 170 |   try {
 | 
| 171 |     libc::get_terminal_width();
 | 
| 172 |   } catch (IOError_OSError* e) {
 | 
| 173 |   }
 | 
| 174 | 
 | 
| 175 |   PASS();
 | 
| 176 | }
 | 
| 177 | 
 | 
| 178 | void FindAll(const char* p, const char* s) {
 | 
| 179 |   regex_t pat;
 | 
| 180 | 
 | 
| 181 |   int cflags = REG_EXTENDED;
 | 
| 182 |   if (regcomp(&pat, p, cflags) != 0) {
 | 
| 183 |     FAIL();
 | 
| 184 |   }
 | 
| 185 |   int outlen = pat.re_nsub + 1;  // number of captures
 | 
| 186 | 
 | 
| 187 |   // TODO: Could statically allocate 99, and assert that re_nsub is less than
 | 
| 188 |   // 99.  Would speed up loops.
 | 
| 189 |   regmatch_t* pmatch =
 | 
| 190 |       static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
 | 
| 191 | 
 | 
| 192 |   int cur_pos = 0;
 | 
| 193 |   // int n = strlen(s);
 | 
| 194 |   while (true) {
 | 
| 195 |     // Necessary so ^ doesn't match in the middle!
 | 
| 196 |     int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
 | 
| 197 |     bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
 | 
| 198 | 
 | 
| 199 |     if (!match) {
 | 
| 200 |       break;
 | 
| 201 |     }
 | 
| 202 |     int i;
 | 
| 203 |     for (i = 0; i < outlen; i++) {
 | 
| 204 |       int start = pmatch[i].rm_so;
 | 
| 205 |       int end = pmatch[i].rm_eo;
 | 
| 206 |       int len = end - start;
 | 
| 207 |       BigStr* m = StrFromC(s + cur_pos + start, len);
 | 
| 208 |       log("%d GROUP %d (%d .. %d) = [%s]", cur_pos, i, start, end, m->data_);
 | 
| 209 |     }
 | 
| 210 |     log("");
 | 
| 211 |     int match_len = pmatch[0].rm_eo;
 | 
| 212 |     if (match_len == 0) {
 | 
| 213 |       break;
 | 
| 214 |     }
 | 
| 215 |     cur_pos += match_len;
 | 
| 216 |   }
 | 
| 217 | 
 | 
| 218 |   free(pmatch);
 | 
| 219 |   regfree(&pat);
 | 
| 220 | }
 | 
| 221 | 
 | 
// Subject string shared by the FindAll() tests; it contains matches that sit
// directly next to each other.
const char* s{"a345y-axy- there b789y- cy-"};
 | 
| 224 | 
 | 
| 225 | TEST regex_unanchored() {
 | 
| 226 |   const char* unanchored = "[abc]([0-9]*)(x?)(y)-";
 | 
| 227 |   FindAll(unanchored, s);
 | 
| 228 | 
 | 
| 229 |   PASS();
 | 
| 230 | }
 | 
| 231 | 
 | 
| 232 | TEST regex_caret() {
 | 
| 233 |   const char* anchored = "^[abc]([0-9]*)(x?)(y)-";
 | 
| 234 |   FindAll(anchored, s);
 | 
| 235 | 
 | 
| 236 |   PASS();
 | 
| 237 | }
 | 
| 238 | 
 | 
| 239 | TEST regex_lexer() {
 | 
| 240 |   // like the Yaks / Make-a-Lisp pattern
 | 
| 241 |   const char* lexer = "([a-z]+)|([0-9]+)|([ ]+)|([+-])";
 | 
| 242 |   FindAll(lexer, s);
 | 
| 243 | 
 | 
| 244 |   PASS();
 | 
| 245 | }
 | 
| 246 | 
 | 
| 247 | TEST regex_repeat_with_capture() {
 | 
| 248 |   const char* lexer = "(([a-z]+)([0-9]+)-)*((A+)|(Z+))*";
 | 
| 249 |   FindAll(lexer, "a0-b1-c2-AAZZZA");
 | 
| 250 |   // Groups are weird
 | 
| 251 |   // whole match 0: a0-b1-c2-
 | 
| 252 |   //             1: c2-      # last repetition
 | 
| 253 |   //             2: c        # last one
 | 
| 254 |   //             3: 2        # last one
 | 
| 255 |   //
 | 
| 256 |   // And then there's an empty match
 | 
| 257 |   //
 | 
| 258 |   // Ideas:
 | 
| 259 |   // - disallow nested groups in Eggex?
 | 
| 260 |   // - I really care about the inner ones -- groups 2 and 3
 | 
| 261 |   // - I want flat groups
 | 
| 262 | 
 | 
| 263 |   PASS();
 | 
| 264 | }
 | 
| 265 | 
 | 
| 266 | // Disallow this in eggex, as well as the above
 | 
| 267 | TEST regex_nested_capture() {
 | 
| 268 |   const char* lexer = "(([a-z]+)([0-9]+))";
 | 
| 269 |   FindAll(lexer, "a0");
 | 
| 270 |   PASS();
 | 
| 271 | }
 | 
| 272 | 
 | 
| 273 | // I think we allow this in eggex
 | 
| 274 | TEST regex_alt_with_capture() {
 | 
| 275 |   const char* lexer = "([a-z]+)|([0-9]+)(-)";
 | 
| 276 |   FindAll(lexer, "x-");
 | 
| 277 |   FindAll(lexer, "7-");
 | 
| 278 |   PASS();
 | 
| 279 | }
 | 
| 280 | 
 | 
| 281 | TEST regex_unicode() {
 | 
| 282 |   regex_t pat;
 | 
| 283 | 
 | 
| 284 |   // 1 or 2 bytes
 | 
| 285 |   // const char* p = "_..?_";
 | 
| 286 |   // const char* p = "_[^a]_";
 | 
| 287 |   const char* p = "_._";  // 1 byte, not code point?
 | 
| 288 | 
 | 
| 289 |   if (regcomp(&pat, p, REG_EXTENDED) != 0) {
 | 
| 290 |     FAIL();
 | 
| 291 |   }
 | 
| 292 |   int outlen = pat.re_nsub + 1;  // number of captures
 | 
| 293 |   regmatch_t* pmatch =
 | 
| 294 |       static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
 | 
| 295 | 
 | 
| 296 |   int result;
 | 
| 297 | 
 | 
| 298 |   const char* bad = "_xyz_";
 | 
| 299 |   result = regexec(&pat, bad, outlen, pmatch, 0);
 | 
| 300 |   ASSERT_EQ_FMT(1, result, "%d");  // does not match
 | 
| 301 | 
 | 
| 302 |   const char* a = "_x_";
 | 
| 303 |   result = regexec(&pat, a, outlen, pmatch, 0);
 | 
| 304 |   ASSERT_EQ_FMT(0, result, "%d");
 | 
| 305 | 
 | 
| 306 |   // Doesn't change anything
 | 
| 307 |   // int lc_what = LC_ALL;
 | 
| 308 |   int lc_what = LC_CTYPE;
 | 
| 309 | 
 | 
| 310 |   // char* saved_locale = setlocale(LC_ALL, "");
 | 
| 311 |   // char* saved_locale = setlocale(LC_ALL, NULL);
 | 
| 312 | 
 | 
| 313 |   // char* saved_locale = setlocale(lc_what, NULL);
 | 
| 314 | 
 | 
| 315 | #if 0
 | 
| 316 |   // Doesn't change anything?
 | 
| 317 |   //if (setlocale(LC_CTYPE, "C.utf8") == NULL) {
 | 
| 318 |   if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL) {
 | 
| 319 |     log("Couldn't set locale to C.utf8");
 | 
| 320 |     FAIL();
 | 
| 321 |   }
 | 
| 322 | #endif
 | 
| 323 | 
 | 
| 324 |   // const char* u = "_μ_";
 | 
| 325 |   const char* u = "_\u03bc_";
 | 
| 326 |   log("a = %d bytes", strlen(a));
 | 
| 327 |   log("u = %d bytes", strlen(u));
 | 
| 328 |   result = regexec(&pat, u, outlen, pmatch, 0);
 | 
| 329 | 
 | 
| 330 | #if 0
 | 
| 331 |   if (setlocale(lc_what, saved_locale) == NULL) {
 | 
| 332 |     log("Couldn't restore locale");
 | 
| 333 |     FAIL();
 | 
| 334 |   }
 | 
| 335 | #endif
 | 
| 336 | 
 | 
| 337 |   free(pmatch);  // Clean up before test failures
 | 
| 338 |   regfree(&pat);
 | 
| 339 | 
 | 
| 340 |   // TODO(unicode)
 | 
| 341 |   // ASSERT_EQ_FMT(0, result, "%d");
 | 
| 342 | 
 | 
| 343 |   PASS();
 | 
| 344 | }
 | 
| 345 | 
 | 
| 346 | TEST casefold_test() {
 | 
| 347 | #if 0
 | 
| 348 |   // Turkish
 | 
| 349 |   if (setlocale(LC_CTYPE, "tr_TR.utf8") == NULL) {
 | 
| 350 |     log("Couldn't set locale to tr_TR.utf8");
 | 
| 351 |     FAIL();
 | 
| 352 |   }
 | 
| 353 | #endif
 | 
| 354 | 
 | 
| 355 |   // LC_CTYPE_MASK instead of LC_CTYPE
 | 
| 356 |   locale_t turkish = newlocale(LC_CTYPE_MASK, "tr_TR.utf8", NULL);
 | 
| 357 | 
 | 
| 358 |   int u = toupper('i');
 | 
| 359 |   int wu = towupper('i');
 | 
| 360 |   int wul = towupper_l('i', turkish);
 | 
| 361 | 
 | 
| 362 |   // Regular: upper case i is I, 73
 | 
| 363 |   // Turkish: upper case is 304
 | 
| 364 |   log("upper = %d", u);
 | 
| 365 |   log("wide upper = %d", wu);
 | 
| 366 |   log("wide upper locale = %d", wul);
 | 
| 367 | 
 | 
| 368 |   freelocale(turkish);
 | 
| 369 | 
 | 
| 370 |   PASS();
 | 
| 371 | }
 | 
| 372 | 
 | 
| 373 | GREATEST_MAIN_DEFS();
 | 
| 374 | 
 | 
| 375 | int main(int argc, char** argv) {
 | 
| 376 |   gHeap.Init();
 | 
| 377 | 
 | 
| 378 |   GREATEST_MAIN_BEGIN();
 | 
| 379 | 
 | 
| 380 |   RUN_TEST(hostname_test);
 | 
| 381 |   RUN_TEST(realpath_test);
 | 
| 382 |   RUN_TEST(libc_test);
 | 
| 383 |   RUN_TEST(regex_wrapper_test);
 | 
| 384 |   RUN_TEST(glob_test);
 | 
| 385 |   RUN_TEST(fnmatch_test);
 | 
| 386 |   RUN_TEST(for_test_coverage);
 | 
| 387 | 
 | 
| 388 |   RUN_TEST(regex_unanchored);
 | 
| 389 |   RUN_TEST(regex_caret);
 | 
| 390 |   RUN_TEST(regex_lexer);
 | 
| 391 |   RUN_TEST(regex_repeat_with_capture);
 | 
| 392 |   RUN_TEST(regex_alt_with_capture);
 | 
| 393 |   RUN_TEST(regex_nested_capture);
 | 
| 394 |   RUN_TEST(regex_unicode);
 | 
| 395 | 
 | 
| 396 |   // Crashes in CI?  Because of Turkish locale?
 | 
| 397 |   // RUN_TEST(casefold_test);
 | 
| 398 | 
 | 
| 399 |   gHeap.CleanProcessExit();
 | 
| 400 | 
 | 
| 401 |   GREATEST_MAIN_END();
 | 
| 402 |   return 0;
 | 
| 403 | }
 |