1 | /*
|
2 | * Souffle - A Datalog Compiler
|
3 | * Copyright (c) 2021, The Souffle Developers. All rights reserved
|
4 | * Licensed under the Universal Permissive License v 1.0 as shown at:
|
5 | * - https://opensource.org/licenses/UPL
|
6 | * - <souffle root>/licenses/SOUFFLE-UPL.txt
|
7 | */
|
8 |
|
9 | /************************************************************************
|
10 | *
|
11 | * @file StringUtil.h
|
12 | *
|
13 | * @brief Datalog project utilities
|
14 | *
|
15 | ***********************************************************************/
|
16 |
|
17 | #pragma once
|
18 |
|
19 | #include "souffle/RamTypes.h"
|
20 | #include <algorithm>
|
21 | #include <cctype>
|
22 | #include <cstdlib>
|
23 | #include <fstream>
|
24 | #include <limits>
|
25 | #include <set>
|
26 | #include <sstream>
|
27 | #include <stdexcept>
|
28 | #include <string>
|
29 | #include <type_traits>
|
30 | #include <typeinfo>
|
31 | #include <vector>
|
32 |
|
33 | namespace souffle {
|
34 |
|
35 | // Forward declaration
|
36 | inline bool isPrefix(const std::string& prefix, const std::string& element);
|
37 |
|
38 | /**
|
39 | * Converts a string to a RamSigned
|
40 | *
|
41 | * This procedure has similar behaviour to std::stoi/stoll.
|
42 | *
|
43 | * The procedure accepts prefixes 0b (if base = 2) and 0x (if base = 16)
|
44 | * If base = 0, the procedure will try to infer the base from the prefix, if present.
|
45 | */
|
46 | inline RamSigned RamSignedFromString(
|
47 | const std::string& str, std::size_t* position = nullptr, const int base = 10) {
|
48 | RamSigned val;
|
49 |
|
50 | if (base == 0) {
|
51 | if (isPrefix("-0b", str) || isPrefix("0b", str)) {
|
52 | return RamSignedFromString(str, position, 2);
|
53 | } else if (isPrefix("-0x", str) || isPrefix("0x", str)) {
|
54 | return RamSignedFromString(str, position, 16);
|
55 | } else {
|
56 | return RamSignedFromString(str, position);
|
57 | }
|
58 | }
|
59 | std::string binaryNumber;
|
60 | bool parsingBinary = base == 2;
|
61 |
|
62 | // stoi/stoll can't handle base 2 prefix by default.
|
63 | if (parsingBinary) {
|
64 | if (isPrefix("-0b", str)) {
|
65 | binaryNumber = "-" + str.substr(3);
|
66 | } else if (isPrefix("0b", str)) {
|
67 | binaryNumber = str.substr(2);
|
68 | }
|
69 | }
|
70 | const std::string& tmp = parsingBinary ? binaryNumber : str;
|
71 |
|
72 | #if RAM_DOMAIN_SIZE == 64
|
73 | val = std::stoll(tmp, position, base);
|
74 | #else
|
75 | val = std::stoi(tmp, position, base);
|
76 | #endif
|
77 |
|
78 | if (parsingBinary && position != nullptr) {
|
79 | *position += 2;
|
80 | }
|
81 |
|
82 | return val;
|
83 | }
|
84 |
|
85 | /**
|
86 | * Converts a string to a RamFloat
|
87 | */
|
88 | inline RamFloat RamFloatFromString(const std::string& str, std::size_t* position = nullptr) {
|
89 | RamFloat val;
|
90 | #if RAM_DOMAIN_SIZE == 64
|
91 | val = std::stod(str, position);
|
92 | #else
|
93 | val = std::stof(str, position);
|
94 | #endif
|
95 | return static_cast<RamFloat>(val);
|
96 | }
|
97 | /**
|
98 | * Converts a string to a RamUnsigned
|
99 | *
|
100 | * This procedure has similar behaviour to std::stoul/stoull.
|
101 | *
|
102 | * The procedure accepts prefixes 0b (if base = 2) and 0x (if base = 16)
|
103 | * If base = 0, the procedure will try to infer the base from the prefix, if present.
|
104 | */
|
105 | inline RamUnsigned RamUnsignedFromString(
|
106 | const std::string& str, std::size_t* position = nullptr, const int base = 10) {
|
107 | // Be default C++ (stoul) allows unsigned numbers starting with "-".
|
108 | if (isPrefix("-", str)) {
|
109 | throw std::invalid_argument("Unsigned number can't start with minus.");
|
110 | }
|
111 |
|
112 | if (base == 0) {
|
113 | if (isPrefix("0b", str)) {
|
114 | return RamUnsignedFromString(str, position, 2);
|
115 | } else if (isPrefix("0x", str)) {
|
116 | return RamUnsignedFromString(str, position, 16);
|
117 | } else {
|
118 | return RamUnsignedFromString(str, position);
|
119 | }
|
120 | }
|
121 |
|
122 | // stoul/stoull can't handle binary prefix by default.
|
123 | std::string binaryNumber;
|
124 | bool parsingBinary = false;
|
125 | if (base == 2 && isPrefix("0b", str)) {
|
126 | binaryNumber = str.substr(2);
|
127 | parsingBinary = true;
|
128 | }
|
129 | const std::string& tmp = parsingBinary ? binaryNumber : str;
|
130 |
|
131 | RamUnsigned val;
|
132 | #if RAM_DOMAIN_SIZE == 64
|
133 | val = std::stoull(tmp, position, base);
|
134 | #else
|
135 | val = std::stoul(tmp, position, base);
|
136 | #endif
|
137 |
|
138 | if (parsingBinary && position != nullptr) {
|
139 | *position += 2;
|
140 | }
|
141 |
|
142 | // check if it's safe to cast (stoul returns unsigned long)
|
143 | if (val > std::numeric_limits<RamUnsigned>::max()) {
|
144 | throw std::invalid_argument("Unsigned number of of bounds");
|
145 | }
|
146 |
|
147 | return static_cast<RamUnsigned>(val);
|
148 | }
|
149 |
|
150 | /**
|
151 | * Can a string be parsed as RamSigned.
|
152 | *
|
153 | * Souffle (parser, not fact file readers) accepts: hex, binary and base 10.
|
154 | * Integer can be negative, in all 3 formats this means that it
|
155 | * starts with minus (c++ default semantics).
|
156 | */
|
157 | inline bool canBeParsedAsRamSigned(const std::string& string) {
|
158 | std::size_t charactersRead = 0;
|
159 |
|
160 | try {
|
161 | RamSignedFromString(string, &charactersRead, 0);
|
162 | } catch (...) {
|
163 | return false;
|
164 | }
|
165 |
|
166 | return charactersRead == string.size();
|
167 | }
|
168 |
|
169 | /**
|
170 | * Can a string be parsed as RamUnsigned.
|
171 | *
|
172 | * Souffle accepts: hex, binary and base 10.
|
173 | */
|
174 | inline bool canBeParsedAsRamUnsigned(const std::string& string) {
|
175 | std::size_t charactersRead = 0;
|
176 | try {
|
177 | RamUnsignedFromString(string, &charactersRead, 0);
|
178 | } catch (...) {
|
179 | return false;
|
180 | }
|
181 | return charactersRead == string.size();
|
182 | }
|
183 |
|
184 | /**
|
185 | * Can a string be parsed as RamFloat.
|
186 | */
|
187 | inline bool canBeParsedAsRamFloat(const std::string& string) {
|
188 | std::size_t charactersRead = 0;
|
189 | try {
|
190 | RamFloatFromString(string, &charactersRead);
|
191 | } catch (...) {
|
192 | return false;
|
193 | }
|
194 | return charactersRead == string.size();
|
195 | }
|
196 |
|
197 | #if RAM_DOMAIN_SIZE == 64
|
198 | inline RamDomain stord(const std::string& str, std::size_t* pos = nullptr, int base = 10) {
|
199 | return static_cast<RamDomain>(std::stoull(str, pos, base));
|
200 | }
|
201 | #elif RAM_DOMAIN_SIZE == 32
|
202 | inline RamDomain stord(const std::string& str, std::size_t* pos = nullptr, int base = 10) {
|
203 | return static_cast<RamDomain>(std::stoul(str, pos, base));
|
204 | }
|
205 | #else
|
206 | #error RAM Domain is neither 32bit nor 64bit
|
207 | #endif
|
208 |
|
/**
 * Check whether a string is a sequence of digits.
 *
 * @param str a null-terminated C string, may be nullptr
 * @return false for nullptr or if any character is not a decimal digit;
 *         true otherwise (this includes the empty string)
 */
inline bool isNumber(const char* str) {
    if (str == nullptr) {
        return false;
    }

    for (; *str != '\0'; ++str) {
        // isdigit requires a value representable as unsigned char; passing a
        // negative plain char would be undefined behaviour.
        if (std::isdigit(static_cast<unsigned char>(*str)) == 0) {
            return false;
        }
    }
    return true;
}
|
225 |
|
/**
 * Trivial overload of the generic string conversion: a string is already a
 * string, so the argument itself is handed back (by reference, no copy).
 */
inline const std::string& toString(const std::string& str) { return str; }
|
232 |
|
namespace detail {

/**
 * Type trait telling whether values of type T can be written to a std::ostream.
 * Primary template: by default a type is considered not printable.
 */
template <typename T, typename = void>
struct is_printable : std::false_type {};

/**
 * Specialisation selected whenever the expression `ostream << T` is well-formed.
 */
template <typename T>
struct is_printable<T, std::void_t<decltype(std::declval<std::ostream&>() << std::declval<T>())>>
        : std::true_type {};

/**
 * Type trait telling whether type T offers a `printHTML(std::ostream&)` member.
 * Primary template: by default a type is considered not HTML-printable.
 */
template <typename T, typename = void>
struct is_html_printable : std::false_type {};

/**
 * Specialisation selected whenever `T().printHTML(ostream)` is well-formed.
 */
template <typename T>
struct is_html_printable<T, std::void_t<decltype(std::declval<T>().printHTML(std::declval<std::ostream&>()))>>
        : std::true_type {};

}  // namespace detail
|
260 |
|
261 | /**
|
262 | * A generic function converting arbitrary objects to strings by utilizing
|
263 | * their print capability.
|
264 | *
|
265 | * This function is mainly intended for implementing test cases and debugging
|
266 | * operations.
|
267 | */
|
268 | template <typename T>
|
269 | typename std::enable_if<detail::is_printable<T>::value, std::string>::type toString(const T& value) {
|
270 | // write value into stream and return result
|
271 | std::stringstream ss;
|
272 | ss << value;
|
273 | return ss.str();
|
274 | }
|
275 |
|
276 | /**
|
277 | * A fallback for the to-string function in case an unprintable object is supposed
|
278 | * to be printed.
|
279 | */
|
280 | template <typename T>
|
281 | typename std::enable_if<!detail::is_printable<T>::value, std::string>::type toString(const T&) {
|
282 | std::stringstream ss;
|
283 | ss << "(print for type ";
|
284 | ss << typeid(T).name();
|
285 | ss << " not supported)";
|
286 | return ss.str();
|
287 | }
|
288 |
|
289 | template <typename T>
|
290 | auto toHtml(const T& obj) -> typename std::enable_if<detail::is_html_printable<T>::value, std::string>::type {
|
291 | std::stringstream out;
|
292 | obj.printHTML(out);
|
293 | return out.str();
|
294 | }
|
295 |
|
296 | /** Fallback to `toString` */
|
297 | template <typename T>
|
298 | auto toHtml(const T& obj) ->
|
299 | typename std::enable_if<not detail::is_html_printable<T>::value, std::string>::type {
|
300 | return toString(obj);
|
301 | }
|
302 |
|
303 | // -------------------------------------------------------------------------------
|
304 | // String Utils
|
305 | // -------------------------------------------------------------------------------
|
306 |
|
/**
 * Determine if one string is a prefix of another.
 *
 * @param prefix the candidate prefix (the empty string is a prefix of everything)
 * @param element the string that is tested
 * @return true iff `element` starts with `prefix`
 */
inline bool isPrefix(const std::string& prefix, const std::string& element) {
    // compare() clamps the range to the size of `element`, so a prefix longer
    // than the element can never compare equal
    return element.compare(0, prefix.size(), prefix) == 0;
}
|
324 |
|
/**
 * Determines whether the given value string ends with the given end string.
 *
 * @param value the string that is tested
 * @param ending the candidate suffix (the empty string is a suffix of everything)
 * @return true iff `value` ends with `ending`
 */
inline bool endsWith(const std::string& value, const std::string& ending) {
    const std::size_t valueLen = value.size();
    const std::size_t endingLen = ending.size();
    // a suffix can not be longer than the string itself
    return endingLen <= valueLen && value.compare(valueLen - endingLen, endingLen, ending) == 0;
}
|
335 |
|
/**
 * Splits a string at every occurrence of a delimiter.
 *
 * The returned views refer to the memory of `toSplit`; they are only valid as
 * long as the underlying character data is.
 *
 * @param toSplit the text to split
 * @param delimiter the separator text
 * @return the parts between the delimiters, in order. Adjacent, leading or
 *         trailing delimiters yield empty parts. If `toSplit` or `delimiter`
 *         is empty there is nothing to split and the result is the single
 *         element `toSplit`.
 */
inline std::vector<std::string_view> splitView(std::string_view toSplit, std::string_view delimiter) {
    // An empty delimiter matches everywhere without consuming anything; treat it
    // as "no delimiter" instead of running past the end of the input.
    if (toSplit.empty() || delimiter.empty()) return {toSplit};

    std::vector<std::string_view> parts;
    for (auto tail = toSplit;;) {
        const auto pos = tail.find(delimiter);
        // for pos == npos this takes the complete remainder
        parts.push_back(tail.substr(0, pos));
        if (pos == std::string_view::npos) break;

        tail = tail.substr(pos + delimiter.size());
    }

    return parts;
}
|
355 |
|
356 | /**
|
357 | * Splits a string given a delimiter
|
358 | */
|
359 | inline std::vector<std::string> splitString(std::string_view str, char delimiter) {
|
360 | std::vector<std::string> xs;
|
361 | for (auto&& x : splitView(str, std::string_view{&delimiter, 1}))
|
362 | xs.push_back(std::string(x));
|
363 | return xs;
|
364 | }
|
365 |
|
366 | /**
|
367 | * Strips the prefix of a given string if it exists. No change otherwise.
|
368 | */
|
369 | inline std::string stripPrefix(const std::string& prefix, const std::string& element) {
|
370 | return isPrefix(prefix, element) ? element.substr(prefix.length()) : element;
|
371 | }
|
372 |
|
/**
 * Stringify a string: backslash, semicolon, double-quote, newline and tab are
 * replaced by the two character sequences `\\`, `\;`, `\"`, `\n` and `\t`.
 * All other characters are copied unchanged.
 */
inline std::string stringify(const std::string& input) {
    std::string escaped;
    escaped.reserve(input.size());

    // Single pass over the input; every special character expands to a
    // backslash followed by its marker character.
    for (const char ch : input) {
        switch (ch) {
            case '\\': escaped += "\\\\"; break;
            case ';': escaped += "\\;"; break;
            case '"': escaped += "\\\""; break;
            case '\n': escaped += "\\n"; break;
            case '\t': escaped += "\\t"; break;
            default: escaped += ch; break;
        }
    }
    return escaped;
}
|
411 |
|
/**
 * Escape JSON string: every double-quote character is preceded by a backslash.
 * No other character is modified.
 */
inline std::string escapeJSONstring(const std::string& JSONstr) {
    std::string escaped;
    escaped.reserve(JSONstr.size());

    // Copy all characters, inserting a backslash in front of each quote
    for (const char c : JSONstr) {
        if (c == '"') {
            escaped += '\\';
        }
        escaped += c;
    }
    return escaped;
}
|
427 |
|
/**
 * Turns a string into a valid C++ identifier by replacing every offending
 * character with an underscore: the first character unless it is a letter,
 * and any later character that is neither alphanumeric nor an underscore.
 *
 * Note that this does not ensure the uniqueness of the identifiers returned.
 */
inline std::string identifier(std::string id) {
    for (std::size_t i = 0; i < id.length(); i++) {
        // The <cctype> classifiers require a value representable as unsigned
        // char; passing a negative plain char would be undefined behaviour.
        const auto ch = static_cast<unsigned char>(id[i]);
        const bool invalidStart = i == 0 && std::isalpha(ch) == 0;
        const bool invalidChar = std::isalnum(ch) == 0 && id[i] != '_';
        if (invalidStart || invalidChar) {
            id[i] = '_';
        }
    }
    return id;
}
|
437 |
|
438 | // TODO (b-scholz): tidy up unescape/escape functions
|
439 |
|
/**
 * Replaces every occurrence of `needle` in `inputString` by `replacement`.
 *
 * The search continues behind each inserted replacement, so text introduced by
 * a replacement is never matched again.
 *
 * @param inputString the text to process
 * @param needle the sequence to look for; if empty the input is returned unchanged
 * @param replacement the text substituted for each occurrence
 * @return a copy of the input with all occurrences replaced
 */
inline std::string unescape(
        const std::string& inputString, const std::string& needle, const std::string& replacement) {
    std::string result = inputString;
    // An empty needle matches at every position and would never terminate.
    if (needle.empty()) {
        return result;
    }

    std::size_t pos = 0;
    while ((pos = result.find(needle, pos)) != std::string::npos) {
        result.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
    return result;
}
|
450 |
|
451 | inline std::string unescape(const std::string& inputString) {
|
452 | std::string unescaped = unescape(inputString, "\\\"", "\"");
|
453 | unescaped = unescape(unescaped, "\\t", "\t");
|
454 | unescaped = unescape(unescaped, "\\r", "\r");
|
455 | unescaped = unescape(unescaped, "\\n", "\n");
|
456 | return unescaped;
|
457 | }
|
458 |
|
/**
 * Replaces every occurrence of `needle` in `inputString` by `replacement`.
 *
 * The search continues behind each inserted replacement, so text introduced by
 * a replacement is never matched again.
 *
 * @param inputString the text to process
 * @param needle the sequence to look for; if empty the input is returned unchanged
 * @param replacement the text substituted for each occurrence
 * @return a copy of the input with all occurrences replaced
 */
inline std::string escape(
        const std::string& inputString, const std::string& needle, const std::string& replacement) {
    std::string result = inputString;
    // An empty needle matches at every position and would never terminate.
    if (needle.empty()) {
        return result;
    }

    std::size_t pos = 0;
    while ((pos = result.find(needle, pos)) != std::string::npos) {
        result.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
    return result;
}
|
469 |
|
470 | inline std::string escape(const std::string& inputString) {
|
471 | std::string escaped = escape(inputString, "\"", "\\\"");
|
472 | escaped = escape(escaped, "\t", "\\t");
|
473 | escaped = escape(escaped, "\r", "\\r");
|
474 | escaped = escape(escaped, "\n", "\\n");
|
475 | return escaped;
|
476 | }
|
477 |
|
/**
 * Writes `str` to an output stream, emitting `esc` in front of every character
 * contained in `needs_escape`.
 *
 * @param os the stream (or stream-like object supporting `<<`) to write to;
 *        may be a temporary or an existing stream
 * @param str the text to write
 * @param needs_escape the characters that have to be escaped
 * @param esc the escape sequence put in front of each such character
 * @return the stream that was passed in: moved out by value for a temporary,
 *         by reference for an existing (lvalue) stream
 */
template <typename C>
C escape(C&& os, std::string_view str, std::set<char> const& needs_escape, std::string_view esc) {
    for (const char x : str) {
        if (needs_escape.count(x) != 0) {
            os << esc;
        }
        os << x;
    }

    // The return type C is a value type for rvalue arguments and an lvalue
    // reference for lvalue arguments, so non-copyable streams work either way.
    return std::forward<C>(os);
}
|
489 |
|
490 | inline std::string escape(std::string_view str, std::set<char> const& needs_escape, std::string_view esc) {
|
491 | return escape(std::stringstream{}, str, needs_escape, esc).str();
|
492 | }
|
493 |
|
494 | } // end namespace souffle
|