| 1 | // data_lang.cc
 | 
| 2 | 
 | 
| 3 | #include "cpp/data_lang.h"
 | 
| 4 | 
 | 
| 5 | #include "data_lang/j8.h"
 | 
| 6 | #include "data_lang/utf8.h"
 | 
| 7 | 
 | 
| 8 | // TODO: remove duplication
 | 
| 9 | #define LOSSY_JSON (1 << 3)
 | 
| 10 | 
 | 
| 11 | namespace {
 | 
| 12 | 
 | 
| 13 | void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
 | 
| 14 |   uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
 | 
| 15 |   uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
 | 
| 16 | 
 | 
| 17 |   buf->WriteConst("b'");
 | 
| 18 | 
 | 
| 19 |   // Set up pointers after writing opening quote
 | 
| 20 |   uint8_t* out = buf->LengthPointer();  // mutated
 | 
| 21 |   uint8_t* out_end = buf->CapacityPointer();
 | 
| 22 | 
 | 
| 23 |   while (true) {
 | 
| 24 |     J8EncodeChunk(&in, in_end, &out, out_end, true);  // Fill as much as we can
 | 
| 25 |     buf->SetLengthFrom(out);
 | 
| 26 | 
 | 
| 27 |     if (in >= in_end) {
 | 
| 28 |       break;
 | 
| 29 |     }
 | 
| 30 | 
 | 
| 31 |     // Same growth policy as below
 | 
| 32 |     capacity = capacity * 3 / 2;
 | 
| 33 |     // printf("[2] new capacity %d\n", capacity);
 | 
| 34 |     buf->EnsureMoreSpace(capacity);
 | 
| 35 | 
 | 
| 36 |     // Recompute pointers
 | 
| 37 |     out = buf->LengthPointer();
 | 
| 38 |     out_end = buf->CapacityPointer();
 | 
| 39 |   }
 | 
| 40 | 
 | 
| 41 |   buf->WriteConst("'");
 | 
| 42 | }
 | 
| 43 | 
 | 
| 44 | void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
 | 
| 45 |   uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
 | 
| 46 |   uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
 | 
| 47 | 
 | 
| 48 |   buf->WriteConst("$'");
 | 
| 49 | 
 | 
| 50 |   // Set up pointers after writing opening quote
 | 
| 51 |   uint8_t* out = buf->LengthPointer();  // mutated
 | 
| 52 |   uint8_t* out_end = buf->CapacityPointer();
 | 
| 53 | 
 | 
| 54 |   while (true) {
 | 
| 55 |     BashDollarEncodeChunk(&in, in_end, &out,
 | 
| 56 |                           out_end);  // Fill as much as we can
 | 
| 57 |     buf->SetLengthFrom(out);
 | 
| 58 | 
 | 
| 59 |     if (in >= in_end) {
 | 
| 60 |       break;
 | 
| 61 |     }
 | 
| 62 | 
 | 
| 63 |     // Same growth policy as below
 | 
| 64 |     capacity = capacity * 3 / 2;
 | 
| 65 |     // printf("[2] new capacity %d\n", capacity);
 | 
| 66 |     buf->EnsureMoreSpace(capacity);
 | 
| 67 | 
 | 
| 68 |     // Recompute pointers
 | 
| 69 |     out = buf->LengthPointer();
 | 
| 70 |     out_end = buf->CapacityPointer();
 | 
| 71 |   }
 | 
| 72 | 
 | 
| 73 |   buf->WriteConst("'");
 | 
| 74 | }
 | 
| 75 | 
 | 
| 76 | // Style is COPIED from pyj8::WriteString()
 | 
| 77 | // Functionality is like j8_libc.c ShellEncodeString, that is:
 | 
| 78 | //
 | 
| 79 | // call BourneShellEncodeChunk()
 | 
| 80 | // then either
 | 
| 81 | //   WriteBString()
 | 
| 82 | //   WriteBashDollarString()
 | 
| 83 | 
 | 
| 84 | void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
 | 
| 85 |   uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
 | 
| 86 |   uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
 | 
| 87 | 
 | 
| 88 |   // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
 | 
| 89 |   int capacity = len(s) + 3 + 2;     // 3 for quotes, 2 potential \" \n
 | 
| 90 |   if (capacity < J8_MIN_CAPACITY) {  // account for J8_MAX_BYTES_PER_INPUT_BYTE
 | 
| 91 |     capacity = J8_MIN_CAPACITY;
 | 
| 92 |   }
 | 
| 93 |   // printf("[1] capacity %d\n", capacity);
 | 
| 94 | 
 | 
| 95 |   buf->EnsureMoreSpace(capacity);
 | 
| 96 | 
 | 
| 97 |   int begin = buf->Length();  // maybe Truncate to this position
 | 
| 98 |   buf->WriteConst("'");
 | 
| 99 | 
 | 
| 100 |   // Set up pointers after writing opening quote
 | 
| 101 |   uint8_t* out = buf->LengthPointer();  // mutated
 | 
| 102 |   uint8_t* out_end = buf->CapacityPointer();
 | 
| 103 | 
 | 
| 104 |   while (true) {
 | 
| 105 |     // Fill in as much as we can
 | 
| 106 |     int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
 | 
| 107 |     if (cannot_encode) {
 | 
| 108 |       buf->Truncate(begin);
 | 
| 109 |       if (ysh_fallback) {
 | 
| 110 |         WriteBString(s, buf, capacity);  // fall back to b''
 | 
| 111 |       } else {
 | 
| 112 |         WriteBashDollarString(s, buf, capacity);  // fall back to $''
 | 
| 113 |       }
 | 
| 114 |       return;
 | 
| 115 |     }
 | 
| 116 |     buf->SetLengthFrom(out);
 | 
| 117 | 
 | 
| 118 |     // printf("[1] len %d\n", out_buf->len);
 | 
| 119 | 
 | 
| 120 |     if (in >= in_end) {
 | 
| 121 |       break;
 | 
| 122 |     }
 | 
| 123 | 
 | 
| 124 |     // Growth policy: every time through the loop, increase 1.5x
 | 
| 125 |     //
 | 
| 126 |     // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
 | 
| 127 |     // This seems like a reasonable tradeoff between over-allocating and too
 | 
| 128 |     // many realloc().
 | 
| 129 |     capacity = capacity * 3 / 2;
 | 
| 130 |     // printf("[1] new capacity %d\n", capacity);
 | 
| 131 |     buf->EnsureMoreSpace(capacity);
 | 
| 132 | 
 | 
| 133 |     // Recompute pointers
 | 
| 134 |     out = buf->LengthPointer();  // mutated
 | 
| 135 |     out_end = buf->CapacityPointer();
 | 
| 136 |     // printf("[1] out %p out_end %p\n", out, out_end);
 | 
| 137 |   }
 | 
| 138 | 
 | 
| 139 |   buf->WriteConst("'");
 | 
| 140 | }
 | 
| 141 | 
 | 
| 142 | }  // namespace
 | 
| 143 | 
 | 
| 144 | namespace fastfunc {
 | 
| 145 | 
 | 
| 146 | bool CanOmitQuotes(BigStr* s) {
 | 
| 147 |   return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
 | 
| 148 | }
 | 
| 149 | 
 | 
| 150 | BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
 | 
| 151 |   auto buf = Alloc<mylib::BufWriter>();
 | 
| 152 |   int options = j8_fallback ? 0 : LOSSY_JSON;
 | 
| 153 |   pyj8::WriteString(s, options, buf);
 | 
| 154 |   return buf->getvalue();
 | 
| 155 | }
 | 
| 156 | 
 | 
| 157 | BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
 | 
| 158 |   auto buf = Alloc<mylib::BufWriter>();
 | 
| 159 |   ::ShellEncodeString(s, ysh_fallback, buf);
 | 
| 160 |   return buf->getvalue();
 | 
| 161 | }
 | 
| 162 | 
 | 
| 163 | Tuple2<int, int> Utf8DecodeOne(BigStr* s, int start) {
 | 
| 164 |   // Bounds check for safety
 | 
| 165 |   DCHECK(0 <= start && start < len(s));
 | 
| 166 | 
 | 
| 167 |   const unsigned char* string = reinterpret_cast<unsigned char*>(s->data());
 | 
| 168 | 
 | 
| 169 |   Utf8Result_t decode_result;
 | 
| 170 |   utf8_decode(string + start, &decode_result);
 | 
| 171 |   int32_t codepoint_or_error;
 | 
| 172 |   if (decode_result.error) {
 | 
| 173 |     codepoint_or_error = -decode_result.error;
 | 
| 174 |   } else {
 | 
| 175 |     codepoint_or_error = decode_result.codepoint;
 | 
| 176 |   }
 | 
| 177 | 
 | 
| 178 |   return Tuple2<int, int>(codepoint_or_error, decode_result.bytes_read);
 | 
| 179 | }
 | 
| 180 | 
 | 
| 181 | }  // namespace fastfunc
 | 
| 182 | 
 | 
| 183 | namespace pyj8 {
 | 
| 184 | 
 | 
| 185 | bool PartIsUtf8(BigStr* s, int start, int end) {
 | 
| 186 |   Utf8Result result;
 | 
| 187 | 
 | 
| 188 |   for (int i = start; i < end;) {
 | 
| 189 |     utf8_decode(reinterpret_cast<unsigned char*>(s->data_ + i), &result);
 | 
| 190 |     if (result.error) {
 | 
| 191 |       return false;
 | 
| 192 |     }
 | 
| 193 | 
 | 
| 194 |     i += result.bytes_read;
 | 
| 195 |   }
 | 
| 196 | 
 | 
| 197 |   return true;
 | 
| 198 | }
 | 
| 199 | 
 | 
| 200 | void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
 | 
| 201 |   bool j8_fallback = !(options & LOSSY_JSON);
 | 
| 202 | 
 | 
| 203 |   uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
 | 
| 204 |   uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
 | 
| 205 | 
 | 
| 206 |   // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
 | 
| 207 |   int capacity = len(s) + 3 + 2;     // 3 for quotes, 2 potential \" \n
 | 
| 208 |   if (capacity < J8_MIN_CAPACITY) {  // account for J8_MAX_BYTES_PER_INPUT_BYTE
 | 
| 209 |     capacity = J8_MIN_CAPACITY;
 | 
| 210 |   }
 | 
| 211 |   // printf("[1] capacity %d\n", capacity);
 | 
| 212 | 
 | 
| 213 |   buf->EnsureMoreSpace(capacity);
 | 
| 214 | 
 | 
| 215 |   int begin = buf->Length();  // maybe Truncate to this position
 | 
| 216 |   buf->WriteConst("\"");
 | 
| 217 | 
 | 
| 218 |   // Set up pointers after writing opening quote
 | 
| 219 |   uint8_t* out = buf->LengthPointer();  // mutated
 | 
| 220 |   uint8_t* out_end = buf->CapacityPointer();
 | 
| 221 | 
 | 
| 222 |   while (true) {
 | 
| 223 |     // Fill in as much as we can
 | 
| 224 |     int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
 | 
| 225 |     if (invalid_utf8 && j8_fallback) {
 | 
| 226 |       buf->Truncate(begin);
 | 
| 227 |       WriteBString(s, buf, capacity);  // fall back to b''
 | 
| 228 |       return;
 | 
| 229 |     }
 | 
| 230 |     buf->SetLengthFrom(out);
 | 
| 231 | 
 | 
| 232 |     // printf("[1] len %d\n", out_buf->len);
 | 
| 233 | 
 | 
| 234 |     if (in >= in_end) {
 | 
| 235 |       break;
 | 
| 236 |     }
 | 
| 237 | 
 | 
| 238 |     // Growth policy: every time through the loop, increase 1.5x
 | 
| 239 |     //
 | 
| 240 |     // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
 | 
| 241 |     // This seems like a reasonable tradeoff between over-allocating and too
 | 
| 242 |     // many realloc().
 | 
| 243 |     capacity = capacity * 3 / 2;
 | 
| 244 |     // printf("[1] new capacity %d\n", capacity);
 | 
| 245 |     buf->EnsureMoreSpace(capacity);
 | 
| 246 | 
 | 
| 247 |     // Recompute pointers
 | 
| 248 |     out = buf->LengthPointer();  // mutated
 | 
| 249 |     out_end = buf->CapacityPointer();
 | 
| 250 |     // printf("[1] out %p out_end %p\n", out, out_end);
 | 
| 251 |   }
 | 
| 252 | 
 | 
| 253 |   buf->WriteConst("\"");
 | 
| 254 | }
 | 
| 255 | 
 | 
| 256 | }  // namespace pyj8
 | 
| 257 | 
 | 
| 258 | namespace j8 {
 | 
| 259 | 
 | 
| 260 | int HeapValueId(value_asdl::value_t* val) {
 | 
| 261 | #ifndef OPTIMIZED
 | 
| 262 |   // ASDL generates headers with HeapTag::Scanned, but HeapTag::FixedSize would
 | 
| 263 |   // also be valid.
 | 
| 264 |   ObjHeader* h = ObjHeader::FromObject(val);
 | 
| 265 |   DCHECK(h->heap_tag == HeapTag::Scanned || h->heap_tag == HeapTag::FixedSize);
 | 
| 266 | #endif
 | 
| 267 | 
 | 
| 268 |   return ObjectId(val);
 | 
| 269 | }
 | 
| 270 | 
 | 
| 271 | }  // namespace j8
 |