| 1 | #!/usr/bin/env python2
 | 
| 2 | """
 | 
| 3 | oheap2.py
 | 
| 4 | 
 | 
| 5 | OVM2 stores objects in a data structure called OHeap2.  It has the following
 | 
| 6 | concepts.
 | 
| 7 | 
 | 
| 8 | - Handles: 4-byte / 32-bit integers that point to Cells.
 | 
| 9 | - Cells: 16-byte structs with a type tag, in a contiguous array.  This array is
 | 
| 10 |   scanned by the GC.
 | 
| 11 | - Slabs: Variable-length arrays of memory that are owned by a Cell.  Each slab
 | 
| 12 |   may be malloc'd individually.
 | 
| 13 |   - In the case of a string, it's opaque to the GC.
 | 
| 14 |   - In the case of a tuple or struct, it must be scanned by the GC.
 | 
| 15 | 
 | 
| 16 | This is called the "4-16-N design".
 | 
| 17 | 
 | 
| 18 | Every cell has a type tag, which is simply an integer.  Negative integers
 | 
| 19 | indicate native types implemented in C.  Positive integers are for user-defined
 | 
| 20 | types.
 | 
| 21 | 
 | 
| 22 | Cells also have an is_slab bit and a length for small.  TODO: is_slab should be
 | 
| 23 | is_small?
 | 
| 24 | 
 | 
| 25 | Operations on OHeap
 | 
| 26 | -------------------
 | 
| 27 | 
 | 
| 28 | 1. Allocate a new cell, possibly resizing
 | 
| 29 | 
 | 
| 30 | 2. Garbage Collect
 | 
| 31 |   - Walk all the cells.  Free unused cells along with their slabs.
 | 
| 32 | 
 | 
| 33 | 3. Save to file
 | 
| 34 |   Slabs: Native Pointers to Offsets
 | 
| 35 | 
 | 
| 36 | 3a. Copy an object tree to a fresh oheap?  e.g. for IPC.  I guess this is like "Save".
 | 
| 37 |    - Walk all the cells and slabs
 | 
| 38 | 
 | 
| 39 | 4. Load from file into fresh OHeap
 | 
| 40 |   - Patch Slab Offsets to Native Pointers
 | 
| 41 | 
 | 
| 42 | 4a. Load from file and merge into existing OHeap?
 | 
| 43 |     - Inside a namespace?
 | 
| 44 |    - Import?
 | 
| 45 |    - Patch all cell refs!
 | 
| 46 | 
 | 
| 47 | 5. Make permanent?  Move to another "generation"?  OPy classes and bytecode
 | 
| 48 | perhaps fit in this category.
 | 
| 49 | 
 | 
| 50 | TODO
 | 
| 51 | ----
 | 
| 52 | 
 | 
| 53 | - Bits on handle: whether it's const or not?
 | 
| 54 |   - negative handles could be const?
 | 
| 55 | 
 | 
| 56 | - Singleton values for None/True/False
 | 
| 57 | - interning of strings
 | 
| 58 | - hash tables, and hashes of strings
 | 
| 59 | 
 | 
| 60 | Introspection
 | 
| 61 | -------------
 | 
| 62 | 
 | 
| 63 | These operations should work:
 | 
| 64 | 
 | 
| 65 |   type()
 | 
| 66 |   getattr() -- hm in OVM I don't want to look at base classes
 | 
| 67 |   dir() -- list all attributes.  ditto -- don't look at base classes?
 | 
| 68 |     hm except this returns different stuff for modules, classes, and objects
 | 
| 69 |     it looks at superclasses
 | 
| 70 |   isinstance(obj, typ) - check if the type tag of obj points at typ or a superclass
 | 
| 71 | """
 | 
| 72 | from __future__ import print_function
 | 
| 73 | 
 | 
| 74 | import struct
 | 
| 75 | import sys
 | 
| 76 | import types
 | 
| 77 | 
 | 
| 78 | from core.util import log
 | 
| 79 | 
 | 
| 80 | 
 | 
| 81 | TAG_NONE = -1
 | 
| 82 | TAG_BOOL = -2
 | 
| 83 | TAG_INT = -3
 | 
| 84 | TAG_FLOAT = -4
 | 
| 85 | TAG_STR = -5
 | 
| 86 | TAG_TUPLE = -6
 | 
| 87 | TAG_CODE = -7
 | 
| 88 | 
 | 
| 89 | MIN_SMALL_INT = -(1 << 63)
 | 
| 90 | MAX_SMALL_INT = (1 << 63) - 1
 | 
| 91 | 
 | 
| 92 | MAX_LEN_SMALL_STR = 11  # 12 bytes - 1 for NUL
 | 
| 93 | 
 | 
| 94 | MAX_LEN_SMALL_TUPLE = 3  # 4 Handles in 12 bytes?
 | 
| 95 |                          # We can steal a bit from the tag for the small/big
 | 
| 96 |                          # bit.
 | 
| 97 | 
 | 
| 98 | def u8(i):  # 1 byte unsigned
 | 
| 99 |   return struct.pack('B', i)
 | 
| 100 | 
 | 
| 101 | def i16(i):  # 2 bytes
 | 
| 102 |   return struct.pack('h', i)
 | 
| 103 | 
 | 
| 104 | def i32(i):  # 4 bytes
 | 
| 105 |   return struct.pack('i', i) 
 | 
| 106 | 
 | 
| 107 | def i64(i):  # 8 bytes (long long)
 | 
| 108 |   return struct.pack('q', i)
 | 
| 109 | 
 | 
| 110 | def f64(i):  # 8 byte double
 | 
| 111 |   return struct.pack('d', i)
 | 
| 112 | 
 | 
| 113 | 
 | 
| 114 | def Align4(i):
 | 
| 115 |   """Round up to the nearest multiple of 4.  See unit tests."""
 | 
| 116 |   return ((i-1) | 3) + 1
 | 
| 117 | 
 | 
| 118 | 
 | 
| 119 | def Align16(i):
 | 
| 120 |   """Round up to the nearest multiple of 16.  See unit tests."""
 | 
| 121 |   return ((i-1) | 15) + 1
 | 
| 122 | 
 | 
| 123 | 
 | 
| 124 | class Encoder(object):
 | 
| 125 |   """
 | 
| 126 |   Write objects into an OHeap2 structure that can be lazily loaded.
 | 
| 127 | 
 | 
| 128 |   First pass:
 | 
| 129 |     Append to cells and append to slabs
 | 
| 130 |   Second pass:
 | 
| 131 |     Write slabs as bytes, and then patch offsets in cells?
 | 
| 132 |     Write all the cells
 | 
| 133 |     Write the root object at the front of the file?  Or do it at the end?
 | 
| 134 |     OHeap writes it at the beginning after
 | 
| 135 |   """
 | 
| 136 | 
 | 
| 137 |   def __init__(self):
 | 
| 138 |     self.chunk = bytearray()
 | 
| 139 |     # An array of cells
 | 
| 140 |     self.cells = []
 | 
| 141 |     # Write all these first?  So that the cells can point to them.
 | 
| 142 |     self.slabs = []
 | 
| 143 | 
 | 
| 144 |   def Any(self, obj):
 | 
| 145 |     """
 | 
| 146 |     Encode an object and return its id.
 | 
| 147 |     """
 | 
| 148 |     id_ = len(self.cells)
 | 
| 149 | 
 | 
| 150 |     # TODO: Enforce that None is a singleton.  But what about joining OHeaps?
 | 
| 151 |     if isinstance(obj, types.NoneType):
 | 
| 152 |       self.cells.append((TAG_NONE, False, None))
 | 
| 153 | 
 | 
| 154 |     # TODO: Enforce that True/False is a singleton.  But what about joining
 | 
| 155 |     # OHeaps?
 | 
| 156 |     elif isinstance(obj, bool):
 | 
| 157 |       self.cells.append((TAG_BOOL, False, obj))
 | 
| 158 | 
 | 
| 159 |     elif isinstance(obj, int):
 | 
| 160 |       i = obj
 | 
| 161 |       if MIN_SMALL_INT < i < MAX_SMALL_INT:
 | 
| 162 |         self.cells.append((TAG_INT, False, i))
 | 
| 163 |       else:
 | 
| 164 |         raise NotImplementedError
 | 
| 165 | 
 | 
| 166 |     elif isinstance(obj, float):
 | 
| 167 |       raise NotImplementedError
 | 
| 168 | 
 | 
| 169 |     # TODO: Identical strings could be "interned" here.
 | 
| 170 |     elif isinstance(obj, str):
 | 
| 171 |       s = obj
 | 
| 172 |       n = len(s)
 | 
| 173 |       if n < MAX_LEN_SMALL_STR:
 | 
| 174 |         self.cells.append((TAG_STR, False, s))
 | 
| 175 |       else:
 | 
| 176 |         # length and they payload
 | 
| 177 |         slab_index = len(self.slabs)
 | 
| 178 |         self.slabs.append((n, s))
 | 
| 179 |         self.cells.append((TAG_STR, True, slab_index))
 | 
| 180 | 
 | 
| 181 |     elif isinstance(obj, tuple):
 | 
| 182 |       t = obj
 | 
| 183 |       refs = []
 | 
| 184 |       for item in t:
 | 
| 185 |         refs.append(self.Any(item))  # Depth-first.
 | 
| 186 | 
 | 
| 187 |       # Compute ID after adding all the children.
 | 
| 188 |       id_ = len(self.cells)
 | 
| 189 | 
 | 
| 190 |       n = len(t)
 | 
| 191 |       if n < MAX_LEN_SMALL_TUPLE:
 | 
| 192 |         # TODO: How do we know how long it is?
 | 
| 193 |         self.cells.append((TAG_TUPLE, False, refs))
 | 
| 194 |       else:
 | 
| 195 |         slab_index = len(self.slabs)
 | 
| 196 |         self.slabs.append((n, refs))
 | 
| 197 |         self.cells.append((TAG_TUPLE, True, slab_index))
 | 
| 198 | 
 | 
| 199 |     elif isinstance(obj, types.CodeType):
 | 
| 200 |       co = obj
 | 
| 201 |       refs = []
 | 
| 202 | 
 | 
| 203 |       # Ints first
 | 
| 204 |       refs.append(self.Any(co.co_argcount))
 | 
| 205 |       refs.append(self.Any(co.co_nlocals))
 | 
| 206 |       refs.append(self.Any(co.co_stacksize))
 | 
| 207 |       refs.append(self.Any(co.co_flags))
 | 
| 208 |       refs.append(self.Any(co.co_firstlineno))
 | 
| 209 | 
 | 
| 210 |       # Strings
 | 
| 211 |       refs.append(self.Any(co.co_name))
 | 
| 212 |       refs.append(self.Any(co.co_filename))
 | 
| 213 |       refs.append(self.Any(co.co_code))  # bytecode
 | 
| 214 | 
 | 
| 215 |       # Tuples
 | 
| 216 |       refs.append(self.Any(co.co_names))
 | 
| 217 |       refs.append(self.Any(co.co_varnames))
 | 
| 218 |       refs.append(self.Any(co.co_consts))
 | 
| 219 | 
 | 
| 220 |       slab_index = len(self.slabs)
 | 
| 221 |       self.slabs.append((len(refs), refs))
 | 
| 222 |       self.cells.append((TAG_CODE, True, slab_index))
 | 
| 223 | 
 | 
| 224 |     else:
 | 
| 225 |       raise AssertionError
 | 
| 226 | 
 | 
| 227 |     return id_
 | 
| 228 | 
 | 
| 229 |   def Write(self, f):
 | 
| 230 |     f.write('OHP2')  # magic header
 | 
| 231 |     f.write(i32(0))  # placeholder for cell offset
 | 
| 232 |     f.write(i32(0))  # placeholder for root object.  NOTE: We could always
 | 
| 233 |                      # fseek(SEEK_END, -4)?  But I think we want it to more
 | 
| 234 |                      # self-describing.
 | 
| 235 | 
 | 
| 236 |     # Write slabs first, so we know their offsets.
 | 
| 237 |     slab_offsets = []
 | 
| 238 |     pos = 0  # Reader should patch relative to 12
 | 
| 239 | 
 | 
| 240 |     for length, payload in self.slabs:
 | 
| 241 | 
 | 
| 242 |       if isinstance(payload, str):
 | 
| 243 |         slab_offsets.append(pos)
 | 
| 244 | 
 | 
| 245 |         f.write(i32(length))  # length in bytes
 | 
| 246 |         pos += 4
 | 
| 247 | 
 | 
| 248 |         n = len(payload)
 | 
| 249 |         aligned = Align4(n+1)  # at least NUL byte for terminator
 | 
| 250 |         f.write(payload)
 | 
| 251 |         f.write('\0' * (aligned - n))  # padding
 | 
| 252 |         pos += aligned
 | 
| 253 | 
 | 
| 254 |       elif isinstance(payload, list):  # list of references
 | 
| 255 |         slab_offsets.append(pos)
 | 
| 256 | 
 | 
| 257 |         # length in references.  Could be unsigned?
 | 
| 258 |         f.write(i32(length))
 | 
| 259 | 
 | 
| 260 |         # NOTE: There is no GC offset, since all of them are scanned?
 | 
| 261 |         for ref in payload:
 | 
| 262 |           f.write(i32(ref))
 | 
| 263 |         pos += 4
 | 
| 264 |         pos += len(payload) * 4
 | 
| 265 | 
 | 
| 266 |       else:
 | 
| 267 |         raise AssertionError(payload)
 | 
| 268 | 
 | 
| 269 |     log('Slab offsets: %s', slab_offsets)
 | 
| 270 | 
 | 
| 271 |     # Pad out the slabs so that the cells begins at a multiple of 16.
 | 
| 272 |     total_slab_size = Align16(pos)  # including pad, but not including header.
 | 
| 273 |     f.write('\0' * (total_slab_size - pos))  # Write the pad
 | 
| 274 | 
 | 
| 275 |     # Encode it into 16 bytes
 | 
| 276 |     for tag, is_slab, val in self.cells:
 | 
| 277 |       #log('%s %s %s', tag, is_slab, val)
 | 
| 278 | 
 | 
| 279 |       # 4 byte tag.  This may be patched into a type Handle?
 | 
| 280 |       # Reserve LSB for is_slab?  Or highest bit?
 | 
| 281 |       # The tag could be (i >> 1) then?
 | 
| 282 |       f.write(i16(tag))
 | 
| 283 | 
 | 
| 284 |       if tag == TAG_NONE:
 | 
| 285 |         f.write(i16(0))
 | 
| 286 |         f.write(i32(0))
 | 
| 287 |         f.write(i32(0))
 | 
| 288 |         f.write(i32(0))
 | 
| 289 | 
 | 
| 290 |       elif tag == TAG_BOOL:
 | 
| 291 |         f.write(i16(0))  # pad
 | 
| 292 |         f.write(i32(0))  # pad
 | 
| 293 |         f.write(i32(0))  # pad
 | 
| 294 |         f.write(i32(int(val)))  # 0 or 1
 | 
| 295 | 
 | 
| 296 |       elif tag == TAG_INT:
 | 
| 297 |         assert not is_slab, val
 | 
| 298 |         f.write(i16(0))  # Padding
 | 
| 299 |         f.write(i32(0))  # Padding
 | 
| 300 |         f.write(i64(val))
 | 
| 301 | 
 | 
| 302 |       elif tag == TAG_FLOAT:
 | 
| 303 |         assert not is_slab, val
 | 
| 304 |         f.write(i16(0))  # Padding
 | 
| 305 |         f.write(i32(0))  # Padding
 | 
| 306 |         f.write(f64(val))
 | 
| 307 | 
 | 
| 308 |       elif tag == TAG_STR:
 | 
| 309 |         if is_slab:
 | 
| 310 |           # For a string, a big/small bit could technically be in the last
 | 
| 311 |           # byte.  To reuse NUL terminator.  But I want the patching process to
 | 
| 312 |           # be consistent.
 | 
| 313 |           slab_index = val
 | 
| 314 |           offset = slab_offsets[slab_index]
 | 
| 315 |           f.write(u8(1))  # is_slab
 | 
| 316 |           f.write(u8(0))  # length stored in slab
 | 
| 317 |           f.write(i32(0))  # pad
 | 
| 318 |           f.write(i32(0))  # pad
 | 
| 319 |           f.write(i32(offset))
 | 
| 320 |         else:
 | 
| 321 |           n = len(val)
 | 
| 322 |           f.write(u8(0))  # is_slab
 | 
| 323 |           f.write(u8(n))  # length stored here
 | 
| 324 |           f.write(val)
 | 
| 325 |           num_pad = 12 - n  # at least one NUL
 | 
| 326 |           f.write('\0' * num_pad)
 | 
| 327 | 
 | 
| 328 |       elif tag == TAG_TUPLE:
 | 
| 329 |         if is_slab:
 | 
| 330 |           slab_index = val
 | 
| 331 |           offset = slab_offsets[slab_index]
 | 
| 332 |           f.write(u8(1))  # is_slab
 | 
| 333 |           f.write(u8(0))  # length stored in slab
 | 
| 334 |           f.write(i32(0))  # pad
 | 
| 335 |           f.write(i32(0))  # pad
 | 
| 336 |           f.write(i32(offset))
 | 
| 337 |         else:
 | 
| 338 |           n = len(val)
 | 
| 339 |           f.write(u8(0))  # is_slab
 | 
| 340 |           f.write(u8(n))
 | 
| 341 |           # TODO: how is length represented?
 | 
| 342 |           for ref in val:
 | 
| 343 |             f.write(i32(ref))
 | 
| 344 |           num_pad = 3 - len(val)
 | 
| 345 |           for i in xrange(num_pad):
 | 
| 346 |             f.write(i32(0))
 | 
| 347 | 
 | 
| 348 |       elif tag == TAG_CODE:
 | 
| 349 |         assert is_slab, val
 | 
| 350 |         slab_index = val
 | 
| 351 |         #log('slab_index %d', slab_index)
 | 
| 352 |         offset = slab_offsets[slab_index]
 | 
| 353 |         f.write(u8(1)) # is_slab
 | 
| 354 |         f.write(u8(0)) # length stored in slab
 | 
| 355 |         f.write(i32(0))  # pad
 | 
| 356 |         f.write(i32(0))  # pad
 | 
| 357 |         f.write(i32(offset))
 | 
| 358 | 
 | 
| 359 |       else:
 | 
| 360 |         raise AssertionError(tag)
 | 
| 361 | 
 | 
| 362 |     log('')
 | 
| 363 |     log('slabs')
 | 
| 364 |     for slab in self.slabs:
 | 
| 365 |       log('\t%r', slab)
 | 
| 366 | 
 | 
| 367 |     log('cells')
 | 
| 368 |     for c in self.cells:
 | 
| 369 |       #log('\t%r', c)
 | 
| 370 |       pass
 | 
| 371 | 
 | 
| 372 |     log('%d slabs in %d bytes', len(self.slabs), total_slab_size)
 | 
| 373 |     log('%d cells in %d bytes', len(self.cells), f.tell() - 12 - total_slab_size)
 | 
| 374 | 
 | 
| 375 |     # Fill in the cell position
 | 
| 376 |     f.seek(4)
 | 
| 377 |     f.write(i32(total_slab_size))
 | 
| 378 |     f.write(i32(len(self.cells)))
 | 
| 379 | 
 | 
| 380 | 
 | 
| 381 | def Write(co, f):
 | 
| 382 |   print(co)
 | 
| 383 |   enc = Encoder()
 | 
| 384 |   enc.Any(co)
 | 
| 385 |   enc.Write(f)
 | 
| 386 | 
 | 
| 387 | 
 | 
| 388 | def main(argv):
 | 
| 389 |   chunk = bytearray()
 | 
| 390 |   chunk.extend('hello')
 | 
| 391 |   chunk.append('\0')
 | 
| 392 | 
 | 
| 393 |   print('Hello from oheap2.py')
 | 
| 394 |   print(repr(chunk))
 | 
| 395 | 
 | 
| 396 | 
 | 
| 397 | if __name__ == '__main__':
 | 
| 398 |   try:
 | 
| 399 |     main(sys.argv)
 | 
| 400 |   except RuntimeError as e:
 | 
| 401 |     print('FATAL: %s' % e, file=sys.stderr)
 | 
| 402 |     sys.exit(1)
 |