1 | #!/usr/bin/env python2
|
2 | """
|
3 | oheap2.py
|
4 |
|
5 | OVM2 stores objects in a data structure called OHeap2. It has the following
|
6 | concepts.
|
7 |
|
8 | - Handles: 4-byte / 32-bit integers that point to Cells.
|
9 | - Cells: 16-byte structs with a type tag, in a contiguous array. This array is
|
10 | scanned by the GC.
|
11 | - Slabs: Variable-length arrays of memory that are owned by a Cell. Each slab
|
12 | may be malloc'd individually.
|
13 | - In the case of a string, it's opaque to the GC.
|
14 | - In the case of a tuple or struct, it must be scanned by the GC.
|
15 |
|
16 | This is called the "4-16-N design".
|
17 |
|
18 | Every cell has a type tag, which is simply an integer. Negative integers
|
19 | indicate native types implemented in C. Positive integers are for user-defined
|
20 | types.
|
21 |
|
22 | Cells also have an is_slab bit and a length for small. TODO: is_slab should be
|
23 | is_small?
|
24 |
|
25 | Operations on OHeap
|
26 | -------------------
|
27 |
|
28 | 1. Allocate a new cell, possibly resizing
|
29 |
|
30 | 2. Garbage Collect
|
31 | - Walk all the cells. Free unused cells along with their slabs.
|
32 |
|
33 | 3. Save to file
|
34 | Slabs: Native Pointers to Offsets
|
35 |
|
36 | 3a. Copy an object tree to a fresh oheap? e.g. for IPC. I guess this is like "Save".
|
37 | - Walk all the cells and slabs
|
38 |
|
39 | 4. Load from file into fresh OHeap
|
40 | - Patch Slab Offsets to Native Pointers
|
41 |
|
42 | 4a. Load from file and merge into existing OHeap?
|
43 | - Inside a namespace?
|
44 | - Import?
|
45 | - Patch all cell refs!
|
46 |
|
47 | 5. Make permanent? Move to another "generation"? OPy classes and bytecode
|
48 | perhaps fit in this category.
|
49 |
|
50 | TODO
|
51 | ----
|
52 |
|
53 | - Bits on handle: whether it's const or not?
|
54 | - negative handles could be const?
|
55 |
|
56 | - Singleton values for None/True/False
|
57 | - interning of strings
|
58 | - hash tables, and hashes of strings
|
59 |
|
60 | Introspection
|
61 | -------------
|
62 |
|
63 | These operations should work:
|
64 |
|
65 | type()
|
66 | getattr() -- hm in OVM I don't want to look at base classes
|
67 | dir() -- list all attributes. ditto -- don't look at base classes?
|
68 | hm except this returns different stuff for modules, classes, and objects
|
69 | it looks at superclasses
|
70 | isinstance(obj, typ) - check if the type tag of obj points at typ or a superclass
|
71 | """
|
72 | from __future__ import print_function
|
73 |
|
74 | import struct
|
75 | import sys
|
76 | import types
|
77 |
|
78 | from core.util import log
|
79 |
|
80 |
|
81 | TAG_NONE = -1
|
82 | TAG_BOOL = -2
|
83 | TAG_INT = -3
|
84 | TAG_FLOAT = -4
|
85 | TAG_STR = -5
|
86 | TAG_TUPLE = -6
|
87 | TAG_CODE = -7
|
88 |
|
89 | MIN_SMALL_INT = -(1 << 63)
|
90 | MAX_SMALL_INT = (1 << 63) - 1
|
91 |
|
92 | MAX_LEN_SMALL_STR = 11 # 12 bytes - 1 for NUL
|
93 |
|
94 | MAX_LEN_SMALL_TUPLE = 3 # 4 Handles in 12 bytes?
|
95 | # We can steal a bit from the tag for the small/big
|
96 | # bit.
|
97 |
|
98 | def u8(i): # 1 byte unsigned
|
99 | return struct.pack('B', i)
|
100 |
|
101 | def i16(i): # 2 bytes
|
102 | return struct.pack('h', i)
|
103 |
|
104 | def i32(i): # 4 bytes
|
105 | return struct.pack('i', i)
|
106 |
|
107 | def i64(i): # 8 bytes (long long)
|
108 | return struct.pack('q', i)
|
109 |
|
110 | def f64(i): # 8 byte double
|
111 | return struct.pack('d', i)
|
112 |
|
113 |
|
114 | def Align4(i):
|
115 | """Round up to the nearest multiple of 4. See unit tests."""
|
116 | return ((i-1) | 3) + 1
|
117 |
|
118 |
|
119 | def Align16(i):
|
120 | """Round up to the nearest multiple of 16. See unit tests."""
|
121 | return ((i-1) | 15) + 1
|
122 |
|
123 |
|
124 | class Encoder(object):
|
125 | """
|
126 | Write objects into an OHeap2 structure that can be lazily loaded.
|
127 |
|
128 | First pass:
|
129 | Append to cells and append to slabs
|
130 | Second pass:
|
131 | Write slabs as bytes, and then patch offsets in cells?
|
132 | Write all the cells
|
133 | Write the root object at the front of the file? Or do it at the end?
|
134 | OHeap writes it at the beginning after
|
135 | """
|
136 |
|
137 | def __init__(self):
|
138 | self.chunk = bytearray()
|
139 | # An array of cells
|
140 | self.cells = []
|
141 | # Write all these first? So that the cells can point to them.
|
142 | self.slabs = []
|
143 |
|
144 | def Any(self, obj):
|
145 | """
|
146 | Encode an object and return its id.
|
147 | """
|
148 | id_ = len(self.cells)
|
149 |
|
150 | # TODO: Enforce that None is a singleton. But what about joining OHeaps?
|
151 | if isinstance(obj, types.NoneType):
|
152 | self.cells.append((TAG_NONE, False, None))
|
153 |
|
154 | # TODO: Enforce that True/False is a singleton. But what about joining
|
155 | # OHeaps?
|
156 | elif isinstance(obj, bool):
|
157 | self.cells.append((TAG_BOOL, False, obj))
|
158 |
|
159 | elif isinstance(obj, int):
|
160 | i = obj
|
161 | if MIN_SMALL_INT < i < MAX_SMALL_INT:
|
162 | self.cells.append((TAG_INT, False, i))
|
163 | else:
|
164 | raise NotImplementedError
|
165 |
|
166 | elif isinstance(obj, float):
|
167 | raise NotImplementedError
|
168 |
|
169 | # TODO: Identical strings could be "interned" here.
|
170 | elif isinstance(obj, str):
|
171 | s = obj
|
172 | n = len(s)
|
173 | if n < MAX_LEN_SMALL_STR:
|
174 | self.cells.append((TAG_STR, False, s))
|
175 | else:
|
176 | # length and they payload
|
177 | slab_index = len(self.slabs)
|
178 | self.slabs.append((n, s))
|
179 | self.cells.append((TAG_STR, True, slab_index))
|
180 |
|
181 | elif isinstance(obj, tuple):
|
182 | t = obj
|
183 | refs = []
|
184 | for item in t:
|
185 | refs.append(self.Any(item)) # Depth-first.
|
186 |
|
187 | # Compute ID after adding all the children.
|
188 | id_ = len(self.cells)
|
189 |
|
190 | n = len(t)
|
191 | if n < MAX_LEN_SMALL_TUPLE:
|
192 | # TODO: How do we know how long it is?
|
193 | self.cells.append((TAG_TUPLE, False, refs))
|
194 | else:
|
195 | slab_index = len(self.slabs)
|
196 | self.slabs.append((n, refs))
|
197 | self.cells.append((TAG_TUPLE, True, slab_index))
|
198 |
|
199 | elif isinstance(obj, types.CodeType):
|
200 | co = obj
|
201 | refs = []
|
202 |
|
203 | # Ints first
|
204 | refs.append(self.Any(co.co_argcount))
|
205 | refs.append(self.Any(co.co_nlocals))
|
206 | refs.append(self.Any(co.co_stacksize))
|
207 | refs.append(self.Any(co.co_flags))
|
208 | refs.append(self.Any(co.co_firstlineno))
|
209 |
|
210 | # Strings
|
211 | refs.append(self.Any(co.co_name))
|
212 | refs.append(self.Any(co.co_filename))
|
213 | refs.append(self.Any(co.co_code)) # bytecode
|
214 |
|
215 | # Tuples
|
216 | refs.append(self.Any(co.co_names))
|
217 | refs.append(self.Any(co.co_varnames))
|
218 | refs.append(self.Any(co.co_consts))
|
219 |
|
220 | slab_index = len(self.slabs)
|
221 | self.slabs.append((len(refs), refs))
|
222 | self.cells.append((TAG_CODE, True, slab_index))
|
223 |
|
224 | else:
|
225 | raise AssertionError
|
226 |
|
227 | return id_
|
228 |
|
229 | def Write(self, f):
|
230 | f.write('OHP2') # magic header
|
231 | f.write(i32(0)) # placeholder for cell offset
|
232 | f.write(i32(0)) # placeholder for root object. NOTE: We could always
|
233 | # fseek(SEEK_END, -4)? But I think we want it to more
|
234 | # self-describing.
|
235 |
|
236 | # Write slabs first, so we know their offsets.
|
237 | slab_offsets = []
|
238 | pos = 0 # Reader should patch relative to 12
|
239 |
|
240 | for length, payload in self.slabs:
|
241 |
|
242 | if isinstance(payload, str):
|
243 | slab_offsets.append(pos)
|
244 |
|
245 | f.write(i32(length)) # length in bytes
|
246 | pos += 4
|
247 |
|
248 | n = len(payload)
|
249 | aligned = Align4(n+1) # at least NUL byte for terminator
|
250 | f.write(payload)
|
251 | f.write('\0' * (aligned - n)) # padding
|
252 | pos += aligned
|
253 |
|
254 | elif isinstance(payload, list): # list of references
|
255 | slab_offsets.append(pos)
|
256 |
|
257 | # length in references. Could be unsigned?
|
258 | f.write(i32(length))
|
259 |
|
260 | # NOTE: There is no GC offset, since all of them are scanned?
|
261 | for ref in payload:
|
262 | f.write(i32(ref))
|
263 | pos += 4
|
264 | pos += len(payload) * 4
|
265 |
|
266 | else:
|
267 | raise AssertionError(payload)
|
268 |
|
269 | log('Slab offsets: %s', slab_offsets)
|
270 |
|
271 | # Pad out the slabs so that the cells begins at a multiple of 16.
|
272 | total_slab_size = Align16(pos) # including pad, but not including header.
|
273 | f.write('\0' * (total_slab_size - pos)) # Write the pad
|
274 |
|
275 | # Encode it into 16 bytes
|
276 | for tag, is_slab, val in self.cells:
|
277 | #log('%s %s %s', tag, is_slab, val)
|
278 |
|
279 | # 4 byte tag. This may be patched into a type Handle?
|
280 | # Reserve LSB for is_slab? Or highest bit?
|
281 | # The tag could be (i >> 1) then?
|
282 | f.write(i16(tag))
|
283 |
|
284 | if tag == TAG_NONE:
|
285 | f.write(i16(0))
|
286 | f.write(i32(0))
|
287 | f.write(i32(0))
|
288 | f.write(i32(0))
|
289 |
|
290 | elif tag == TAG_BOOL:
|
291 | f.write(i16(0)) # pad
|
292 | f.write(i32(0)) # pad
|
293 | f.write(i32(0)) # pad
|
294 | f.write(i32(int(val))) # 0 or 1
|
295 |
|
296 | elif tag == TAG_INT:
|
297 | assert not is_slab, val
|
298 | f.write(i16(0)) # Padding
|
299 | f.write(i32(0)) # Padding
|
300 | f.write(i64(val))
|
301 |
|
302 | elif tag == TAG_FLOAT:
|
303 | assert not is_slab, val
|
304 | f.write(i16(0)) # Padding
|
305 | f.write(i32(0)) # Padding
|
306 | f.write(f64(val))
|
307 |
|
308 | elif tag == TAG_STR:
|
309 | if is_slab:
|
310 | # For a string, a big/small bit could technically be in the last
|
311 | # byte. To reuse NUL terminator. But I want the patching process to
|
312 | # be consistent.
|
313 | slab_index = val
|
314 | offset = slab_offsets[slab_index]
|
315 | f.write(u8(1)) # is_slab
|
316 | f.write(u8(0)) # length stored in slab
|
317 | f.write(i32(0)) # pad
|
318 | f.write(i32(0)) # pad
|
319 | f.write(i32(offset))
|
320 | else:
|
321 | n = len(val)
|
322 | f.write(u8(0)) # is_slab
|
323 | f.write(u8(n)) # length stored here
|
324 | f.write(val)
|
325 | num_pad = 12 - n # at least one NUL
|
326 | f.write('\0' * num_pad)
|
327 |
|
328 | elif tag == TAG_TUPLE:
|
329 | if is_slab:
|
330 | slab_index = val
|
331 | offset = slab_offsets[slab_index]
|
332 | f.write(u8(1)) # is_slab
|
333 | f.write(u8(0)) # length stored in slab
|
334 | f.write(i32(0)) # pad
|
335 | f.write(i32(0)) # pad
|
336 | f.write(i32(offset))
|
337 | else:
|
338 | n = len(val)
|
339 | f.write(u8(0)) # is_slab
|
340 | f.write(u8(n))
|
341 | # TODO: how is length represented?
|
342 | for ref in val:
|
343 | f.write(i32(ref))
|
344 | num_pad = 3 - len(val)
|
345 | for i in xrange(num_pad):
|
346 | f.write(i32(0))
|
347 |
|
348 | elif tag == TAG_CODE:
|
349 | assert is_slab, val
|
350 | slab_index = val
|
351 | #log('slab_index %d', slab_index)
|
352 | offset = slab_offsets[slab_index]
|
353 | f.write(u8(1)) # is_slab
|
354 | f.write(u8(0)) # length stored in slab
|
355 | f.write(i32(0)) # pad
|
356 | f.write(i32(0)) # pad
|
357 | f.write(i32(offset))
|
358 |
|
359 | else:
|
360 | raise AssertionError(tag)
|
361 |
|
362 | log('')
|
363 | log('slabs')
|
364 | for slab in self.slabs:
|
365 | log('\t%r', slab)
|
366 |
|
367 | log('cells')
|
368 | for c in self.cells:
|
369 | #log('\t%r', c)
|
370 | pass
|
371 |
|
372 | log('%d slabs in %d bytes', len(self.slabs), total_slab_size)
|
373 | log('%d cells in %d bytes', len(self.cells), f.tell() - 12 - total_slab_size)
|
374 |
|
375 | # Fill in the cell position
|
376 | f.seek(4)
|
377 | f.write(i32(total_slab_size))
|
378 | f.write(i32(len(self.cells)))
|
379 |
|
380 |
|
381 | def Write(co, f):
|
382 | print(co)
|
383 | enc = Encoder()
|
384 | enc.Any(co)
|
385 | enc.Write(f)
|
386 |
|
387 |
|
388 | def main(argv):
|
389 | chunk = bytearray()
|
390 | chunk.extend('hello')
|
391 | chunk.append('\0')
|
392 |
|
393 | print('Hello from oheap2.py')
|
394 | print(repr(chunk))
|
395 |
|
396 |
|
397 | if __name__ == '__main__':
|
398 | try:
|
399 | main(sys.argv)
|
400 | except RuntimeError as e:
|
401 | print('FATAL: %s' % e, file=sys.stderr)
|
402 | sys.exit(1)
|