1 | """
|
2 | encode.py
|
3 | """
|
4 |
|
5 | from asdl import asdl_ as asdl
|
6 | from asdl import runtime
|
7 | from asdl import const
|
8 |
|
9 | from core import util
|
10 |
|
11 |
|
12 | class EncodeError(Exception):
|
13 | def __init__(self, *args, **kwargs):
|
14 | Exception.__init__(self, *args, **kwargs)
|
15 | self.details_printed = False
|
16 |
|
17 |
|
18 | _DEFAULT_ALIGNMENT = 4
|
19 |
|
20 | class BinOutput(object):
|
21 | """Write aligned blocks here. Keeps track of block indexes for refs."""
|
22 |
|
23 | def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
|
24 | self.f = f
|
25 | # index of last block, to return as a ref.
|
26 | self.last_block = 0
|
27 | self.alignment = alignment
|
28 |
|
29 | def WriteRootRef(self, chunk):
|
30 | self.f.seek(5) # seek past 'OHP\x01\x04'
|
31 |
|
32 | assert len(chunk) == 3
|
33 | self.f.write(chunk)
|
34 |
|
35 | def Write(self, chunk):
|
36 | """
|
37 | Return a block pointer/index.
|
38 | """
|
39 | # Input should be padded
|
40 | assert len(chunk) % self.alignment == 0
|
41 | self.f.write(chunk)
|
42 |
|
43 | ref = self.last_block
|
44 | num_blocks = len(chunk) // self.alignment # int division
|
45 | #print('WROTE %d blocks' % num_blocks)
|
46 | self.last_block += num_blocks
|
47 |
|
48 | # Return a reference to the beginning
|
49 | return ref
|
50 |
|
51 |
|
52 | class Params(object):
|
53 | """Encoding parameters.
|
54 |
|
55 | Hm most of these settings should be per-field, expressed in the schema. The
|
56 | only global one is the ref/pointer alignment. 4 and 8 are the most likely
|
57 | choices, and 4 is probably fine, because you have 64 MB of addressable memory
|
58 | with 24 bit pointers.
|
59 | """
|
60 |
|
61 | def __init__(self, alignment=_DEFAULT_ALIGNMENT,
|
62 | int_width=const.DEFAULT_INT_WIDTH):
|
63 | self.alignment = alignment
|
64 | self.pointer_type = 'uint32_t'
|
65 |
|
66 | self.tag_width = 1 # for ArithVar vs ArithWord.
|
67 | self.int_width = int_width
|
68 | self.ref_width = int_width # Constant 3, used by gen_cpp
|
69 |
|
70 | # used for fd, line/col
|
71 | # also I guess steuff like SimpleCommand
|
72 | self.index_width = 2 # 16 bits, e.g. max 64K entries in an array
|
73 |
|
74 | self.max_int = 1 << (self.int_width * 8)
|
75 | self.max_index = 1 << (self.index_width * 8)
|
76 | self.max_tag = 1 << (self.tag_width * 8)
|
77 |
|
78 | def Tag(self, i, chunk):
|
79 | if i > self.max_tag:
|
80 | raise AssertionError('Invalid id %r' % i)
|
81 | chunk.append(i & 0xFF)
|
82 |
|
83 | def Int(self, n, chunk):
|
84 | if n < 0:
|
85 | raise EncodeError(
|
86 | "ASDL can't currently encode negative numbers. Got %d" % n)
|
87 | if n > self.max_int:
|
88 | raise EncodeError(
|
89 | '%d is too big to fit in %d bytes' % (n, self.int_width))
|
90 |
|
91 | for i in xrange(self.int_width):
|
92 | chunk.append(n & 0xFF)
|
93 | n >>= 8
|
94 |
|
95 | def Ref(self, n, chunk):
|
96 | # NOTE: ref width is currently the same as int width. Could be different.
|
97 | self.Int(n, chunk)
|
98 |
|
99 | def _Pad(self, chunk):
|
100 | n = len(chunk)
|
101 | a = self.alignment
|
102 | if n % a != 0:
|
103 | chunk.extend(b'\x00' * (a - (n % a)))
|
104 | return chunk
|
105 |
|
106 | # Right now all strings are references. Later they could be inline.
|
107 | def Str(self, s, chunk):
|
108 | # NOTE: For variable, proc, and function names, it could make sense to
|
109 | # pre-compute and store a hash value. They will be looked up in the stack
|
110 | # and so forth.
|
111 | # - You could also return a obj number or object ID.
|
112 | chunk.extend(s)
|
113 | chunk.append(0) # NUL terminator
|
114 |
|
115 | def PaddedStr(self, s):
|
116 | # NOTE:
|
117 | # - The encoder could also have an intern table to save space.
|
118 | # - Str and PaddedStr will both return char* ? Should we allow them to
|
119 | # VARY with the same schema, is a value/ref type PART of the schema? It's
|
120 | # basically small size optimization and "flexible array" optimization. I
|
121 | # think you want that possibility.
|
122 | chunk = bytearray()
|
123 | self.Str(s, chunk)
|
124 | return self._Pad(chunk)
|
125 |
|
126 | def Bytes(self, buf, chunk):
|
127 | n = len(buf)
|
128 | if n >= self.max_index:
|
129 | raise EncodeError("bytes object is too long (%d)" % n)
|
130 | for i in xrange(self.index_width):
|
131 | chunk.append(n & 0xFF)
|
132 | n >>= 8
|
133 | chunk.extend(buf)
|
134 |
|
135 | def PaddedBytes(self, buf):
|
136 | chunk = bytearray()
|
137 | self.Bytes(buf, chunk)
|
138 | return self._Pad(chunk)
|
139 |
|
140 | def PaddedBlock(self, chunk):
|
141 | return self._Pad(chunk)
|
142 |
|
143 |
|
144 | def EncodeArray(obj_list, item_desc, enc, out):
|
145 | """
|
146 | Args:
|
147 | obj_list: List of Obj values
|
148 |
|
149 | Returns:
|
150 | ref
|
151 | """
|
152 | array_chunk = bytearray()
|
153 | enc.Int(len(obj_list), array_chunk) # Length prefix
|
154 |
|
155 | if isinstance(item_desc, asdl.IntType) or \
|
156 | isinstance(item_desc, asdl.BoolType):
|
157 | for item in obj_list:
|
158 | enc.Int(item, array_chunk)
|
159 |
|
160 | elif isinstance(item_desc, asdl.UserType):
|
161 | # Assume Id for now
|
162 | for item in obj_list:
|
163 | enc.Int(item.enum_value, array_chunk)
|
164 |
|
165 | elif isinstance(item_desc, asdl.StrType):
|
166 | for item in obj_list:
|
167 | ref = out.Write(enc.PaddedStr(item))
|
168 | enc.Ref(ref, array_chunk)
|
169 |
|
170 | elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
|
171 | for item in obj_list:
|
172 | enc.Int(item.enum_id, array_chunk)
|
173 |
|
174 | else:
|
175 | # A simple value is either an int, enum, or pointer. (Later: Iter<Str>
|
176 | # might be possible for locality.)
|
177 | assert \
|
178 | isinstance(item_desc, asdl.SumType) or \
|
179 | isinstance(item_desc, asdl.CompoundType), item_desc
|
180 |
|
181 | # This is like vector<T*>
|
182 | # Later:
|
183 | # - Product types can be put in line
|
184 | # - Sum types can even be put in line, if you have List<T> rather than
|
185 | # Array<T>. Array implies O(1) random access; List doesn't.
|
186 | for item in obj_list:
|
187 | try:
|
188 | ref = EncodeObj(item, enc, out)
|
189 | except EncodeError as e:
|
190 | if not e.details_printed:
|
191 | util.log("Error encoding array: %s (item %s)", e, item)
|
192 | e.details_printed = True
|
193 | raise
|
194 | enc.Ref(ref, array_chunk)
|
195 |
|
196 | this_ref = out.Write(enc.PaddedBlock(array_chunk))
|
197 | return this_ref
|
198 |
|
199 |
|
200 | def EncodeObj(obj, enc, out):
|
201 | """
|
202 | Args:
|
203 | obj: Obj to encode
|
204 | enc: encoding params
|
205 | out: output file
|
206 |
|
207 | Returns:
|
208 | ref: Reference to the last block
|
209 | """
|
210 | # Algorithm: Depth first, post-order traversal. First obj is the first leaf.
|
211 | # last obj is the root.
|
212 | #
|
213 | # Array is a homogeneous type.
|
214 |
|
215 | this_chunk = bytearray()
|
216 | assert isinstance(obj, runtime.CompoundObj), \
|
217 | '%r is not a compound obj (%r)' % (obj, obj.__class__)
|
218 |
|
219 | # Constructor objects have a tag.
|
220 | if isinstance(obj.ASDL_TYPE, asdl.Constructor):
|
221 | enc.Tag(obj.tag, this_chunk)
|
222 |
|
223 | for name, desc in obj.ASDL_TYPE.GetFields(): # encode in order
|
224 | field_val = getattr(obj, name)
|
225 |
|
226 | # TODO:
|
227 | # - Float would be inline, etc.
|
228 | # - Repeated value: write them all adjacent to each other?
|
229 |
|
230 | is_maybe = False
|
231 | if isinstance(desc, asdl.MaybeType):
|
232 | is_maybe = True
|
233 | desc = desc.desc # descent
|
234 |
|
235 | #
|
236 | # Now look at types
|
237 | #
|
238 |
|
239 | if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
|
240 | enc.Int(field_val, this_chunk)
|
241 |
|
242 | elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
|
243 | # Encode enums as integers. TODO later: Don't use 3 bytes! Can use 1
|
244 | # byte for most enums.
|
245 | enc.Int(field_val.enum_id, this_chunk)
|
246 |
|
247 | # Write variable length field first, assuming that it's a ref/pointer.
|
248 | # TODO: allow one inline, hanging string or array per record.
|
249 | elif isinstance(desc, asdl.StrType):
|
250 | ref = out.Write(enc.PaddedStr(field_val))
|
251 | enc.Ref(ref, this_chunk)
|
252 |
|
253 | elif isinstance(desc, asdl.ArrayType):
|
254 | item_desc = desc.desc
|
255 | ref = EncodeArray(field_val, item_desc, enc, out)
|
256 | enc.Ref(ref, this_chunk)
|
257 |
|
258 | elif isinstance(desc, asdl.UserType):
|
259 | if is_maybe and field_val is None: # e.g. id? prefix_op
|
260 | enc.Ref(0, this_chunk)
|
261 | else:
|
262 | # Assume Id for now
|
263 | enc.Int(field_val.enum_value, this_chunk)
|
264 |
|
265 | else:
|
266 | if is_maybe and field_val is None:
|
267 | enc.Ref(0, this_chunk)
|
268 | else:
|
269 | try:
|
270 | ref = EncodeObj(field_val, enc, out)
|
271 | except EncodeError as e:
|
272 | if not e.details_printed:
|
273 | util.log("Error encoding %s : %s (val %s)", name, e, field_val)
|
274 | e.details_printed = True
|
275 | raise
|
276 | enc.Ref(ref, this_chunk)
|
277 |
|
278 | # Write the parent record
|
279 | this_ref = out.Write(enc.PaddedBlock(this_chunk))
|
280 | return this_ref
|
281 |
|
282 |
|
283 | def EncodeRoot(obj, enc, out):
|
284 | ref = out.Write(b'OHP\x01') # header, version 1
|
285 | assert ref == 0
|
286 | # 4-byte alignment, then 3 byte placeholder for the root ref.
|
287 | ref = out.Write(b'\4\0\0\0')
|
288 | assert ref == 1
|
289 |
|
290 | root_ref = EncodeObj(obj, enc, out)
|
291 | chunk = bytearray()
|
292 | enc.Ref(root_ref, chunk)
|
293 | out.WriteRootRef(chunk) # back up and write it
|
294 |
|
295 | #print("Root obj ref:", root_ref)
|