OILS / demo / old / encode.py View on Github | oilshell.org

295 lines, 163 significant
1"""
2encode.py
3"""
4
5from asdl import asdl_ as asdl
6from asdl import runtime
7from asdl import const
8
9from core import util
10
11
12class EncodeError(Exception):
13 def __init__(self, *args, **kwargs):
14 Exception.__init__(self, *args, **kwargs)
15 self.details_printed = False
16
17
18_DEFAULT_ALIGNMENT = 4
19
20class BinOutput(object):
21 """Write aligned blocks here. Keeps track of block indexes for refs."""
22
23 def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
24 self.f = f
25 # index of last block, to return as a ref.
26 self.last_block = 0
27 self.alignment = alignment
28
29 def WriteRootRef(self, chunk):
30 self.f.seek(5) # seek past 'OHP\x01\x04'
31
32 assert len(chunk) == 3
33 self.f.write(chunk)
34
35 def Write(self, chunk):
36 """
37 Return a block pointer/index.
38 """
39 # Input should be padded
40 assert len(chunk) % self.alignment == 0
41 self.f.write(chunk)
42
43 ref = self.last_block
44 num_blocks = len(chunk) // self.alignment # int division
45 #print('WROTE %d blocks' % num_blocks)
46 self.last_block += num_blocks
47
48 # Return a reference to the beginning
49 return ref
50
51
52class Params(object):
53 """Encoding parameters.
54
55 Hm most of these settings should be per-field, expressed in the schema. The
56 only global one is the ref/pointer alignment. 4 and 8 are the most likely
57 choices, and 4 is probably fine, because you have 64 MB of addressable memory
58 with 24 bit pointers.
59 """
60
61 def __init__(self, alignment=_DEFAULT_ALIGNMENT,
62 int_width=const.DEFAULT_INT_WIDTH):
63 self.alignment = alignment
64 self.pointer_type = 'uint32_t'
65
66 self.tag_width = 1 # for ArithVar vs ArithWord.
67 self.int_width = int_width
68 self.ref_width = int_width # Constant 3, used by gen_cpp
69
70 # used for fd, line/col
71 # also I guess steuff like SimpleCommand
72 self.index_width = 2 # 16 bits, e.g. max 64K entries in an array
73
74 self.max_int = 1 << (self.int_width * 8)
75 self.max_index = 1 << (self.index_width * 8)
76 self.max_tag = 1 << (self.tag_width * 8)
77
78 def Tag(self, i, chunk):
79 if i > self.max_tag:
80 raise AssertionError('Invalid id %r' % i)
81 chunk.append(i & 0xFF)
82
83 def Int(self, n, chunk):
84 if n < 0:
85 raise EncodeError(
86 "ASDL can't currently encode negative numbers. Got %d" % n)
87 if n > self.max_int:
88 raise EncodeError(
89 '%d is too big to fit in %d bytes' % (n, self.int_width))
90
91 for i in xrange(self.int_width):
92 chunk.append(n & 0xFF)
93 n >>= 8
94
95 def Ref(self, n, chunk):
96 # NOTE: ref width is currently the same as int width. Could be different.
97 self.Int(n, chunk)
98
99 def _Pad(self, chunk):
100 n = len(chunk)
101 a = self.alignment
102 if n % a != 0:
103 chunk.extend(b'\x00' * (a - (n % a)))
104 return chunk
105
106 # Right now all strings are references. Later they could be inline.
107 def Str(self, s, chunk):
108 # NOTE: For variable, proc, and function names, it could make sense to
109 # pre-compute and store a hash value. They will be looked up in the stack
110 # and so forth.
111 # - You could also return a obj number or object ID.
112 chunk.extend(s)
113 chunk.append(0) # NUL terminator
114
115 def PaddedStr(self, s):
116 # NOTE:
117 # - The encoder could also have an intern table to save space.
118 # - Str and PaddedStr will both return char* ? Should we allow them to
119 # VARY with the same schema, is a value/ref type PART of the schema? It's
120 # basically small size optimization and "flexible array" optimization. I
121 # think you want that possibility.
122 chunk = bytearray()
123 self.Str(s, chunk)
124 return self._Pad(chunk)
125
126 def Bytes(self, buf, chunk):
127 n = len(buf)
128 if n >= self.max_index:
129 raise EncodeError("bytes object is too long (%d)" % n)
130 for i in xrange(self.index_width):
131 chunk.append(n & 0xFF)
132 n >>= 8
133 chunk.extend(buf)
134
135 def PaddedBytes(self, buf):
136 chunk = bytearray()
137 self.Bytes(buf, chunk)
138 return self._Pad(chunk)
139
140 def PaddedBlock(self, chunk):
141 return self._Pad(chunk)
142
143
144def EncodeArray(obj_list, item_desc, enc, out):
145 """
146 Args:
147 obj_list: List of Obj values
148
149 Returns:
150 ref
151 """
152 array_chunk = bytearray()
153 enc.Int(len(obj_list), array_chunk) # Length prefix
154
155 if isinstance(item_desc, asdl.IntType) or \
156 isinstance(item_desc, asdl.BoolType):
157 for item in obj_list:
158 enc.Int(item, array_chunk)
159
160 elif isinstance(item_desc, asdl.UserType):
161 # Assume Id for now
162 for item in obj_list:
163 enc.Int(item.enum_value, array_chunk)
164
165 elif isinstance(item_desc, asdl.StrType):
166 for item in obj_list:
167 ref = out.Write(enc.PaddedStr(item))
168 enc.Ref(ref, array_chunk)
169
170 elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
171 for item in obj_list:
172 enc.Int(item.enum_id, array_chunk)
173
174 else:
175 # A simple value is either an int, enum, or pointer. (Later: Iter<Str>
176 # might be possible for locality.)
177 assert \
178 isinstance(item_desc, asdl.SumType) or \
179 isinstance(item_desc, asdl.CompoundType), item_desc
180
181 # This is like vector<T*>
182 # Later:
183 # - Product types can be put in line
184 # - Sum types can even be put in line, if you have List<T> rather than
185 # Array<T>. Array implies O(1) random access; List doesn't.
186 for item in obj_list:
187 try:
188 ref = EncodeObj(item, enc, out)
189 except EncodeError as e:
190 if not e.details_printed:
191 util.log("Error encoding array: %s (item %s)", e, item)
192 e.details_printed = True
193 raise
194 enc.Ref(ref, array_chunk)
195
196 this_ref = out.Write(enc.PaddedBlock(array_chunk))
197 return this_ref
198
199
200def EncodeObj(obj, enc, out):
201 """
202 Args:
203 obj: Obj to encode
204 enc: encoding params
205 out: output file
206
207 Returns:
208 ref: Reference to the last block
209 """
210 # Algorithm: Depth first, post-order traversal. First obj is the first leaf.
211 # last obj is the root.
212 #
213 # Array is a homogeneous type.
214
215 this_chunk = bytearray()
216 assert isinstance(obj, runtime.CompoundObj), \
217 '%r is not a compound obj (%r)' % (obj, obj.__class__)
218
219 # Constructor objects have a tag.
220 if isinstance(obj.ASDL_TYPE, asdl.Constructor):
221 enc.Tag(obj.tag, this_chunk)
222
223 for name, desc in obj.ASDL_TYPE.GetFields(): # encode in order
224 field_val = getattr(obj, name)
225
226 # TODO:
227 # - Float would be inline, etc.
228 # - Repeated value: write them all adjacent to each other?
229
230 is_maybe = False
231 if isinstance(desc, asdl.MaybeType):
232 is_maybe = True
233 desc = desc.desc # descent
234
235 #
236 # Now look at types
237 #
238
239 if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
240 enc.Int(field_val, this_chunk)
241
242 elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
243 # Encode enums as integers. TODO later: Don't use 3 bytes! Can use 1
244 # byte for most enums.
245 enc.Int(field_val.enum_id, this_chunk)
246
247 # Write variable length field first, assuming that it's a ref/pointer.
248 # TODO: allow one inline, hanging string or array per record.
249 elif isinstance(desc, asdl.StrType):
250 ref = out.Write(enc.PaddedStr(field_val))
251 enc.Ref(ref, this_chunk)
252
253 elif isinstance(desc, asdl.ArrayType):
254 item_desc = desc.desc
255 ref = EncodeArray(field_val, item_desc, enc, out)
256 enc.Ref(ref, this_chunk)
257
258 elif isinstance(desc, asdl.UserType):
259 if is_maybe and field_val is None: # e.g. id? prefix_op
260 enc.Ref(0, this_chunk)
261 else:
262 # Assume Id for now
263 enc.Int(field_val.enum_value, this_chunk)
264
265 else:
266 if is_maybe and field_val is None:
267 enc.Ref(0, this_chunk)
268 else:
269 try:
270 ref = EncodeObj(field_val, enc, out)
271 except EncodeError as e:
272 if not e.details_printed:
273 util.log("Error encoding %s : %s (val %s)", name, e, field_val)
274 e.details_printed = True
275 raise
276 enc.Ref(ref, this_chunk)
277
278 # Write the parent record
279 this_ref = out.Write(enc.PaddedBlock(this_chunk))
280 return this_ref
281
282
283def EncodeRoot(obj, enc, out):
284 ref = out.Write(b'OHP\x01') # header, version 1
285 assert ref == 0
286 # 4-byte alignment, then 3 byte placeholder for the root ref.
287 ref = out.Write(b'\4\0\0\0')
288 assert ref == 1
289
290 root_ref = EncodeObj(obj, enc, out)
291 chunk = bytearray()
292 enc.Ref(root_ref, chunk)
293 out.WriteRootRef(chunk) # back up and write it
294
295 #print("Root obj ref:", root_ref)