OILS / opy / _regtest / src / asdl / encode.py View on Github | oilshell.org

294 lines, 162 significant
1"""
2encode.py
3"""
4
5from asdl import asdl_ as asdl
6from asdl import py_meta
7from asdl import const
8
9from core import util
10
11
12class EncodeError(Exception):
13 def __init__(self, *args, **kwargs):
14 Exception.__init__(self, *args, **kwargs)
15 self.details_printed = False
16
17
18_DEFAULT_ALIGNMENT = 4
19
20class BinOutput(object):
21 """Write aligned blocks here. Keeps track of block indexes for refs."""
22
23 def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
24 self.f = f
25 # index of last block, to return as a ref.
26 self.last_block = 0
27 self.alignment = alignment
28
29 def WriteRootRef(self, chunk):
30 self.f.seek(5) # seek past 'OHP\x01\x04'
31
32 assert len(chunk) == 3
33 self.f.write(chunk)
34
35 def Write(self, chunk):
36 """
37 Return a block pointer/index.
38 """
39 # Input should be padded
40 assert len(chunk) % self.alignment == 0
41 self.f.write(chunk)
42
43 ref = self.last_block
44 num_blocks = len(chunk) // self.alignment # int division
45 #print('WROTE %d blocks' % num_blocks)
46 self.last_block += num_blocks
47
48 # Return a reference to the beginning
49 return ref
50
51
52class Params(object):
53 """Encoding parameters.
54
55 Hm most of these settings should be per-field, expressed in the schema. The
56 only global one is the ref/pointer alignment. 4 and 8 are the most likely
57 choices, and 4 is probably fine, because you have 64 MB of addressable memory
58 with 24 bit pointers.
59 """
60
61 def __init__(self, alignment=_DEFAULT_ALIGNMENT,
62 int_width=const.DEFAULT_INT_WIDTH):
63 self.alignment = alignment
64 self.pointer_type = 'uint32_t'
65
66 self.tag_width = 1 # for ArithVar vs ArithWord.
67 self.int_width = int_width
68 self.ref_width = int_width # Constant 3, used by gen_cpp
69
70 # used for fd, line/col
71 # also I guess steuff like SimpleCommand
72 self.index_width = 2 # 16 bits, e.g. max 64K entries in an array
73
74 self.max_int = 1 << (self.int_width * 8)
75 self.max_index = 1 << (self.index_width * 8)
76 self.max_tag = 1 << (self.tag_width * 8)
77
78 def Tag(self, i, chunk):
79 if i > self.max_tag:
80 raise AssertionError('Invalid id %r' % i)
81 chunk.append(i & 0xFF)
82
83 def Int(self, n, chunk):
84 if n < 0:
85 raise EncodeError(
86 "ASDL can't currently encode negative numbers. Got %d" % n)
87 if n > self.max_int:
88 raise EncodeError(
89 '%d is too big to fit in %d bytes' % (n, self.int_width))
90
91 for i in range(self.int_width):
92 chunk.append(n & 0xFF)
93 n >>= 8
94
95 def Ref(self, n, chunk):
96 # NOTE: ref width is currently the same as int width. Could be different.
97 self.Int(n, chunk)
98
99 def _Pad(self, chunk):
100 n = len(chunk)
101 a = self.alignment
102 if n % a != 0:
103 chunk.extend(b'\x00' * (a - (n % a)))
104 return chunk
105
106 # Right now all strings are references. Later they could be inline.
107 def Str(self, s, chunk):
108 # NOTE: For variable, proc, and function names, it could make sense to
109 # pre-compute and store a hash value. They will be looked up in the stack
110 # and so forth.
111 # - You could also return a obj number or object ID.
112 chunk.extend(s)
113 chunk.append(0) # NUL terminator
114
115 def PaddedStr(self, s):
116 # NOTE:
117 # - The encoder could also have an intern table to save space.
118 # - Str and PaddedStr will both return char* ? Should we allow them to
119 # VARY with the same schema, is a value/ref type PART of the schema? It's
120 # basically small size optimization and "flexible array" optimization. I
121 # think you want that possibility.
122 chunk = bytearray()
123 self.Str(s, chunk)
124 return self._Pad(chunk)
125
126 def Bytes(self, buf, chunk):
127 n = len(buf)
128 if n >= self.max_index:
129 raise EncodeError("bytes object is too long (%d)" % n)
130 for i in range(self.index_width):
131 chunk.append(n & 0xFF)
132 n >>= 8
133 chunk.extend(buf)
134
135 def PaddedBytes(self, buf):
136 chunk = bytearray()
137 self.Bytes(buf, chunk)
138 return self._Pad(chunk)
139
140 def PaddedBlock(self, chunk):
141 return self._Pad(chunk)
142
143
144def EncodeArray(obj_list, item_desc, enc, out):
145 """
146 Args:
147 obj_list: List of Obj values
148
149 Returns:
150 ref
151 """
152 array_chunk = bytearray()
153 enc.Int(len(obj_list), array_chunk) # Length prefix
154
155 if isinstance(item_desc, asdl.IntType) or \
156 isinstance(item_desc, asdl.BoolType):
157 for item in obj_list:
158 enc.Int(item, array_chunk)
159
160 elif isinstance(item_desc, asdl.UserType):
161 # Assume Id for now
162 for item in obj_list:
163 enc.Int(item.enum_value, array_chunk)
164
165 elif isinstance(item_desc, asdl.StrType):
166 for item in obj_list:
167 ref = out.Write(enc.PaddedStr(item))
168 enc.Ref(ref, array_chunk)
169
170 elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
171 for item in obj_list:
172 enc.Int(item.enum_id, array_chunk)
173
174 else:
175 # A simple value is either an int, enum, or pointer. (Later: Iter<Str>
176 # might be possible for locality.)
177 assert isinstance(item_desc, asdl.Sum) or \
178 isinstance(item_desc, asdl.Product), item_desc
179
180 # This is like vector<T*>
181 # Later:
182 # - Product types can be put in line
183 # - Sum types can even be put in line, if you have List<T> rather than
184 # Array<T>. Array implies O(1) random access; List doesn't.
185 for item in obj_list:
186 try:
187 ref = EncodeObj(item, enc, out)
188 except EncodeError as e:
189 if not e.details_printed:
190 util.log("Error encoding array: %s (item %s)", e, item)
191 e.details_printed = True
192 raise
193 enc.Ref(ref, array_chunk)
194
195 this_ref = out.Write(enc.PaddedBlock(array_chunk))
196 return this_ref
197
198
199def EncodeObj(obj, enc, out):
200 """
201 Args:
202 obj: Obj to encode
203 enc: encoding params
204 out: output file
205
206 Returns:
207 ref: Reference to the last block
208 """
209 # Algorithm: Depth first, post-order traversal. First obj is the first leaf.
210 # last obj is the root.
211 #
212 # Array is a homogeneous type.
213
214 this_chunk = bytearray()
215 assert isinstance(obj, py_meta.CompoundObj), \
216 '%r is not a compound obj (%r)' % (obj, obj.__class__)
217
218 # Constructor objects have a tag.
219 if isinstance(obj.ASDL_TYPE, asdl.Constructor):
220 enc.Tag(obj.tag, this_chunk)
221
222 for name, desc in obj.ASDL_TYPE.GetFields(): # encode in order
223 field_val = getattr(obj, name)
224
225 # TODO:
226 # - Float would be inline, etc.
227 # - Repeated value: write them all adjacent to each other?
228
229 is_maybe = False
230 if isinstance(desc, asdl.MaybeType):
231 is_maybe = True
232 desc = desc.desc # descent
233
234 #
235 # Now look at types
236 #
237
238 if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
239 enc.Int(field_val, this_chunk)
240
241 elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
242 # Encode enums as integers. TODO later: Don't use 3 bytes! Can use 1
243 # byte for most enums.
244 enc.Int(field_val.enum_id, this_chunk)
245
246 # Write variable length field first, assuming that it's a ref/pointer.
247 # TODO: allow one inline, hanging string or array per record.
248 elif isinstance(desc, asdl.StrType):
249 ref = out.Write(enc.PaddedStr(field_val))
250 enc.Ref(ref, this_chunk)
251
252 elif isinstance(desc, asdl.ArrayType):
253 item_desc = desc.desc
254 ref = EncodeArray(field_val, item_desc, enc, out)
255 enc.Ref(ref, this_chunk)
256
257 elif isinstance(desc, asdl.UserType):
258 if is_maybe and field_val is None: # e.g. id? prefix_op
259 enc.Ref(0, this_chunk)
260 else:
261 # Assume Id for now
262 enc.Int(field_val.enum_value, this_chunk)
263
264 else:
265 if is_maybe and field_val is None:
266 enc.Ref(0, this_chunk)
267 else:
268 try:
269 ref = EncodeObj(field_val, enc, out)
270 except EncodeError as e:
271 if not e.details_printed:
272 util.log("Error encoding %s : %s (val %s)", name, e, field_val)
273 e.details_printed = True
274 raise
275 enc.Ref(ref, this_chunk)
276
277 # Write the parent record
278 this_ref = out.Write(enc.PaddedBlock(this_chunk))
279 return this_ref
280
281
282def EncodeRoot(obj, enc, out):
283 ref = out.Write(b'OHP\x01') # header, version 1
284 assert ref == 0
285 # 4-byte alignment, then 3 byte placeholder for the root ref.
286 ref = out.Write(b'\4\0\0\0')
287 assert ref == 1
288
289 root_ref = EncodeObj(obj, enc, out)
290 chunk = bytearray()
291 enc.Ref(root_ref, chunk)
292 out.WriteRootRef(chunk) # back up and write it
293
294 #print("Root obj ref:", root_ref)