opy/_regtest/src/asdl/encode.py

OILS / opy / _regtest / src / asdl / encode.py View on Github | oilshell.org

294 lines, 162 significant

1	"""
2	encode.py
3	"""
4
5	from asdl import asdl_ as asdl
6	from asdl import py_meta
7	from asdl import const
8
9	from core import util
10
11
12	class EncodeError(Exception):
13	def __init__(self, args, *kwargs):
14	Exception.__init__(self, args, *kwargs)
15	self.details_printed = False
16
17
18	_DEFAULT_ALIGNMENT = 4
19
20	class BinOutput(object):
21	"""Write aligned blocks here. Keeps track of block indexes for refs."""
22
23	def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
24	self.f = f
25	# index of last block, to return as a ref.
26	self.last_block = 0
27	self.alignment = alignment
28
29	def WriteRootRef(self, chunk):
30	self.f.seek(5) # seek past 'OHP\x01\x04'
31
32	assert len(chunk) == 3
33	self.f.write(chunk)
34
35	def Write(self, chunk):
36	"""
37	Return a block pointer/index.
38	"""
39	# Input should be padded
40	assert len(chunk) % self.alignment == 0
41	self.f.write(chunk)
42
43	ref = self.last_block
44	num_blocks = len(chunk) // self.alignment # int division
45	#print('WROTE %d blocks' % num_blocks)
46	self.last_block += num_blocks
47
48	# Return a reference to the beginning
49	return ref
50
51
52	class Params(object):
53	"""Encoding parameters.
54
55	Hm most of these settings should be per-field, expressed in the schema. The
56	only global one is the ref/pointer alignment. 4 and 8 are the most likely
57	choices, and 4 is probably fine, because you have 64 MB of addressable memory
58	with 24 bit pointers.
59	"""
60
61	def __init__(self, alignment=_DEFAULT_ALIGNMENT,
62	int_width=const.DEFAULT_INT_WIDTH):
63	self.alignment = alignment
64	self.pointer_type = 'uint32_t'
65
66	self.tag_width = 1 # for ArithVar vs ArithWord.
67	self.int_width = int_width
68	self.ref_width = int_width # Constant 3, used by gen_cpp
69
70	# used for fd, line/col
71	# also I guess steuff like SimpleCommand
72	self.index_width = 2 # 16 bits, e.g. max 64K entries in an array
73
74	self.max_int = 1 << (self.int_width * 8)
75	self.max_index = 1 << (self.index_width * 8)
76	self.max_tag = 1 << (self.tag_width * 8)
77
78	def Tag(self, i, chunk):
79	if i > self.max_tag:
80	raise AssertionError('Invalid id %r' % i)
81	chunk.append(i & 0xFF)
82
83	def Int(self, n, chunk):
84	if n < 0:
85	raise EncodeError(
86	"ASDL can't currently encode negative numbers. Got %d" % n)
87	if n > self.max_int:
88	raise EncodeError(
89	'%d is too big to fit in %d bytes' % (n, self.int_width))
90
91	for i in range(self.int_width):
92	chunk.append(n & 0xFF)
93	n >>= 8
94
95	def Ref(self, n, chunk):
96	# NOTE: ref width is currently the same as int width. Could be different.
97	self.Int(n, chunk)
98
99	def _Pad(self, chunk):
100	n = len(chunk)
101	a = self.alignment
102	if n % a != 0:
103	chunk.extend(b'\x00' * (a - (n % a)))
104	return chunk
105
106	# Right now all strings are references. Later they could be inline.
107	def Str(self, s, chunk):
108	# NOTE: For variable, proc, and function names, it could make sense to
109	# pre-compute and store a hash value. They will be looked up in the stack
110	# and so forth.
111	# - You could also return a obj number or object ID.
112	chunk.extend(s)
113	chunk.append(0) # NUL terminator
114
115	def PaddedStr(self, s):
116	# NOTE:
117	# - The encoder could also have an intern table to save space.
118	# - Str and PaddedStr will both return char* ? Should we allow them to
119	# VARY with the same schema, is a value/ref type PART of the schema? It's
120	# basically small size optimization and "flexible array" optimization. I
121	# think you want that possibility.
122	chunk = bytearray()
123	self.Str(s, chunk)
124	return self._Pad(chunk)
125
126	def Bytes(self, buf, chunk):
127	n = len(buf)
128	if n >= self.max_index:
129	raise EncodeError("bytes object is too long (%d)" % n)
130	for i in range(self.index_width):
131	chunk.append(n & 0xFF)
132	n >>= 8
133	chunk.extend(buf)
134
135	def PaddedBytes(self, buf):
136	chunk = bytearray()
137	self.Bytes(buf, chunk)
138	return self._Pad(chunk)
139
140	def PaddedBlock(self, chunk):
141	return self._Pad(chunk)
142
143
144	def EncodeArray(obj_list, item_desc, enc, out):
145	"""
146	Args:
147	obj_list: List of Obj values
148
149	Returns:
150	ref
151	"""
152	array_chunk = bytearray()
153	enc.Int(len(obj_list), array_chunk) # Length prefix
154
155	if isinstance(item_desc, asdl.IntType) or \
156	isinstance(item_desc, asdl.BoolType):
157	for item in obj_list:
158	enc.Int(item, array_chunk)
159
160	elif isinstance(item_desc, asdl.UserType):
161	# Assume Id for now
162	for item in obj_list:
163	enc.Int(item.enum_value, array_chunk)
164
165	elif isinstance(item_desc, asdl.StrType):
166	for item in obj_list:
167	ref = out.Write(enc.PaddedStr(item))
168	enc.Ref(ref, array_chunk)
169
170	elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
171	for item in obj_list:
172	enc.Int(item.enum_id, array_chunk)
173
174	else:
175	# A simple value is either an int, enum, or pointer. (Later: Iter<Str>
176	# might be possible for locality.)
177	assert isinstance(item_desc, asdl.Sum) or \
178	isinstance(item_desc, asdl.Product), item_desc
179
180	# This is like vector<T*>
181	# Later:
182	# - Product types can be put in line
183	# - Sum types can even be put in line, if you have List<T> rather than
184	# Array<T>. Array implies O(1) random access; List doesn't.
185	for item in obj_list:
186	try:
187	ref = EncodeObj(item, enc, out)
188	except EncodeError as e:
189	if not e.details_printed:
190	util.log("Error encoding array: %s (item %s)", e, item)
191	e.details_printed = True
192	raise
193	enc.Ref(ref, array_chunk)
194
195	this_ref = out.Write(enc.PaddedBlock(array_chunk))
196	return this_ref
197
198
199	def EncodeObj(obj, enc, out):
200	"""
201	Args:
202	obj: Obj to encode
203	enc: encoding params
204	out: output file
205
206	Returns:
207	ref: Reference to the last block
208	"""
209	# Algorithm: Depth first, post-order traversal. First obj is the first leaf.
210	# last obj is the root.
211	#
212	# Array is a homogeneous type.
213
214	this_chunk = bytearray()
215	assert isinstance(obj, py_meta.CompoundObj), \
216	'%r is not a compound obj (%r)' % (obj, obj.__class__)
217
218	# Constructor objects have a tag.
219	if isinstance(obj.ASDL_TYPE, asdl.Constructor):
220	enc.Tag(obj.tag, this_chunk)
221
222	for name, desc in obj.ASDL_TYPE.GetFields(): # encode in order
223	field_val = getattr(obj, name)
224
225	# TODO:
226	# - Float would be inline, etc.
227	# - Repeated value: write them all adjacent to each other?
228
229	is_maybe = False
230	if isinstance(desc, asdl.MaybeType):
231	is_maybe = True
232	desc = desc.desc # descent
233
234	#
235	# Now look at types
236	#
237
238	if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
239	enc.Int(field_val, this_chunk)
240
241	elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
242	# Encode enums as integers. TODO later: Don't use 3 bytes! Can use 1
243	# byte for most enums.
244	enc.Int(field_val.enum_id, this_chunk)
245
246	# Write variable length field first, assuming that it's a ref/pointer.
247	# TODO: allow one inline, hanging string or array per record.
248	elif isinstance(desc, asdl.StrType):
249	ref = out.Write(enc.PaddedStr(field_val))
250	enc.Ref(ref, this_chunk)
251
252	elif isinstance(desc, asdl.ArrayType):
253	item_desc = desc.desc
254	ref = EncodeArray(field_val, item_desc, enc, out)
255	enc.Ref(ref, this_chunk)
256
257	elif isinstance(desc, asdl.UserType):
258	if is_maybe and field_val is None: # e.g. id? prefix_op
259	enc.Ref(0, this_chunk)
260	else:
261	# Assume Id for now
262	enc.Int(field_val.enum_value, this_chunk)
263
264	else:
265	if is_maybe and field_val is None:
266	enc.Ref(0, this_chunk)
267	else:
268	try:
269	ref = EncodeObj(field_val, enc, out)
270	except EncodeError as e:
271	if not e.details_printed:
272	util.log("Error encoding %s : %s (val %s)", name, e, field_val)
273	e.details_printed = True
274	raise
275	enc.Ref(ref, this_chunk)
276
277	# Write the parent record
278	this_ref = out.Write(enc.PaddedBlock(this_chunk))
279	return this_ref
280
281
282	def EncodeRoot(obj, enc, out):
283	ref = out.Write(b'OHP\x01') # header, version 1
284	assert ref == 0
285	# 4-byte alignment, then 3 byte placeholder for the root ref.
286	ref = out.Write(b'\4\0\0\0')
287	assert ref == 1
288
289	root_ref = EncodeObj(obj, enc, out)
290	chunk = bytearray()
291	enc.Ref(root_ref, chunk)
292	out.WriteRootRef(chunk) # back up and write it
293
294	#print("Root obj ref:", root_ref)