| 1 | """
|
| 2 | encode.py
|
| 3 | """
|
| 4 |
|
| 5 | from asdl import asdl_ as asdl
|
| 6 | from asdl import py_meta
|
| 7 | from asdl import const
|
| 8 |
|
| 9 | from core import util
|
| 10 |
|
| 11 |
|
| 12 | class EncodeError(Exception):
|
| 13 | def __init__(self, *args, **kwargs):
|
| 14 | Exception.__init__(self, *args, **kwargs)
|
| 15 | self.details_printed = False
|
| 16 |
|
| 17 |
|
| 18 | _DEFAULT_ALIGNMENT = 4
|
| 19 |
|
| 20 | class BinOutput(object):
|
| 21 | """Write aligned blocks here. Keeps track of block indexes for refs."""
|
| 22 |
|
| 23 | def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
|
| 24 | self.f = f
|
| 25 | # index of last block, to return as a ref.
|
| 26 | self.last_block = 0
|
| 27 | self.alignment = alignment
|
| 28 |
|
| 29 | def WriteRootRef(self, chunk):
|
| 30 | self.f.seek(5) # seek past 'OHP\x01\x04'
|
| 31 |
|
| 32 | assert len(chunk) == 3
|
| 33 | self.f.write(chunk)
|
| 34 |
|
| 35 | def Write(self, chunk):
|
| 36 | """
|
| 37 | Return a block pointer/index.
|
| 38 | """
|
| 39 | # Input should be padded
|
| 40 | assert len(chunk) % self.alignment == 0
|
| 41 | self.f.write(chunk)
|
| 42 |
|
| 43 | ref = self.last_block
|
| 44 | num_blocks = len(chunk) // self.alignment # int division
|
| 45 | #print('WROTE %d blocks' % num_blocks)
|
| 46 | self.last_block += num_blocks
|
| 47 |
|
| 48 | # Return a reference to the beginning
|
| 49 | return ref
|
| 50 |
|
| 51 |
|
| 52 | class Params(object):
|
| 53 | """Encoding parameters.
|
| 54 |
|
| 55 | Hm most of these settings should be per-field, expressed in the schema. The
|
| 56 | only global one is the ref/pointer alignment. 4 and 8 are the most likely
|
| 57 | choices, and 4 is probably fine, because you have 64 MB of addressable memory
|
| 58 | with 24 bit pointers.
|
| 59 | """
|
| 60 |
|
| 61 | def __init__(self, alignment=_DEFAULT_ALIGNMENT,
|
| 62 | int_width=const.DEFAULT_INT_WIDTH):
|
| 63 | self.alignment = alignment
|
| 64 | self.pointer_type = 'uint32_t'
|
| 65 |
|
| 66 | self.tag_width = 1 # for ArithVar vs ArithWord.
|
| 67 | self.int_width = int_width
|
| 68 | self.ref_width = int_width # Constant 3, used by gen_cpp
|
| 69 |
|
| 70 | # used for fd, line/col
|
| 71 | # also I guess steuff like SimpleCommand
|
| 72 | self.index_width = 2 # 16 bits, e.g. max 64K entries in an array
|
| 73 |
|
| 74 | self.max_int = 1 << (self.int_width * 8)
|
| 75 | self.max_index = 1 << (self.index_width * 8)
|
| 76 | self.max_tag = 1 << (self.tag_width * 8)
|
| 77 |
|
| 78 | def Tag(self, i, chunk):
|
| 79 | if i > self.max_tag:
|
| 80 | raise AssertionError('Invalid id %r' % i)
|
| 81 | chunk.append(i & 0xFF)
|
| 82 |
|
| 83 | def Int(self, n, chunk):
|
| 84 | if n < 0:
|
| 85 | raise EncodeError(
|
| 86 | "ASDL can't currently encode negative numbers. Got %d" % n)
|
| 87 | if n > self.max_int:
|
| 88 | raise EncodeError(
|
| 89 | '%d is too big to fit in %d bytes' % (n, self.int_width))
|
| 90 |
|
| 91 | for i in range(self.int_width):
|
| 92 | chunk.append(n & 0xFF)
|
| 93 | n >>= 8
|
| 94 |
|
| 95 | def Ref(self, n, chunk):
|
| 96 | # NOTE: ref width is currently the same as int width. Could be different.
|
| 97 | self.Int(n, chunk)
|
| 98 |
|
| 99 | def _Pad(self, chunk):
|
| 100 | n = len(chunk)
|
| 101 | a = self.alignment
|
| 102 | if n % a != 0:
|
| 103 | chunk.extend(b'\x00' * (a - (n % a)))
|
| 104 | return chunk
|
| 105 |
|
| 106 | # Right now all strings are references. Later they could be inline.
|
| 107 | def Str(self, s, chunk):
|
| 108 | # NOTE: For variable, proc, and function names, it could make sense to
|
| 109 | # pre-compute and store a hash value. They will be looked up in the stack
|
| 110 | # and so forth.
|
| 111 | # - You could also return a obj number or object ID.
|
| 112 | chunk.extend(s)
|
| 113 | chunk.append(0) # NUL terminator
|
| 114 |
|
| 115 | def PaddedStr(self, s):
|
| 116 | # NOTE:
|
| 117 | # - The encoder could also have an intern table to save space.
|
| 118 | # - Str and PaddedStr will both return char* ? Should we allow them to
|
| 119 | # VARY with the same schema, is a value/ref type PART of the schema? It's
|
| 120 | # basically small size optimization and "flexible array" optimization. I
|
| 121 | # think you want that possibility.
|
| 122 | chunk = bytearray()
|
| 123 | self.Str(s, chunk)
|
| 124 | return self._Pad(chunk)
|
| 125 |
|
| 126 | def Bytes(self, buf, chunk):
|
| 127 | n = len(buf)
|
| 128 | if n >= self.max_index:
|
| 129 | raise EncodeError("bytes object is too long (%d)" % n)
|
| 130 | for i in range(self.index_width):
|
| 131 | chunk.append(n & 0xFF)
|
| 132 | n >>= 8
|
| 133 | chunk.extend(buf)
|
| 134 |
|
| 135 | def PaddedBytes(self, buf):
|
| 136 | chunk = bytearray()
|
| 137 | self.Bytes(buf, chunk)
|
| 138 | return self._Pad(chunk)
|
| 139 |
|
| 140 | def PaddedBlock(self, chunk):
|
| 141 | return self._Pad(chunk)
|
| 142 |
|
| 143 |
|
| 144 | def EncodeArray(obj_list, item_desc, enc, out):
|
| 145 | """
|
| 146 | Args:
|
| 147 | obj_list: List of Obj values
|
| 148 |
|
| 149 | Returns:
|
| 150 | ref
|
| 151 | """
|
| 152 | array_chunk = bytearray()
|
| 153 | enc.Int(len(obj_list), array_chunk) # Length prefix
|
| 154 |
|
| 155 | if isinstance(item_desc, asdl.IntType) or \
|
| 156 | isinstance(item_desc, asdl.BoolType):
|
| 157 | for item in obj_list:
|
| 158 | enc.Int(item, array_chunk)
|
| 159 |
|
| 160 | elif isinstance(item_desc, asdl.UserType):
|
| 161 | # Assume Id for now
|
| 162 | for item in obj_list:
|
| 163 | enc.Int(item.enum_value, array_chunk)
|
| 164 |
|
| 165 | elif isinstance(item_desc, asdl.StrType):
|
| 166 | for item in obj_list:
|
| 167 | ref = out.Write(enc.PaddedStr(item))
|
| 168 | enc.Ref(ref, array_chunk)
|
| 169 |
|
| 170 | elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
|
| 171 | for item in obj_list:
|
| 172 | enc.Int(item.enum_id, array_chunk)
|
| 173 |
|
| 174 | else:
|
| 175 | # A simple value is either an int, enum, or pointer. (Later: Iter<Str>
|
| 176 | # might be possible for locality.)
|
| 177 | assert isinstance(item_desc, asdl.Sum) or \
|
| 178 | isinstance(item_desc, asdl.Product), item_desc
|
| 179 |
|
| 180 | # This is like vector<T*>
|
| 181 | # Later:
|
| 182 | # - Product types can be put in line
|
| 183 | # - Sum types can even be put in line, if you have List<T> rather than
|
| 184 | # Array<T>. Array implies O(1) random access; List doesn't.
|
| 185 | for item in obj_list:
|
| 186 | try:
|
| 187 | ref = EncodeObj(item, enc, out)
|
| 188 | except EncodeError as e:
|
| 189 | if not e.details_printed:
|
| 190 | util.log("Error encoding array: %s (item %s)", e, item)
|
| 191 | e.details_printed = True
|
| 192 | raise
|
| 193 | enc.Ref(ref, array_chunk)
|
| 194 |
|
| 195 | this_ref = out.Write(enc.PaddedBlock(array_chunk))
|
| 196 | return this_ref
|
| 197 |
|
| 198 |
|
| 199 | def EncodeObj(obj, enc, out):
|
| 200 | """
|
| 201 | Args:
|
| 202 | obj: Obj to encode
|
| 203 | enc: encoding params
|
| 204 | out: output file
|
| 205 |
|
| 206 | Returns:
|
| 207 | ref: Reference to the last block
|
| 208 | """
|
| 209 | # Algorithm: Depth first, post-order traversal. First obj is the first leaf.
|
| 210 | # last obj is the root.
|
| 211 | #
|
| 212 | # Array is a homogeneous type.
|
| 213 |
|
| 214 | this_chunk = bytearray()
|
| 215 | assert isinstance(obj, py_meta.CompoundObj), \
|
| 216 | '%r is not a compound obj (%r)' % (obj, obj.__class__)
|
| 217 |
|
| 218 | # Constructor objects have a tag.
|
| 219 | if isinstance(obj.ASDL_TYPE, asdl.Constructor):
|
| 220 | enc.Tag(obj.tag, this_chunk)
|
| 221 |
|
| 222 | for name, desc in obj.ASDL_TYPE.GetFields(): # encode in order
|
| 223 | field_val = getattr(obj, name)
|
| 224 |
|
| 225 | # TODO:
|
| 226 | # - Float would be inline, etc.
|
| 227 | # - Repeated value: write them all adjacent to each other?
|
| 228 |
|
| 229 | is_maybe = False
|
| 230 | if isinstance(desc, asdl.MaybeType):
|
| 231 | is_maybe = True
|
| 232 | desc = desc.desc # descent
|
| 233 |
|
| 234 | #
|
| 235 | # Now look at types
|
| 236 | #
|
| 237 |
|
| 238 | if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
|
| 239 | enc.Int(field_val, this_chunk)
|
| 240 |
|
| 241 | elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
|
| 242 | # Encode enums as integers. TODO later: Don't use 3 bytes! Can use 1
|
| 243 | # byte for most enums.
|
| 244 | enc.Int(field_val.enum_id, this_chunk)
|
| 245 |
|
| 246 | # Write variable length field first, assuming that it's a ref/pointer.
|
| 247 | # TODO: allow one inline, hanging string or array per record.
|
| 248 | elif isinstance(desc, asdl.StrType):
|
| 249 | ref = out.Write(enc.PaddedStr(field_val))
|
| 250 | enc.Ref(ref, this_chunk)
|
| 251 |
|
| 252 | elif isinstance(desc, asdl.ArrayType):
|
| 253 | item_desc = desc.desc
|
| 254 | ref = EncodeArray(field_val, item_desc, enc, out)
|
| 255 | enc.Ref(ref, this_chunk)
|
| 256 |
|
| 257 | elif isinstance(desc, asdl.UserType):
|
| 258 | if is_maybe and field_val is None: # e.g. id? prefix_op
|
| 259 | enc.Ref(0, this_chunk)
|
| 260 | else:
|
| 261 | # Assume Id for now
|
| 262 | enc.Int(field_val.enum_value, this_chunk)
|
| 263 |
|
| 264 | else:
|
| 265 | if is_maybe and field_val is None:
|
| 266 | enc.Ref(0, this_chunk)
|
| 267 | else:
|
| 268 | try:
|
| 269 | ref = EncodeObj(field_val, enc, out)
|
| 270 | except EncodeError as e:
|
| 271 | if not e.details_printed:
|
| 272 | util.log("Error encoding %s : %s (val %s)", name, e, field_val)
|
| 273 | e.details_printed = True
|
| 274 | raise
|
| 275 | enc.Ref(ref, this_chunk)
|
| 276 |
|
| 277 | # Write the parent record
|
| 278 | this_ref = out.Write(enc.PaddedBlock(this_chunk))
|
| 279 | return this_ref
|
| 280 |
|
| 281 |
|
| 282 | def EncodeRoot(obj, enc, out):
|
| 283 | ref = out.Write(b'OHP\x01') # header, version 1
|
| 284 | assert ref == 0
|
| 285 | # 4-byte alignment, then 3 byte placeholder for the root ref.
|
| 286 | ref = out.Write(b'\4\0\0\0')
|
| 287 | assert ref == 1
|
| 288 |
|
| 289 | root_ref = EncodeObj(obj, enc, out)
|
| 290 | chunk = bytearray()
|
| 291 | enc.Ref(root_ref, chunk)
|
| 292 | out.WriteRootRef(chunk) # back up and write it
|
| 293 |
|
| 294 | #print("Root obj ref:", root_ref)
|