| 1 | """
 | 
| 2 | encode.py
 | 
| 3 | """
 | 
| 4 | 
 | 
| 5 | from asdl import asdl_ as asdl
 | 
| 6 | from asdl import py_meta
 | 
| 7 | from asdl import const
 | 
| 8 | 
 | 
| 9 | from core import util
 | 
| 10 | 
 | 
| 11 | 
 | 
| 12 | class EncodeError(Exception):
 | 
| 13 |   def __init__(self, *args, **kwargs):
 | 
| 14 |     Exception.__init__(self, *args, **kwargs)
 | 
| 15 |     self.details_printed = False
 | 
| 16 | 
 | 
| 17 | 
 | 
| 18 | _DEFAULT_ALIGNMENT = 4
 | 
| 19 | 
 | 
| 20 | class BinOutput(object):
 | 
| 21 |   """Write aligned blocks here.  Keeps track of block indexes for refs."""
 | 
| 22 | 
 | 
| 23 |   def __init__(self, f, alignment=_DEFAULT_ALIGNMENT):
 | 
| 24 |     self.f = f
 | 
| 25 |     # index of last block, to return as a ref.
 | 
| 26 |     self.last_block = 0
 | 
| 27 |     self.alignment = alignment
 | 
| 28 | 
 | 
| 29 |   def WriteRootRef(self, chunk):
 | 
| 30 |     self.f.seek(5)  # seek past 'OHP\x01\x04'
 | 
| 31 | 
 | 
| 32 |     assert len(chunk) == 3
 | 
| 33 |     self.f.write(chunk)
 | 
| 34 | 
 | 
| 35 |   def Write(self, chunk):
 | 
| 36 |     """
 | 
| 37 |     Return a block pointer/index.
 | 
| 38 |     """
 | 
| 39 |     # Input should be padded
 | 
| 40 |     assert len(chunk) % self.alignment == 0
 | 
| 41 |     self.f.write(chunk)
 | 
| 42 | 
 | 
| 43 |     ref = self.last_block
 | 
| 44 |     num_blocks = len(chunk) // self.alignment  # int division
 | 
| 45 |     #print('WROTE %d blocks' % num_blocks)
 | 
| 46 |     self.last_block += num_blocks
 | 
| 47 | 
 | 
| 48 |     # Return a reference to the beginning
 | 
| 49 |     return ref
 | 
| 50 | 
 | 
| 51 | 
 | 
| 52 | class Params(object):
 | 
| 53 |   """Encoding parameters.
 | 
| 54 | 
 | 
| 55 |   Hm most of these settings should be per-field, expressed in the schema.  The
 | 
| 56 |   only global one is the ref/pointer alignment.  4 and 8 are the most likely
 | 
| 57 |   choices, and 4 is probably fine, because you have 64 MB of addressable memory
 | 
| 58 |   with 24 bit pointers.
 | 
| 59 |   """
 | 
| 60 | 
 | 
| 61 |   def __init__(self, alignment=_DEFAULT_ALIGNMENT,
 | 
| 62 |                int_width=const.DEFAULT_INT_WIDTH):
 | 
| 63 |     self.alignment = alignment
 | 
| 64 |     self.pointer_type = 'uint32_t'
 | 
| 65 | 
 | 
| 66 |     self.tag_width = 1  # for ArithVar vs ArithWord.
 | 
| 67 |     self.int_width = int_width
 | 
| 68 |     self.ref_width = int_width  # Constant 3, used by gen_cpp
 | 
| 69 | 
 | 
| 70 |     # used for fd, line/col
 | 
| 71 |     # also I guess steuff like SimpleCommand
 | 
| 72 |     self.index_width = 2  # 16 bits, e.g. max 64K entries in an array
 | 
| 73 | 
 | 
| 74 |     self.max_int = 1 << (self.int_width * 8)
 | 
| 75 |     self.max_index = 1 << (self.index_width * 8)
 | 
| 76 |     self.max_tag = 1 << (self.tag_width * 8)
 | 
| 77 | 
 | 
| 78 |   def Tag(self, i, chunk):
 | 
| 79 |     if i > self.max_tag:
 | 
| 80 |       raise AssertionError('Invalid id %r' % i)
 | 
| 81 |     chunk.append(i & 0xFF)
 | 
| 82 | 
 | 
| 83 |   def Int(self, n, chunk):
 | 
| 84 |     if n < 0:
 | 
| 85 |       raise EncodeError(
 | 
| 86 |           "ASDL can't currently encode negative numbers.  Got %d" % n)
 | 
| 87 |     if n > self.max_int:
 | 
| 88 |       raise EncodeError(
 | 
| 89 |           '%d is too big to fit in %d bytes' % (n, self.int_width))
 | 
| 90 | 
 | 
| 91 |     for i in range(self.int_width):
 | 
| 92 |       chunk.append(n & 0xFF)
 | 
| 93 |       n >>= 8
 | 
| 94 | 
 | 
| 95 |   def Ref(self, n, chunk):
 | 
| 96 |     # NOTE: ref width is currently the same as int width.  Could be different.
 | 
| 97 |     self.Int(n, chunk)
 | 
| 98 | 
 | 
| 99 |   def _Pad(self, chunk):
 | 
| 100 |     n = len(chunk)
 | 
| 101 |     a = self.alignment
 | 
| 102 |     if n % a != 0:
 | 
| 103 |       chunk.extend(b'\x00' * (a - (n % a)))
 | 
| 104 |     return chunk
 | 
| 105 | 
 | 
| 106 |   # Right now all strings are references.  Later they could be inline.
 | 
| 107 |   def Str(self, s, chunk):
 | 
| 108 |     # NOTE: For variable, proc, and function names, it could make sense to
 | 
| 109 |     # pre-compute and store a hash value.  They will be looked up in the stack
 | 
| 110 |     # and so forth.
 | 
| 111 |     # - You could also return a obj number or object ID.
 | 
| 112 |     chunk.extend(s)
 | 
| 113 |     chunk.append(0)  # NUL terminator
 | 
| 114 | 
 | 
| 115 |   def PaddedStr(self, s):
 | 
| 116 |     # NOTE:
 | 
| 117 |     # - The encoder could also have an intern table to save space.
 | 
| 118 |     # - Str and PaddedStr will both return char* ?  Should we allow them to
 | 
| 119 |     # VARY with the same schema, is a value/ref type PART of the schema?  It's
 | 
| 120 |     # basically small size optimization and "flexible array" optimization.  I
 | 
| 121 |     # think you want that possibility.
 | 
| 122 |     chunk = bytearray()
 | 
| 123 |     self.Str(s, chunk)
 | 
| 124 |     return self._Pad(chunk)
 | 
| 125 | 
 | 
| 126 |   def Bytes(self, buf, chunk):
 | 
| 127 |     n = len(buf)
 | 
| 128 |     if n >= self.max_index:
 | 
| 129 |       raise EncodeError("bytes object is too long (%d)" % n)
 | 
| 130 |     for i in range(self.index_width):
 | 
| 131 |       chunk.append(n & 0xFF)
 | 
| 132 |       n >>= 8
 | 
| 133 |     chunk.extend(buf)
 | 
| 134 | 
 | 
| 135 |   def PaddedBytes(self, buf):
 | 
| 136 |     chunk = bytearray()
 | 
| 137 |     self.Bytes(buf, chunk)
 | 
| 138 |     return self._Pad(chunk)
 | 
| 139 | 
 | 
| 140 |   def PaddedBlock(self, chunk):
 | 
| 141 |     return self._Pad(chunk)
 | 
| 142 | 
 | 
| 143 | 
 | 
| 144 | def EncodeArray(obj_list, item_desc, enc, out):
 | 
| 145 |   """
 | 
| 146 |   Args:
 | 
| 147 |     obj_list: List of Obj values
 | 
| 148 | 
 | 
| 149 |   Returns:
 | 
| 150 |     ref
 | 
| 151 |   """
 | 
| 152 |   array_chunk = bytearray()
 | 
| 153 |   enc.Int(len(obj_list), array_chunk)  # Length prefix
 | 
| 154 | 
 | 
| 155 |   if isinstance(item_desc, asdl.IntType) or \
 | 
| 156 |       isinstance(item_desc, asdl.BoolType):
 | 
| 157 |     for item in obj_list:
 | 
| 158 |       enc.Int(item, array_chunk)
 | 
| 159 | 
 | 
| 160 |   elif isinstance(item_desc, asdl.UserType):
 | 
| 161 |     # Assume Id for now
 | 
| 162 |     for item in obj_list:
 | 
| 163 |       enc.Int(item.enum_value, array_chunk)
 | 
| 164 | 
 | 
| 165 |   elif isinstance(item_desc, asdl.StrType):
 | 
| 166 |     for item in obj_list:
 | 
| 167 |       ref = out.Write(enc.PaddedStr(item))
 | 
| 168 |       enc.Ref(ref, array_chunk)
 | 
| 169 | 
 | 
| 170 |   elif isinstance(item_desc, asdl.Sum) and asdl.is_simple(item_desc):
 | 
| 171 |     for item in obj_list:
 | 
| 172 |       enc.Int(item.enum_id, array_chunk)
 | 
| 173 | 
 | 
| 174 |   else:
 | 
| 175 |     # A simple value is either an int, enum, or pointer.  (Later: Iter<Str>
 | 
| 176 |     # might be possible for locality.)
 | 
| 177 |     assert isinstance(item_desc, asdl.Sum) or \
 | 
| 178 |         isinstance(item_desc, asdl.Product), item_desc
 | 
| 179 | 
 | 
| 180 |     # This is like vector<T*>
 | 
| 181 |     # Later:
 | 
| 182 |     # - Product types can be put in line
 | 
| 183 |     # - Sum types can even be put in line, if you have List<T> rather than
 | 
| 184 |     # Array<T>.  Array implies O(1) random access; List doesn't.
 | 
| 185 |     for item in obj_list:
 | 
| 186 |       try:
 | 
| 187 |         ref = EncodeObj(item, enc, out)
 | 
| 188 |       except EncodeError as e:
 | 
| 189 |         if not e.details_printed:
 | 
| 190 |           util.log("Error encoding array: %s (item %s)", e, item)
 | 
| 191 |           e.details_printed = True
 | 
| 192 |         raise
 | 
| 193 |       enc.Ref(ref, array_chunk)
 | 
| 194 | 
 | 
| 195 |   this_ref = out.Write(enc.PaddedBlock(array_chunk))
 | 
| 196 |   return this_ref
 | 
| 197 | 
 | 
| 198 | 
 | 
| 199 | def EncodeObj(obj, enc, out):
 | 
| 200 |   """
 | 
| 201 |   Args:
 | 
| 202 |     obj: Obj to encode
 | 
| 203 |     enc: encoding params
 | 
| 204 |     out: output file
 | 
| 205 | 
 | 
| 206 |   Returns:
 | 
| 207 |     ref: Reference to the last block
 | 
| 208 |   """
 | 
| 209 |   # Algorithm: Depth first, post-order traversal.  First obj is the first leaf.
 | 
| 210 |   # last obj is the root.
 | 
| 211 |   #
 | 
| 212 |   # Array is a homogeneous type.
 | 
| 213 | 
 | 
| 214 |   this_chunk = bytearray()
 | 
| 215 |   assert isinstance(obj, py_meta.CompoundObj), \
 | 
| 216 |     '%r is not a compound obj (%r)' % (obj, obj.__class__)
 | 
| 217 | 
 | 
| 218 |   # Constructor objects have a tag.
 | 
| 219 |   if isinstance(obj.ASDL_TYPE, asdl.Constructor):
 | 
| 220 |     enc.Tag(obj.tag, this_chunk)
 | 
| 221 | 
 | 
| 222 |   for name, desc in obj.ASDL_TYPE.GetFields():  # encode in order
 | 
| 223 |     field_val = getattr(obj, name)
 | 
| 224 | 
 | 
| 225 |     # TODO:
 | 
| 226 |     # - Float would be inline, etc.
 | 
| 227 |     # - Repeated value: write them all adjacent to each other?
 | 
| 228 | 
 | 
| 229 |     is_maybe = False
 | 
| 230 |     if isinstance(desc, asdl.MaybeType):
 | 
| 231 |       is_maybe = True
 | 
| 232 |       desc = desc.desc  # descent
 | 
| 233 | 
 | 
| 234 |     #
 | 
| 235 |     # Now look at types
 | 
| 236 |     #
 | 
| 237 | 
 | 
| 238 |     if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
 | 
| 239 |       enc.Int(field_val, this_chunk)
 | 
| 240 | 
 | 
| 241 |     elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
 | 
| 242 |       # Encode enums as integers.  TODO later: Don't use 3 bytes!  Can use 1
 | 
| 243 |       # byte for most enums.
 | 
| 244 |       enc.Int(field_val.enum_id, this_chunk)
 | 
| 245 | 
 | 
| 246 |     # Write variable length field first, assuming that it's a ref/pointer.
 | 
| 247 |     # TODO: allow one inline, hanging string or array per record.
 | 
| 248 |     elif isinstance(desc, asdl.StrType):
 | 
| 249 |       ref = out.Write(enc.PaddedStr(field_val))
 | 
| 250 |       enc.Ref(ref, this_chunk)
 | 
| 251 | 
 | 
| 252 |     elif isinstance(desc, asdl.ArrayType):
 | 
| 253 |       item_desc = desc.desc
 | 
| 254 |       ref = EncodeArray(field_val, item_desc, enc, out)
 | 
| 255 |       enc.Ref(ref, this_chunk)
 | 
| 256 | 
 | 
| 257 |     elif isinstance(desc, asdl.UserType):
 | 
| 258 |       if is_maybe and field_val is None:  # e.g. id? prefix_op
 | 
| 259 |         enc.Ref(0, this_chunk)
 | 
| 260 |       else:
 | 
| 261 |         # Assume Id for now
 | 
| 262 |         enc.Int(field_val.enum_value, this_chunk)
 | 
| 263 | 
 | 
| 264 |     else:
 | 
| 265 |       if is_maybe and field_val is None:
 | 
| 266 |         enc.Ref(0, this_chunk)
 | 
| 267 |       else:
 | 
| 268 |         try:
 | 
| 269 |           ref = EncodeObj(field_val, enc, out)
 | 
| 270 |         except EncodeError as e:
 | 
| 271 |           if not e.details_printed:
 | 
| 272 |             util.log("Error encoding %s : %s (val %s)", name, e, field_val)
 | 
| 273 |             e.details_printed = True
 | 
| 274 |           raise
 | 
| 275 |         enc.Ref(ref, this_chunk)
 | 
| 276 | 
 | 
| 277 |   # Write the parent record
 | 
| 278 |   this_ref = out.Write(enc.PaddedBlock(this_chunk))
 | 
| 279 |   return this_ref
 | 
| 280 | 
 | 
| 281 | 
 | 
| 282 | def EncodeRoot(obj, enc, out):
 | 
| 283 |   ref = out.Write(b'OHP\x01')  # header, version 1
 | 
| 284 |   assert ref == 0
 | 
| 285 |   # 4-byte alignment, then 3 byte placeholder for the root ref.
 | 
| 286 |   ref = out.Write(b'\4\0\0\0')
 | 
| 287 |   assert ref == 1
 | 
| 288 | 
 | 
| 289 |   root_ref = EncodeObj(obj, enc, out)
 | 
| 290 |   chunk = bytearray()
 | 
| 291 |   enc.Ref(root_ref, chunk)
 | 
| 292 |   out.WriteRootRef(chunk)  # back up and write it
 | 
| 293 | 
 | 
| 294 |   #print("Root obj ref:", root_ref)
 |