1 | """ codecs -- Python Codec Registry, API and helpers.
|
2 |
|
3 |
|
4 | Written by Marc-Andre Lemburg (mal@lemburg.com).
|
5 |
|
6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
7 |
|
8 | """#"
|
9 |
|
10 | import __builtin__, sys
|
11 |
|
12 | ### Registry and builtin stateless codec functions
|
13 |
|
14 | try:
|
15 | from _codecs import *
|
16 | except ImportError, why:
|
17 | raise SystemError('Failed to load the builtin codecs: %s' % why)
|
18 |
|
# Names exported by "from codecs import *"; this is also the module's
# documented public API (registry access, BOM constants, codec base
# classes and the lookup helper functions).
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
|
32 |
|
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Pick the BOM variants matching this machine's native byte order, so
# that BOM/BOM_UTF16/BOM_UTF32 can be written without further checks.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
# NOTE: despite the names, BOM32_* alias the UTF-16 BOMs and BOM64_*
# the UTF-32 BOMs (see the assignments below); kept for compatibility.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
|
77 |
|
78 |
|
79 | ### Codec base classes (defining the API)
|
80 |
|
class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API used by Python itself to blacklist the known
    # non-Unicode codecs shipped in the standard library.  A more
    # general mechanism for telling test encodings apart from real
    # codecs is hoped for in Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                _is_text_encoding=None):
        # The 4-tuple layout (encode, decode, streamreader, streamwriter)
        # keeps compatibility with the old tuple-based codec lookup API.
        obj = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        obj.name = name
        obj.encode = encode
        obj.decode = decode
        obj.incrementalencoder = incrementalencoder
        obj.incrementaldecoder = incrementaldecoder
        obj.streamreader = streamreader
        obj.streamwriter = streamwriter
        # Only override the class-level default when explicitly given.
        if _is_text_encoding is not None:
            obj._is_text_encoding = _is_text_encoding
        return obj

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
|
109 |
|
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Error handling for .encode()/.decode() is selected with the
        errors string argument. These values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - skip the offending character and continue with
              the next one
         'replace' - substitute a suitable replacement character;
              the builtin Unicode codecs use the official U+FFFD
              REPLACEMENT CHARACTER when decoding and '?' when
              encoding
         'xmlcharrefreplace' - substitute the matching XML character
              reference (encoding only)
         'backslashreplace' - substitute backslashed escape
              sequences (encoding only)

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme; it defaults to
            'strict' handling.

            Implementations must not keep state on the Codec
            instance; codecs that need state to encode efficiently
            belong in a StreamWriter.

            Zero length input must be accepted and must produce an
            empty object of the output type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object exposing the bf_getreadbuf
            buffer slot -- Python strings, buffer objects and memory
            mapped files are examples of such objects.

            errors selects the error handling scheme; it defaults to
            'strict' handling.

            Implementations must not keep state on the Codec
            instance; codecs that need state to decode efficiently
            belong in a StreamReader.

            Zero length input must be accepted and must produce an
            empty object of the output type.

        """
        raise NotImplementedError
|
173 |
|
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input piecewise.  The
    input can be fed chunk by chunk to encode(); the instance keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module
        docstring for the recognised values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Restore the encoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously returned by getstate().
        """
|
213 |
|
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that must keep part of their
    (unencoded) input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Unencoded input carried over from the previous encode() call.
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses must implement this: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        result, consumed = self._buffer_encode(pending, self.errors, final)
        # Remember the unencoded tail for the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
|
246 |
|
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input piecewise.  The
    input can be fed chunk by chunk to decode(); the instance keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module
        docstring for the recognised values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Restore the decoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder as a
        (buffered_input, additional_state_info) tuple.

        buffered_input is a bytes object holding input passed to
        decode() that has not been converted yet.
        additional_state_info is a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input
        having been processed.  In the initial state and after
        reset() this method returns (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously returned by getstate().

        setstate((b"", 0)) must be equivalent to reset().
        """
|
295 |
|
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that may receive incomplete
    byte sequences and therefore must buffer trailing input between
    calls to decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Undecoded input carried over from the previous decode() call.
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Subclasses must implement this: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        result, consumed = self._buffer_decode(pending, self.errors, final)
        # Remember the undecoded tail for the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # The additional state info is always 0 for buffered decoders.
        return (self.buffer, 0)

    def setstate(self, state):
        # The additional state info is ignored; only the buffer matters.
        self.buffer = state[0]
|
330 |
|
331 | #
|
332 | # The StreamWriter and StreamReader class provide generic working
|
333 | # interfaces which can be used to implement new encoding submodules
|
334 | # very easily. See encodings/utf_8.py for an example on how this is
|
335 | # done.
|
336 | #
|
337 |
|
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object opened for writing
            (binary) data.

            The errors keyword argument selects the error handling
            scheme; these values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and continue
             'replace'- substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the matching XML
                 character reference.
             'backslashreplace' - substitute backslashed escape
                 sequences (encoding only).

            Additional values can be made available via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encodes object and writes the result to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Joins the given list of strings and writes the result to
            the stream using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Afterwards the output is in a clean state and fresh data
            can be appended without having to rescan the whole stream
            to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Only a rewind to the very start of the stream discards the
        # kept codec state.
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
|
408 |
|
409 | ###
|
410 |
|
class StreamReader(Codec):

    # NOTE: subclasses provide .decode(); this base class implements the
    # buffered reading machinery (read/readline/readlines) on top of it.

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Undecoded bytes left over from the previous read() call.
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # Lines cached by readline(); while set, charbuffer is None.
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Stateless decode hook; concrete codecs override this.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Salvage everything up to the error position; if that
                    # still contains a complete first line, return it rather
                    # than propagating the error (readline() relies on this).
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # 72 is a small first guess at a line length; it is doubled
        # below (while < 8000) until a complete line shows up.
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

                line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
|
650 |
|
651 | ###
|
652 |
|
653 | class StreamReaderWriter:
|
654 |
|
655 | """ StreamReaderWriter instances allow wrapping streams which
|
656 | work in both read and write modes.
|
657 |
|
658 | The design is such that one can use the factory functions
|
659 | returned by the codec.lookup() function to construct the
|
660 | instance.
|
661 |
|
662 | """
|
663 | # Optional attributes set by the file wrappers below
|
664 | encoding = 'unknown'
|
665 |
|
666 | def __init__(self, stream, Reader, Writer, errors='strict'):
|
667 |
|
668 | """ Creates a StreamReaderWriter instance.
|
669 |
|
670 | stream must be a Stream-like object.
|
671 |
|
672 | Reader, Writer must be factory functions or classes
|
673 | providing the StreamReader, StreamWriter interface resp.
|
674 |
|
675 | Error handling is done in the same way as defined for the
|
676 | StreamWriter/Readers.
|
677 |
|
678 | """
|
679 | self.stream = stream
|
680 | self.reader = Reader(stream, errors)
|
681 | self.writer = Writer(stream, errors)
|
682 | self.errors = errors
|
683 |
|
684 | def read(self, size=-1):
|
685 |
|
686 | return self.reader.read(size)
|
687 |
|
688 | def readline(self, size=None):
|
689 |
|
690 | return self.reader.readline(size)
|
691 |
|
692 | def readlines(self, sizehint=None):
|
693 |
|
694 | return self.reader.readlines(sizehint)
|
695 |
|
696 | def next(self):
|
697 |
|
698 | """ Return the next decoded line from the input stream."""
|
699 | return self.reader.next()
|
700 |
|
701 | def __iter__(self):
|
702 | return self
|
703 |
|
704 | def write(self, data):
|
705 |
|
706 | return self.writer.write(data)
|
707 |
|
708 | def writelines(self, list):
|
709 |
|
710 | return self.writer.writelines(list)
|
711 |
|
712 | def reset(self):
|
713 |
|
714 | self.reader.reset()
|
715 | self.writer.reset()
|
716 |
|
717 | def seek(self, offset, whence=0):
|
718 | self.stream.seek(offset, whence)
|
719 | self.reader.reset()
|
720 | if whence == 0 and offset == 0:
|
721 | self.writer.reset()
|
722 |
|
723 | def __getattr__(self, name,
|
724 | getattr=getattr):
|
725 |
|
726 | """ Inherit all other methods from the underlying stream.
|
727 | """
|
728 | return getattr(self.stream, name)
|
729 |
|
730 | # these are needed to make "with codecs.open(...)" work properly
|
731 |
|
732 | def __enter__(self):
|
733 | return self
|
734 |
|
735 | def __exit__(self, type, value, tb):
|
736 | self.stream.close()
|
737 |
|
738 | ###
|
739 |
|
740 | class StreamRecoder:
|
741 |
|
742 | """ StreamRecoder instances provide a frontend - backend
|
743 | view of encoding data.
|
744 |
|
745 | They use the complete set of APIs returned by the
|
746 | codecs.lookup() function to implement their task.
|
747 |
|
748 | Data written to the stream is first decoded into an
|
749 | intermediate format (which is dependent on the given codec
|
750 | combination) and then written to the stream using an instance
|
751 | of the provided Writer class.
|
752 |
|
753 | In the other direction, data is read from the stream using a
|
754 | Reader instance and then return encoded data to the caller.
|
755 |
|
756 | """
|
757 | # Optional attributes set by the file wrappers below
|
758 | data_encoding = 'unknown'
|
759 | file_encoding = 'unknown'
|
760 |
|
761 | def __init__(self, stream, encode, decode, Reader, Writer,
|
762 | errors='strict'):
|
763 |
|
764 | """ Creates a StreamRecoder instance which implements a two-way
|
765 | conversion: encode and decode work on the frontend (the
|
766 | input to .read() and output of .write()) while
|
767 | Reader and Writer work on the backend (reading and
|
768 | writing to the stream).
|
769 |
|
770 | You can use these objects to do transparent direct
|
771 | recodings from e.g. latin-1 to utf-8 and back.
|
772 |
|
773 | stream must be a file-like object.
|
774 |
|
775 | encode, decode must adhere to the Codec interface, Reader,
|
776 | Writer must be factory functions or classes providing the
|
777 | StreamReader, StreamWriter interface resp.
|
778 |
|
779 | encode and decode are needed for the frontend translation,
|
780 | Reader and Writer for the backend translation. Unicode is
|
781 | used as intermediate encoding.
|
782 |
|
783 | Error handling is done in the same way as defined for the
|
784 | StreamWriter/Readers.
|
785 |
|
786 | """
|
787 | self.stream = stream
|
788 | self.encode = encode
|
789 | self.decode = decode
|
790 | self.reader = Reader(stream, errors)
|
791 | self.writer = Writer(stream, errors)
|
792 | self.errors = errors
|
793 |
|
794 | def read(self, size=-1):
|
795 |
|
796 | data = self.reader.read(size)
|
797 | data, bytesencoded = self.encode(data, self.errors)
|
798 | return data
|
799 |
|
800 | def readline(self, size=None):
|
801 |
|
802 | if size is None:
|
803 | data = self.reader.readline()
|
804 | else:
|
805 | data = self.reader.readline(size)
|
806 | data, bytesencoded = self.encode(data, self.errors)
|
807 | return data
|
808 |
|
809 | def readlines(self, sizehint=None):
|
810 |
|
811 | data = self.reader.read()
|
812 | data, bytesencoded = self.encode(data, self.errors)
|
813 | return data.splitlines(1)
|
814 |
|
815 | def next(self):
|
816 |
|
817 | """ Return the next decoded line from the input stream."""
|
818 | data = self.reader.next()
|
819 | data, bytesencoded = self.encode(data, self.errors)
|
820 | return data
|
821 |
|
822 | def __iter__(self):
|
823 | return self
|
824 |
|
825 | def write(self, data):
|
826 |
|
827 | data, bytesdecoded = self.decode(data, self.errors)
|
828 | return self.writer.write(data)
|
829 |
|
830 | def writelines(self, list):
|
831 |
|
832 | data = ''.join(list)
|
833 | data, bytesdecoded = self.decode(data, self.errors)
|
834 | return self.writer.write(data)
|
835 |
|
836 | def reset(self):
|
837 |
|
838 | self.reader.reset()
|
839 | self.writer.reset()
|
840 |
|
841 | def __getattr__(self, name,
|
842 | getattr=getattr):
|
843 |
|
844 | """ Inherit all other methods from the underlying stream.
|
845 | """
|
846 | return getattr(self.stream, name)
|
847 |
|
848 | def __enter__(self):
|
849 | return self
|
850 |
|
851 | def __exit__(self, type, value, tb):
|
852 | self.stream.close()
|
853 |
|
854 | ### Shortcuts
|
855 |
|
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: the wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs, and its output is codec dependent as well (usually
        Unicode too).

        Files are always opened in binary mode (a 'b' is appended to
        mode when missing), even if no binary mode was specified.
        This avoids data loss with encodings that use 8-bit values.
        The default mode 'rb' therefore means binary read.

        encoding specifies the encoding which is to be used for the
        file.

        errors defines the error handling; the default 'strict'
        causes ValueErrors to be raised in case an encoding error
        occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which records the encoding used. This attribute is
        only available if an encoding was specified as parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    info = lookup(encoding)
    wrapped = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
    # Add attributes to simplify introspection
    wrapped.encoding = encoding
    return wrapped
|
904 |
|
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors defines the error handling; the default 'strict'
        causes ValueErrors to be raised in case an encoding error
        occurs.

        The returned wrapped file object provides two extra
        attributes, .data_encoding and .file_encoding, which reflect
        the given parameters of the same name and can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    recoder = StreamRecoder(file, data_info.encode, data_info.decode,
                            file_info.streamreader, file_info.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
|
940 |
|
941 | ### Helpers for codec lookup
|
942 |
|
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
|
952 |
|
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
|
962 |
|
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    codec_info = lookup(encoding)
    if codec_info.incrementalencoder is None:
        # The codec exists but offers no incremental interface
        raise LookupError(encoding)
    return codec_info.incrementalencoder
|
976 |
|
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    codec_info = lookup(encoding)
    if codec_info.incrementaldecoder is None:
        # The codec exists but offers no incremental interface
        raise LookupError(encoding)
    return codec_info.incrementaldecoder
|
990 |
|
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
|
1000 |
|
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
|
1010 |
|
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Flush whatever state the encoder is still holding
    tail = encoder.encode("", True)
    if tail:
        yield tail
|
1028 |
|
1029 | def iterdecode(iterator, encoding, errors='strict', **kwargs):
|
1030 | """
|
1031 | Decoding iterator.
|
1032 |
|
1033 | Decodes the input strings from the iterator using an IncrementalDecoder.
|
1034 |
|
1035 | errors and kwargs are passed through to the IncrementalDecoder
|
1036 | constructor.
|
1037 | """
|
1038 | decoder = getincrementaldecoder(encoding)(errors, **kwargs)
|
1039 | for input in iterator:
|
1040 | output = decoder.decode(input)
|
1041 | if output:
|
1042 | yield output
|
1043 | output = decoder.decode("", True)
|
1044 | if output:
|
1045 | yield output
|
1046 |
|
1047 | ### Helpers for charmap-based codecs
|
1048 |
|
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    # Single pass over rng, so it also works for one-shot iterators
    return dict((elem, elem) for elem in rng)
|
1061 |
|
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for decoded, encoded in decoding_map.items():
        if encoded in encoding_map:
            # Ambiguous reverse mapping -> mark the target as undefined
            encoding_map[encoded] = None
        else:
            encoding_map[encoded] = decoded
    return encoding_map
|
1082 |
|
1083 | ### error handlers
|
1084 |
|
try:
    # Resolve the standard error handlers once at import time so they
    # are available directly as module attributes.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # --disable-unicode builds ship without these error handlers
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
|
1098 |
|
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed at run-time (_false is 0); the guarded import only
    # exists so static dependency scanners such as modulefinder record
    # that this module relies on the encodings package.
    import encodings
|
1104 |
|
1105 | ### Tests
|
1106 |
|
if __name__ == '__main__':

    # Wrap stdin: file-side Latin-1 input is handed to the program
    # as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')

    # Wrap stdout: Latin-1 data written by the program is emitted
    # as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
|