OILS / opy / _regtest / src / codecs.py View on Github | oilshell.org

1113 lines, 428 significant
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
# The builtin _codecs extension module provides the registry primitives
# (register, lookup), the stateless encode()/decode() entry points and the
# error-handler registry (register_error, lookup_error).  Without it the
# interpreter cannot handle text at all, hence the hard SystemError.
try:
    from _codecs import *
except ImportError, why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Pick the native-endian marks once at import time.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code); kept only for backwards
# compatibility -- note they misleadingly alias the UTF-16/UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
77
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    Behaves as the 4-tuple (encode, decode, streamreader, streamwriter)
    for backwards compatibility, while also exposing the full set of
    codec entry points as named attributes.
    """

    # Private API used to blacklist the known non-Unicode codecs in the
    # standard library.  A more general mechanism to reliably distinguish
    # test encodings from other codecs will hopefully be defined for
    # Python 3.5.  See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                _is_text_encoding=None):
        # The tuple part carries only the four classic entries.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % (
            self.__class__.__module__, self.__class__.__name__,
            self.name, id(self))
109
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):
        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply; it defaults
            to 'strict' handling.

            Implementations must not store state in the Codec
            instance -- use StreamWriter for codecs that have to keep
            state in order to make encoding efficient -- and must be
            able to handle zero length input, returning an empty
            object of the output type in that case.
        """
        # Abstract: concrete codecs override this.
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot (Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot).

            errors defines the error handling to apply; it defaults
            to 'strict' handling.

            Implementations must not store state in the Codec
            instance -- use StreamReader for codecs that have to keep
            state in order to make decoding efficient -- and must be
            able to handle zero length input, returning an empty
            object of the output type in that case.
        """
        # Abstract: concrete codecs override this.
        raise NotImplementedError
173
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes
        by providing the errors keyword argument. See the module
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        # Abstract: concrete incremental encoders override this.
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """
        # Stateless base implementation: nothing to reset.

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        # 0 is the conventional "no state" marker.
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
        # Stateless base implementation: nothing to restore.
213
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Unencoded input that is kept between calls to encode().
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: it must encode input
        # and return an (output object, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever was left over from the previous call, then
        # hand the combined data to the concrete encoder.
        pending = self.buffer + input
        result, consumed = self._buffer_encode(pending, self.errors, final)
        # Anything the encoder did not consume is kept for next time.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as the conventional 0 state.
        return self.buffer or 0

    def setstate(self, state):
        # Accept the 0 produced by getstate() for an empty buffer.
        self.buffer = state or ""
246
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes
        by providing the errors keyword argument. See the module
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        # Abstract: concrete incremental decoders override this.
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """
        # Stateless base implementation: nothing to reset.

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
        # Stateless base implementation: nothing to restore.
295
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Undecoded input that is kept between calls to decode().
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: it must decode input
        # and return an (output object, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever was left over from the previous call, then
        # hand the combined data to the concrete decoder.
        pending = self.buffer + input
        result, consumed = self._buffer_decode(pending, self.errors, final)
        # Anything the decoder did not consume is kept for next time.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # The additional state info is always 0 for buffered decoders.
        return (self.buffer, 0)

    def setstate(self, state):
        # The additional state info is ignored; only the buffer matters.
        self.buffer = state[0]
330
331#
332# The StreamWriter and StreamReader class provide generic working
333# interfaces which can be used to implement new encoding submodules
334# very easily. See encodings/utf_8.py for an example on how this is
335# done.
336#
337
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Writes the object's contents encoded to self.stream. """
        # The consumed-length part of the codec result is not needed here.
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):
        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):
        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        # The base implementation keeps no state.
        pass

    def seek(self, offset, whence=0):
        # Repositioning to the very start invalidates any codec state.
        self.stream.seek(offset, whence)
        if offset == 0 and whence == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
408
409###
410
class StreamReader(Codec):

    # Buffering model: bytebuffer holds raw bytes read from the stream but
    # not yet decoded; charbuffer holds decoded characters not yet returned;
    # linebuffer (when not None) holds decoded text pre-split into lines by
    # an earlier readline() and takes precedence over charbuffer.

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Abstract: concrete stream readers supply the actual decoder.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Decode only up to the error; if that still yields at
                    # least one complete line, return it and keep the rest.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.
        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # No line end seen yet: grow the read size (capped) and retry.
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.
        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.
        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
650
651###
652
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.
    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # -- reading side: delegate to the wrapped StreamReader

    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def next(self):
        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    # -- writing side: delegate to the wrapped StreamWriter

    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # A reposition always invalidates the reader's buffers; the writer
        # only needs resetting when rewinding to the start.
        self.reader.reset()
        if offset == 0 and whence == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
737
738###
739
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.
    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # -- reading side: Reader decodes from the stream, then the frontend
    # -- encoder converts the intermediate data for the caller.

    def read(self, size=-1):
        decoded = self.reader.read(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):
        if size is None:
            decoded = self.reader.readline()
        else:
            decoded = self.reader.readline(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):
        decoded = self.reader.read()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded.splitlines(1)

    def next(self):
        """ Return the next decoded line from the input stream."""
        decoded = self.reader.next()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        return self

    # -- writing side: the frontend decoder converts the caller's data to
    # -- the intermediate form, which the Writer encodes onto the stream.

    def write(self, data):
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        joined = ''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
853
854### Shortcuts
855
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.
    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                # 'U' alone implied read mode.
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        # No codec requested: hand back the plain (binary-mode) file.
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
904
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            data_info.encode, data_info.decode,
                            file_info.streamreader, file_info.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
940
941### Helpers for codec lookup
942
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.encode
952
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.decode
962
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.
    """
    info = lookup(encoding)
    if info.incrementalencoder is None:
        # The codec exists but has no incremental support.
        raise LookupError(encoding)
    return info.incrementalencoder
976
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.
    """
    info = lookup(encoding)
    if info.incrementaldecoder is None:
        # The codec exists but has no incremental support.
        raise LookupError(encoding)
    return info.incrementaldecoder
990
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.streamreader
1000
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    info = lookup(encoding)
    return info.streamwriter
1010
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Flush whatever the encoder is still buffering.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1028
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Flush whatever the decoder is still buffering.
    tail = decoder.decode("", True)
    if tail:
        yield tail
1046
1047### Helpers for charmap-based codecs
1048
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.
    """
    return {element: element for element in rng}
1061
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.
    """
    m = {}
    for k, v in decoding_map.items():
        # Idiomatic membership test (was "not v in m"); a duplicate
        # decoding target makes the reverse mapping ambiguous, so it is
        # deliberately poisoned with None.
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
1082
### error handlers

# Convenience aliases for the registered error-handler callables, so
# callers can pass e.g. codecs.strict_errors instead of the string name.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package (the import is never executed; _false is always 0).
_false = 0
if _false:
    import encodings
1104
### Tests

# Smoke-test/demo: rewire stdio through EncodedFile wrappers.
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')