opy/_regtest/src/codecs.py

OILS / opy / _regtest / src / codecs.py View on Github | oilshell.org

1113 lines, 428 significant

1	""" codecs -- Python Codec Registry, API and helpers.
2
3
4	Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8	"""#"
9
10	import __builtin__, sys
11
12	### Registry and builtin stateless codec functions
13
14	try:
15	from _codecs import *
16	except ImportError, why:
17	raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Names exported by "from codecs import *".
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # codec lookup helpers
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    # one-shot and iterator based conversion
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
32
### Constants

#
# Byte string values of the Byte Order Mark
# (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) for UTF-8, UTF-16 and
# UTF-32 output, in both little and big endian byte order.
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian / big endian
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian / big endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness variants, selected by the byte order of this machine
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): the "32" names alias the
# UTF-16 marks and the "64" names alias the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
77
78
79	### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Codec details returned when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter); the same four objects, the incremental
    encoder/decoder factories and the codec name are also available
    as attributes.
    """

    # Private API to allow Python to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                _is_text_encoding=None):
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode, info.decode = encode, decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Shadow the class-level default only when a value was given
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        fields = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % fields
109
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The predefined values are:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling; the default is
            'strict'.

            Implementations may not store state in the Codec
            instance; codecs that need state to encode efficiently
            should use StreamWriter instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling; the default is
            'strict'.

            Implementations may not store state in the Codec
            instance; codecs that need state to decode efficiently
            should use StreamReader instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
173
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.

    Input is handed piece by piece to encode(); the encoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module
        docstring for a list of possible values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
        return None
213
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    consume all of their input: whatever _buffer_encode() leaves
    unconsumed is stored and prepended to the next call's input.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError()

    def encode(self, input, final=False):
        # Prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # Any false state (e.g. the 0 from getstate()) means "empty"
        self.buffer = state if state else ""
246
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.

    Input is handed piece by piece to decode(); the decoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
        return None
295
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is stored and prepended to the next call's input.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError()

    def decode(self, input, final=False):
        # Prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        state = (self.buffer, 0)
        return state

    def setstate(self, state):
        # only the buffered input is restored; the additional state
        # info is ignored
        self.buffer = state[0]
330
331	#
332	# The StreamWriter and StreamReader class provide generic working
333	# interfaces which can be used to implement new encoding submodules
334	# very easily. See encodings/utf_8.py for an example on how this is
335	# done.
336	#
337
class StreamWriter(Codec):

    """ Writer that passes objects through self.encode() and writes
        the encoded data to a wrapped stream.

        Attributes not defined here are looked up on the wrapped
        stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream, self.errors = stream, errors

    def write(self, object):

        """ Encode object and write the result to self.stream.
        """
        # The consumed length reported by encode() is not used
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and write the result to
            the stream using .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        return None

    def seek(self, offset, whence=0):

        """ Reposition the wrapped stream; the codec state is reset
            only for an absolute seek to position 0.
        """
        self.stream.seek(offset, whence)
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream
        self.stream.close()
408
409	###
410
class StreamReader(Codec):

    """ Reader that pulls data from a wrapped stream and decodes it
        with self.decode().

        decode() raises NotImplementedError here, so subclasses have
        to supply it. Attributes not defined here are looked up on
        the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Bytes read from the stream that have not been decoded yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # List of already split lines cached by readline(), or None.
        # readline() sets charbuffer to None while this list is in use.
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Placeholder that always raises NotImplementedError. """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            # (size is only compared against the buffer when chars is
            # negative)
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Retry with only the bytes before the error position;
                    # the error is swallowed only if that prefix decodes
                    # to more than one line.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is removed from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # Without an explicit size, start with 72 bytes per read
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # No line end found yet: double the read size for the next
            # attempt, as long as it is still below 8000
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # NOTE: unlike __init__ (which starts with a str), the character
        # buffer is reset to a unicode string here.
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        """ The reader is its own iterator (see next()). """
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Return the reader itself for use in a with statement. """
        return self

    def __exit__(self, type, value, tb):
        """ Close the wrapped stream when the with block is left. """
        self.stream.close()
650
651	###
652
653	class StreamReaderWriter:
654
655	""" StreamReaderWriter instances allow wrapping streams which
656	work in both read and write modes.
657
658	The design is such that one can use the factory functions
659	returned by the codec.lookup() function to construct the
660	instance.
661
662	"""
663	# Optional attributes set by the file wrappers below
664	encoding = 'unknown'
665
666	def __init__(self, stream, Reader, Writer, errors='strict'):
667
668	""" Creates a StreamReaderWriter instance.
669
670	stream must be a Stream-like object.
671
672	Reader, Writer must be factory functions or classes
673	providing the StreamReader, StreamWriter interface resp.
674
675	Error handling is done in the same way as defined for the
676	StreamWriter/Readers.
677
678	"""
679	self.stream = stream
680	self.reader = Reader(stream, errors)
681	self.writer = Writer(stream, errors)
682	self.errors = errors
683
684	def read(self, size=-1):
685
686	return self.reader.read(size)
687
688	def readline(self, size=None):
689
690	return self.reader.readline(size)
691
692	def readlines(self, sizehint=None):
693
694	return self.reader.readlines(sizehint)
695
696	def next(self):
697
698	""" Return the next decoded line from the input stream."""
699	return self.reader.next()
700
701	def __iter__(self):
702	return self
703
704	def write(self, data):
705
706	return self.writer.write(data)
707
708	def writelines(self, list):
709
710	return self.writer.writelines(list)
711
712	def reset(self):
713
714	self.reader.reset()
715	self.writer.reset()
716
717	def seek(self, offset, whence=0):
718	self.stream.seek(offset, whence)
719	self.reader.reset()
720	if whence == 0 and offset == 0:
721	self.writer.reset()
722
723	def __getattr__(self, name,
724	getattr=getattr):
725
726	""" Inherit all other methods from the underlying stream.
727	"""
728	return getattr(self.stream, name)
729
730	# these are needed to make "with codecs.open(...)" work properly
731
732	def __enter__(self):
733	return self
734
735	def __exit__(self, type, value, tb):
736	self.stream.close()
737
738	###
739
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encoder. size, if given, is passed on to
            the reader's readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it with the
            frontend encoder and return it split into lines (line ends
            are kept). sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and pass the result
            to the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the strings in list, decode the result with the
            frontend decoder and pass it to the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position and reset the codec
            buffers.

            Without this method a seek would fall through
            __getattr__ to the raw stream and leave the reader's
            buffered data in place, so later reads would return stale
            data. The handling mirrors StreamReaderWriter.seek(): the
            reader is always reset, the writer only for an absolute
            seek to position 0.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream
        self.stream.close()
853
854	### Shortcuts
855
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified, and any
        'U' flag is removed from mode. This is done to avoid data
        loss due to encodings using 8-bit values. The default file
        mode is 'rb' meaning to open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or building the wrapper fails, the
        already opened file is closed before the error is re-raised.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything: the handle must not leak
        # (e.g. on an unknown encoding) and the error is re-raised.
        stream.close()
        raise
904
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name (file_encoding after applying its
        default). The attributes can be used for introspection by
        Python programs.

    """
    backend_encoding = (data_encoding if file_encoding is None
                        else file_encoding)
    # The data (frontend) codec is looked up before the file codec
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
940
941	### Helpers for codec lookup
942
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
952
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
962
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
976
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
990
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
1000
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
1010
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from iterator to an IncrementalEncoder for
    encoding and yields each non-empty result, followed by whatever
    the encoder returns (if non-empty) when it is finally called
    with empty input and final=True.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encode = getincrementalencoder(encoding)(errors, **kwargs).encode
    for chunk in iterator:
        piece = encode(chunk)
        if piece:
            yield piece
    # Give the encoder the chance to emit pending output
    tail = encode("", True)
    if tail:
        yield tail
1028
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string from iterator to an IncrementalDecoder for
    encoding and yields each non-empty result, followed by whatever
    the decoder returns (if non-empty) when it is finally called
    with empty input and final=True.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decode = getincrementaldecoder(encoding)(errors, **kwargs).decode
    for chunk in iterator:
        piece = decode(chunk)
        if piece:
            yield piece
    # Give the decoder the chance to emit pending output
    tail = decode("", True)
    if tail:
        yield tail
1046
1047	### Helpers for charmap-based codecs
1048
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is a key mapped to itself.

    """
    return dict((item, item) for item in rng)
1061
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and becomes undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1082
1083	### error handlers
1084
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = (lookup_error("strict"),
                                 lookup_error("ignore"),
                                 lookup_error("replace"),
                                 lookup_error("xmlcharrefreplace"),
                                 lookup_error("backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handler are missing;
    # if any lookup fails, all five names are set to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1098
# Tell modulefinder that using codecs probably needs the encodings
# package. _false is 0, so the import below is never executed; it is
# only there to be seen by tools that scan the source for imports.
_false = 0
if _false:
    import encodings
1104
1105	### Tests
1106
if __name__ == '__main__':

    # Only runs when the module is executed as a script: both standard
    # streams are replaced by recoding wrappers created by EncodedFile.

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding 'latin-1', file_encoding 'utf-8')
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (data_encoding 'utf-8', file_encoding 'latin-1')
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')