| 1 | """ Standard "encodings" Package
 | 
| 2 | 
 | 
| 3 |     Standard Python encoding modules are stored in this package
 | 
| 4 |     directory.
 | 
| 5 | 
 | 
| 6 |     Codec modules must have names corresponding to normalized encoding
 | 
| 7 |     names as defined in the normalize_encoding() function below, e.g.
 | 
| 8 |     'utf-8' must be implemented by the module 'utf_8.py'.
 | 
| 9 | 
 | 
| 10 |     Each codec module must export the following interface:
 | 
| 11 | 
 | 
| 12 |     * getregentry() -> codecs.CodecInfo object
 | 
| 13 |     The getregentry() API must return a CodecInfo object with encoder, decoder,
 | 
| 14 |     incrementalencoder, incrementaldecoder, streamwriter and streamreader
 | 
| 15 |     attributes which adhere to the Python Codec Interface Standard.
 | 
| 16 | 
 | 
| 17 |     In addition, a module may optionally also define the following
 | 
| 18 |     APIs which are then used by the package's codec search function:
 | 
| 19 | 
 | 
| 20 |     * getaliases() -> sequence of encoding name strings to use as aliases
 | 
| 21 | 
 | 
| 22 |     Alias names returned by getaliases() must be normalized encoding
 | 
| 23 |     names as defined by normalize_encoding().
 | 
| 24 | 
 | 
| 25 | Written by Marc-Andre Lemburg (mal@lemburg.com).
 | 
| 26 | 
 | 
| 27 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 | 
| 28 | 
 | 
| 29 | """#"
 | 
| 30 | 
 | 
| 31 | import codecs
 | 
| 32 | from encodings import aliases
 | 
| 33 | import __builtin__
 | 
| 34 | 
 | 
# Results of search_function(), keyed by the encoding name exactly as it
# was passed in.  Failed lookups are stored as None.
_cache = {}

# Sentinel used to tell "not cached yet" apart from a cached None.
_unknown = '--unknown--'

# Non-empty fromlist handed to __import__() by search_function(), so that
# the codec submodule itself is returned rather than the top-level package.
_import_tail = ['*']

# 256-entry translation table for the 8-bit str.translate() call in
# normalize_encoding(): the dot, the ASCII digits and the ASCII letters
# map to themselves, every other byte value maps to a space.
_norm_encoding_map = (
    ' ' * 46 + '.' + ' ' +                      # 0x00-0x2f
    '0123456789' + ' ' * 7 +                    # 0x30-0x40
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + ' ' * 5 +    # 0x41-0x5f
    ' ' + 'abcdefghijklmnopqrstuvwxyz' +        # 0x60-0x7a
    ' ' * 133                                   # 0x7b-0xff
)

# Alias table shared with the encodings.aliases module; search_function()
# adds entries to it for codec modules that define getaliases().
_aliases = aliases.aliases
 | 
| 45 | 
 | 
class CodecRegistryError(LookupError, SystemError):

    """ Error raised by search_function() when a codec module was found
        but the value returned by its getregentry() cannot be used as a
        codec registry entry.

    """
 | 
| 48 | 
 | 
def normalize_encoding(encoding):

    """ Return the normalized form of an encoding name.

        Every run of characters other than ASCII letters, ASCII digits
        and the dot (kept because it is used in Python package names)
        is replaced by one underscore, e.g. '  -;#' becomes '_'.
        Such runs at the very start or end of the name are dropped, so
        the result never has leading or trailing underscores.

        Encoding names should be ASCII only; Unicode names are
        converted with the Latin-1 codec first, so any non-ASCII
        characters in them must be Latin-1 compatible.

    """
    # The translation table only works with the 8-bit string flavour of
    # .translate(), so Unicode names are converted to 8-bit strings first.
    unicode_type = getattr(__builtin__, "unicode", None)
    if unicode_type is not None and isinstance(encoding, unicode_type):
        # .encode('latin-1') does *not* go through the codec registry,
        # so this call cannot recurse back into the search function.
        # (See unicodeobject.c PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')

    # Everything that is not alphanumeric or a dot is now a space;
    # split() collapses the runs and discards the ones at either end.
    words = encoding.translate(_norm_encoding_map).split()
    return '_'.join(words)
 | 
| 70 | 
 | 
def search_function(encoding):

    """ Codec search function of the encodings package.

        Finds the codec module for the given encoding name inside the
        'encodings' package and returns its registry entry.  The module
        name is taken from the alias table first and from the normalized
        encoding name second.

        The outcome is cached under the encoding name as passed in,
        including unsuccessful lookups.  If the module defines
        getaliases(), the returned names are added to the alias table
        unless they are already registered.

        Returns a codecs.CodecInfo object, or None if no codec module
        was found for the encoding.

        Raises CodecRegistryError if the value returned by the module's
        getregentry() is neither a CodecInfo object nor a sequence of 4
        to 7 items with callable (or, for the optional slots, None)
        codec entries.

    """
    # Cache lookup.  None is a valid cached value (a known miss), which
    # is why a separate sentinel marks "not looked up yet".
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the normalized name itself.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = (_aliases.get(norm_encoding) or
                        _aliases.get(norm_encoding.replace('.', '_')))
    if aliased_encoding is not None:
        modnames = [aliased_encoding, norm_encoding]
    else:
        modnames = [norm_encoding]

    mod = None
    for modname in modnames:
        # Empty names and dotted names are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (or no module was imported at all)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError(
                'module "%s" (%s) failed to register' %
                (mod.__name__, mod.__file__))

        # Slots 0 and 1 must be callable; slots 2 to 5 may also be None
        # (slots 4 and 5 may be missing altogether).
        for index, item in enumerate(entry[:6]):
            if hasattr(item, '__call__'):
                continue
            if index < 2 or item is not None:
                raise CodecRegistryError(
                    'incompatible codecs in module "%s" (%s)' %
                    (mod.__name__, mod.__file__))

        if len(entry) < 7 or entry[6] is None:
            # Pad the missing optional slots with None and use the module
            # name (without the package prefix) as codec name.  The entry
            # is rebuilt from its first six slots so that an explicit None
            # in the name slot is replaced instead of being followed by an
            # eighth item.
            default_name = mod.__name__.split(".", 1)[1]
            entry = (tuple(entry[:6]) + (None,) * (6 - len(entry)) +
                     (default_name,))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
 | 
| 155 | 
 | 
# Register search_function with the Python codec registry.  This runs as
# a side effect of importing the package.
codecs.register(search_function)
 |