2014-11-24 16:42:30 -05:00
|
|
|
|
# coding: utf-8
|
|
|
|
|
r"""
|
|
|
|
|
Decodes single-byte encodings, filling their "holes" in the same messy way that
|
|
|
|
|
everyone else does.
|
|
|
|
|
|
|
|
|
|
A single-byte encoding maps each byte to a Unicode character, except that some
|
|
|
|
|
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
|
|
|
|
|
example, bytes 0x81 and 0x8D, among others, have no meaning.
|
|
|
|
|
|
|
|
|
|
Python, wanting to preserve some sense of decorum, will handle these bytes
|
|
|
|
|
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
|
|
|
|
|
different from each other. It just hasn't defined what they are in terms of
|
|
|
|
|
Unicode.
|
|
|
|
|
|
|
|
|
|
Software that has to interoperate with Windows-1252 and Unicode -- such as all
|
|
|
|
|
the common Web browsers -- will pick some Unicode characters for them to map
|
|
|
|
|
to, and the characters they pick are the Unicode characters with the same
|
|
|
|
|
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
|
|
|
|
|
resulting characters tend to fall into a range of Unicode that's set aside for
|
|
|
|
|
obsolete Latin-1 control characters anyway.
|
|
|
|
|
|
|
|
|
|
These sloppy codecs let Python do the same thing, thus interoperating with
|
|
|
|
|
other software that works this way. This file defines a sloppy version of many
|
|
|
|
|
single-byte encodings with holes. (There is no need for a sloppy version of
|
|
|
|
|
an encoding without holes: for example, there is no such thing as
|
|
|
|
|
sloppy-iso-8859-2 or sloppy-macroman.)
|
|
|
|
|
|
|
|
|
|
The following encodings will become defined:
|
|
|
|
|
|
|
|
|
|
- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
|
|
|
|
|
- sloppy-windows-1251 (Cyrillic)
|
|
|
|
|
- sloppy-windows-1252 (Western European, based on Latin-1)
|
|
|
|
|
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
|
|
|
|
|
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
|
|
|
|
|
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
|
|
|
|
|
- sloppy-windows-1256 (Arabic)
|
|
|
|
|
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
|
|
|
|
|
- sloppy-windows-1258 (Vietnamese)
|
|
|
|
|
- sloppy-cp874 (Thai, based on ISO-8859-11)
|
|
|
|
|
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
|
|
|
|
|
- sloppy-iso-8859-6 (different Arabic)
|
|
|
|
|
- sloppy-iso-8859-7 (Greek)
|
|
|
|
|
- sloppy-iso-8859-8 (Hebrew)
|
|
|
|
|
- sloppy-iso-8859-11 (Thai)
|
|
|
|
|
|
|
|
|
|
Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
|
|
|
|
|
defined.
|
|
|
|
|
|
|
|
|
|
Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
|
|
|
|
|
the rest are rather uncommon.
|
|
|
|
|
|
|
|
|
|
Here are some examples, using `ftfy.explain_unicode` to illustrate how
|
|
|
|
|
sloppy-windows-1252 merges Windows-1252 with Latin-1:
|
|
|
|
|
|
|
|
|
|
>>> from ftfy import explain_unicode
|
|
|
|
|
>>> some_bytes = b'\x80\x81\x82'
|
|
|
|
|
>>> explain_unicode(some_bytes.decode('latin-1'))
|
|
|
|
|
U+0080 \x80 [Cc] <unknown>
|
|
|
|
|
U+0081 \x81 [Cc] <unknown>
|
|
|
|
|
U+0082 \x82 [Cc] <unknown>
|
|
|
|
|
|
|
|
|
|
>>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
|
|
|
|
|
U+20AC € [Sc] EURO SIGN
|
|
|
|
|
U+FFFD <EFBFBD> [So] REPLACEMENT CHARACTER
|
|
|
|
|
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
|
|
|
|
|
|
|
|
|
|
>>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
|
|
|
|
|
U+20AC € [Sc] EURO SIGN
|
|
|
|
|
U+0081 \x81 [Cc] <unknown>
|
|
|
|
|
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
|
|
|
|
|
"""
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
import codecs
|
|
|
|
|
from encodings import normalize_encoding
|
|
|
|
|
|
|
|
|
|
# U+FFFD, the character that the 'replace' error handler puts in place of a
# byte it cannot decode. Comparing decoded output against it is how bytes that
# an encoding leaves unassigned are recognized.
REPLACEMENT_CHAR = '\ufffd'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_sloppy_codec(encoding):
    """
    Build the 'sloppy' counterpart of the single-byte codec named `encoding`.

    Every byte that `encoding` assigns is decoded exactly as `encoding`
    decodes it. Every byte that `encoding` leaves unassigned is decoded to the
    character Latin-1 gives it, which is the Unicode character with the same
    number as the byte. The encoding table is derived from that decoding
    table with `codecs.charmap_build`.

    The result is a `codecs.CodecInfo` named 'sloppy-' + `encoding`. It
    carries stateless encode/decode functions together with incremental and
    stream classes, all thin wrappers around `codecs.charmap_encode` and
    `codecs.charmap_decode`, arranged like the standard library's own charmap
    codecs (compare the Python source for `encodings.cp1252`).
    """
    # Decode all 256 byte values two ways: as Latin-1, which assigns every
    # byte, and as the requested encoding, where the 'replace' error handler
    # marks each unassigned byte with REPLACEMENT_CHAR.
    every_byte = bytearray(range(256))
    table_chars = list(every_byte.decode('latin-1'))
    strict_chars = every_byte.decode(encoding, 'replace')

    # Start from the Latin-1 table and overwrite each position that the real
    # encoding assigns. Positions that came back as REPLACEMENT_CHAR are
    # skipped, so they keep their Latin-1 character.
    for position, strict_char in enumerate(strict_chars):
        if strict_char == REPLACEMENT_CHAR:
            continue
        table_chars[position] = strict_char

    # The two lookup structures that the charmap functions work from.
    decoding_table = ''.join(table_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    def sloppy_encode(text, errors):
        # Returns the pair produced by `codecs.charmap_encode`.
        return codecs.charmap_encode(text, errors, encoding_table)

    def sloppy_decode(data, errors):
        # Returns the pair produced by `codecs.charmap_decode`.
        return codecs.charmap_decode(data, errors, decoding_table)

    # The classes below are the usual charmap-codec boilerplate. Parameter
    # names (including `input`) mirror the `codecs` base-class signatures.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return sloppy_encode(input, errors)

        def decode(self, input, errors='strict'):
            return sloppy_decode(input, errors)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            # Only the converted data is returned, not the consumed length.
            encoded, _consumed = sloppy_encode(input, self.errors)
            return encoded

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            # Only the converted data is returned, not the consumed length.
            decoded, _consumed = sloppy_decode(input, self.errors)
            return decoded

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    # The codec keeps no state, so a single instance can supply both of the
    # stateless entry points.
    stateless = Codec()
    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=stateless.encode,
        decode=stateless.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
|
|
|
|
|
|
|
|
|
|
# Build a sloppy codec for every encoding that leaves some bytes unassigned,
# and collect them in CODECS under their normalized 'sloppy-...' names. The
# main module of ftfy.bad_codecs can then look codecs up in this dictionary.
CODECS = {}

# The Windows code pages appear under both their 'windows-NNNN' and 'cpNNNN'
# spellings, so that each spelling gets its own entry in CODECS.
INCOMPLETE_ENCODINGS = []
INCOMPLETE_ENCODINGS += ['windows-%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)]
INCOMPLETE_ENCODINGS += ['cp%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS.append('cp874')

for _encoding in INCOMPLETE_ENCODINGS:
    # Keys are passed through `normalize_encoding` before being stored.
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)