1
0
mirror of https://github.com/moparisthebest/SickRage synced 2025-01-06 03:18:01 -05:00

Added FTFY module to help with any encoding/decoding issues

This commit is contained in:
echel0n 2014-11-24 13:42:30 -08:00
parent f73aee78cc
commit 6a140aa907
22 changed files with 2039 additions and 36 deletions

351
lib/ftfy/__init__.py Normal file
View File

@ -0,0 +1,351 @@
# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you
This is a module for making text less broken. See the `fix_text` function
for more information.
"""
from __future__ import unicode_literals
# See the docstring for ftfy.bad_codecs to see what we're doing here.
import ftfy.bad_codecs
ftfy.bad_codecs.ok()
from ftfy import fixes
from ftfy.fixes import fix_text_encoding
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
import unicodedata
import warnings
def fix_text(text,
             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True,
             max_decode_length=2**16):
    r"""
    Given Unicode text as input, make its representation consistent and
    possibly less broken.

    Let's start with some examples:

        >>> print(fix_text('Ã¼nicode'))
        ünicode

        >>> print(fix_text('Broken text&hellip; it&#x2019;s flubberific!'))
        Broken text... it's flubberific!

        >>> print(fix_text('HTML entities &lt;3'))
        HTML entities <3

        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
        <em>HTML entities &lt;3</em>

        >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
        ...               'doo&#133;\033[0m'))
        I'm blue, da ba dee da ba doo...

        >>> # This example string starts with a byte-order mark, even if
        >>> # you can't see it on the Web.
        >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
        Party like
        it's 1999!

        >>> len(fix_text('ﬁ' * 100000))
        200000

        >>> len(fix_text(''))
        0

    Based on the options you provide, ftfy applies these steps in order:

    - If `remove_unsafe_private_use` is True, it removes a range of
      private-use characters that could trigger a Python bug. The bug is
      fixed in the most recent versions of Python, so this defaults to
      False starting on Python 3.4.
    - If `fix_entities` is True, replace HTML entities with their equivalent
      characters. If it's "auto" (the default), then consider replacing HTML
      entities, but don't do so in text where you have seen a pair of actual
      angle brackets (that's probably actually HTML and you shouldn't mess
      with the entities). Once a line with both brackets has been seen,
      entities are left alone for every following line of the same call.
    - If `remove_terminal_escapes` is True, remove sequences of bytes that
      are instructions for Unix terminals, such as the codes that make text
      appear in different colors.
    - If `fix_encoding` is True, look for common mistakes that come from
      encoding or decoding Unicode text incorrectly, and fix them if they
      are reasonably fixable. See `fix_text_encoding` for details.
    - If `normalization` is not None, apply the specified form of Unicode
      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
      The default, 'NFKC', applies the following relevant transformations:

      - C: Combine characters and diacritics that are written using
        separate code points, such as converting "e" plus an acute accent
        modifier into "é", or converting "ka" (か) plus a dakuten into the
        single character "ga" (が).
      - K: Replace characters that are functionally equivalent with the
        most common form. For example, half-width katakana will be
        replaced with full-width versions, full-width Roman characters
        will be replaced with ASCII characters, ellipsis characters will
        be replaced with three periods, and the ligature 'ﬂ' will be
        replaced with 'fl'.

    - If `uncurl_quotes` is True, replace various curly quotation marks with
      plain-ASCII straight quotes.
    - If `fix_line_breaks` is true, convert all line breaks to Unix style
      (CRLF and CR line breaks become LF line breaks).
    - If `remove_control_chars` is true, remove all C0 control characters
      except the common useful ones: TAB, CR, LF, and FF. (CR characters
      may have already been removed by the `fix_line_breaks` step.)
    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
    - If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".

    `fix_text` works one line at a time, with the possibility that some
    lines are in different encodings. When it encounters lines longer than
    `max_decode_length`, it will not run the `fix_encoding` step on them, to
    avoid unbounded slowdowns.

    If you are certain your entire text is in the same encoding (though that
    encoding is possibly flawed), and do not mind performing operations on
    the whole text at once, use `fix_text_segment`.

    Raises `UnicodeError` when given bytes instead of Unicode text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    total_length = len(text)
    fixed_segments = []
    start = 0
    while start < total_length:
        # Each segment runs through the next newline (inclusive), or to the
        # end of the text when no newline remains.
        newline_index = text.find('\n', start)
        if newline_index == -1:
            end = total_length
        else:
            end = newline_index + 1

        # Over-long lines skip the (potentially slow) encoding fixer.
        if (end - start) > max_decode_length:
            segment_fix_encoding = False
        else:
            segment_fix_encoding = fix_encoding

        segment = text[start:end]
        if fix_entities == 'auto' and '<' in segment and '>' in segment:
            # Angle brackets appear together; this could be HTML, so stop
            # touching entities from here on.
            fix_entities = False

        fixed_segments.append(
            fix_text_segment(
                segment,
                remove_unsafe_private_use=remove_unsafe_private_use,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=segment_fix_encoding,
                normalization=normalization,
                uncurl_quotes=uncurl_quotes,
                fix_line_breaks=fix_line_breaks,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom
            )
        )
        start = end

    return ''.join(fixed_segments)


# Short alias, so that `ftfy.ftfy(text)` works.
ftfy = fix_text
def fix_file(input_file,
             remove_unsafe_private_use=True,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True):
    """
    Fix the text found in a file, yielding one fixed line at a time.

    `input_file` is iterated line by line. Lines that are already Unicode
    text are used as they are. Lines that arrive as bytes have to be decoded
    by guessing their encoding; see `guess_bytes` for how the guess is made
    (no promises).

    When `fix_entities` is 'auto', HTML entities stop being replaced from the
    first line that contains both '<' and '>' onward.

    The remaining options are passed to `fix_text_segment` unchanged; see
    `fix_text` for what they mean.
    """
    # The entity setting actually in effect; it can only ever switch from
    # 'auto' to False as lines are read.
    effective_entities = fix_entities
    for line in input_file:
        if isinstance(line, bytes):
            line, _guessed_encoding = guess_bytes(line)
        looks_like_html = '<' in line and '>' in line
        if fix_entities == 'auto' and looks_like_html:
            effective_entities = False
        yield fix_text_segment(
            line,
            remove_unsafe_private_use=remove_unsafe_private_use,
            fix_entities=effective_entities,
            remove_terminal_escapes=remove_terminal_escapes,
            fix_encoding=fix_encoding,
            normalization=normalization,
            uncurl_quotes=uncurl_quotes,
            fix_line_breaks=fix_line_breaks,
            remove_control_chars=remove_control_chars,
            remove_bom=remove_bom
        )
def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk.

    The chunk may be one line out of a larger run of `fix_text`, or a larger
    amount of text that you are certain is all in the same encoding.

    The enabled fixes are applied in a fixed order, and the whole sequence
    is repeated until a pass leaves the text unchanged.

    See `fix_text` for a description of the parameters. Raises
    `UnicodeError` when given bytes instead of Unicode text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        # Probably real HTML: leave its entities alone.
        fix_entities = False

    def normalize(chunk):
        return unicodedata.normalize(normalization, chunk)

    # Collect the enabled fixers, in the order they must run.
    steps = []
    if remove_unsafe_private_use:
        steps.append(fixes.remove_unsafe_private_use)
    if fix_entities:
        steps.append(fixes.unescape_html)
    if remove_terminal_escapes:
        steps.append(fixes.remove_terminal_escapes)
    if fix_encoding:
        steps.append(fixes.fix_text_encoding)
    if normalization is not None:
        steps.append(normalize)
    if uncurl_quotes:
        steps.append(fixes.uncurl_quotes)
    if fix_line_breaks:
        steps.append(fixes.fix_line_breaks)
    if remove_control_chars:
        steps.append(fixes.remove_control_chars)
    if remove_bom:
        steps.append(fixes.remove_bom)

    # Run the whole pipeline until it reaches a fixed point.
    while True:
        before = text
        for step in steps:
            text = step(text)
        if text == before:
            return text
def guess_bytes(bstring):
    """
    Decode bytes whose encoding is unknown, by trying a few common encodings
    that can be told apart from each other.

    Returns a pair `(decoded_text, encoding_name)`.

    This is not a magic bullet. If the bytes are coming from some MySQL
    database with the "character set" set to ISO Elbonian, this won't figure
    it out. Perhaps more relevantly, this currently doesn't try East Asian
    encodings.

    The encodings we try are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global de facto standard
    - "utf-8-variants", because it's what people actually implement when they
      think they're doing UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks
      in the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most
      common single-byte encoding
    """
    if bstring.startswith((b'\xfe\xff', b'\xff\xfe')):
        return bstring.decode('utf-16'), 'utf-16'

    # Unpacking a bytestring gives ints on Python 3 and one-character strings
    # on Python 2 -- the same kind of item that `set(bytes(...))` contains.
    marker_ed, marker_c0, carriage_return, line_feed = b'\xed\xc0\r\n'
    seen_bytes = set(bytes(bstring))

    # Byte 0xed can be used to encode a range of codepoints that are UTF-16
    # surrogates. UTF-8 does not use UTF-16 surrogates, so when we see 0xed,
    # it's very likely we're being asked to decode CESU-8, the variant that
    # encodes UTF-16 surrogates instead of the original characters
    # themselves.
    #
    # This will occasionally trigger on standard UTF-8, as there are some
    # Korean characters that also use byte 0xed, but that's not harmful.
    #
    # Byte 0xc0 is impossible because, numerically, it would only encode
    # characters lower than U+0040. Those already have single-byte
    # representations, and UTF-8 requires using the shortest possible
    # representation. However, Java hides the null codepoint, U+0000, in a
    # non-standard longer representation -- it encodes it as 0xc0 0x80
    # instead of 0x00, guaranteeing that 0x00 will never appear in the
    # encoded bytes.
    #
    # The 'utf-8-variants' decoder can handle both of these cases, as well
    # as standard UTF-8, at the cost of a bit of speed.
    if marker_ed in seen_bytes or marker_c0 in seen_bytes:
        unicode_codec = 'utf-8-variants'
    else:
        unicode_codec = 'utf-8'

    try:
        return bstring.decode(unicode_codec), unicode_codec
    except UnicodeDecodeError:
        # Not any flavor of UTF-8; fall through to single-byte encodings.
        pass

    # Classic Mac text uses bare CR line breaks, so CR without LF is the
    # tell-tale sign of MacRoman.
    if carriage_return in seen_bytes and line_feed not in seen_bytes:
        fallback_codec = 'macroman'
    else:
        fallback_codec = 'sloppy-windows-1252'
    return bstring.decode(fallback_codec), fallback_codec
def explain_unicode(text):
    """
    A utility method that's useful for debugging mysterious Unicode.

    For each codepoint in `text`, print one line showing its number in
    hexadecimal, its glyph (or an escape sequence when it is not printable),
    its category in the Unicode standard, and its name in the Unicode
    standard. Nothing is returned.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028 (       [Ps] LEFT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0 °       [So] DEGREE SIGN
        U+25A1 □       [So] WHITE SQUARE
        U+00B0 °       [So] DEGREE SIGN
        U+0029 )       [Pe] RIGHT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35 ︵       [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020         [Zs] SPACE
        U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    line_template = 'U+{code:04X} {display:<7} [{category}] {name}'
    for char in text:
        # Unprintable characters are shown as their escape sequence instead.
        if not is_printable(char):
            shown = char.encode('unicode-escape').decode('ascii')
        else:
            shown = char
        print(line_template.format(
            display=shown,
            code=ord(char),
            category=unicodedata.category(char),
            name=unicodedata.name(char, '<unknown>')
        ))
def fix_bad_encoding(text):
    """
    Deprecated name for `fix_text_encoding`, kept for compatibility with
    previous versions of ftfy.

    Emits a `DeprecationWarning` and then returns
    `fix_text_encoding(text)`.
    """
    warnings.warn(
        'fix_bad_encoding is now known as fix_text_encoding',
        DeprecationWarning,
        # Attribute the warning to the code that called this shim, so the
        # report points at the line that needs updating.
        stacklevel=2
    )
    return fix_text_encoding(text)

View File

@ -0,0 +1,94 @@
# coding: utf-8
r"""
Give Python the ability to decode some common, flawed encodings.
Python does not want you to be sloppy with your text. Its encoders and decoders
("codecs") follow the relevant standards whenever possible, which means that
when you get text that *doesn't* follow those standards, you'll probably fail
to decode it. Or you might succeed at decoding it for implementation-specific
reasons, which is perhaps worse.
There are some encodings out there that Python wishes didn't exist, which are
widely used outside of Python:
- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
ever-popular CESU-8 and "Java modified UTF-8".
- "Sloppy" versions of character map encodings, where bytes that don't map to
anything will instead map to the Unicode character with the same number.
Simply importing this module, or in fact any part of the `ftfy` package, will
make these new "bad codecs" available to Python through the standard Codecs
API. You never have to actually call any functions inside `ftfy.bad_codecs`.
However, if you want to call something because your code checker insists on it,
you can call ``ftfy.bad_codecs.ok()``.
A quick example of decoding text that's encoded in CESU-8:
>>> import ftfy.bad_codecs
>>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
😍
"""
from __future__ import unicode_literals
from encodings import normalize_encoding
import codecs
# Codecs that have already been looked up, keyed by the encoding name exactly
# as it was requested.
_CACHE = {}

# Define some aliases for 'utf-8-variants'. All hyphens get turned into
# underscores, because of `normalize_encoding`.
UTF8_VAR_NAMES = (
    'utf_8_variants', 'utf8_variants',
    'utf_8_variant', 'utf8_variant',
    'utf_8_var', 'utf8_var',
    'cesu_8', 'cesu8',
    'java_utf_8', 'java_utf8'
)


def search_function(encoding):
    """
    Look up one of our "bad codecs" by name, for Python's codecs API.

    This is the search function that gets registered with the codecs API: it
    takes in an encoding name, and returns a codec for that encoding if it
    knows one, or None if it doesn't.

    The encodings this will match are:

    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
      where the non-sloppy version is an encoding that leaves some bytes
      unmapped to characters.
    - The 'utf-8-variants' encoding, which has the several aliases seen
      above.

    Successful lookups are remembered in `_CACHE`; failed ones are not.
    """
    try:
        return _CACHE[encoding]
    except KeyError:
        pass

    normalized = normalize_encoding(encoding)
    # The codec modules are imported lazily, only when they are first needed.
    if normalized in UTF8_VAR_NAMES:
        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
        found = CODEC_INFO
    elif normalized.startswith('sloppy_'):
        from ftfy.bad_codecs.sloppy import CODECS
        found = CODECS.get(normalized)
    else:
        found = None

    if found is None:
        return None
    _CACHE[encoding] = found
    return found
def ok():
    """
    Do nothing; exists so there is something to call after importing this
    package.

    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
    and appear not to use it. It doesn't know that you're using it when
    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
    encodings.
    """
    return None
# Import-time side effect: hook `search_function` into Python's codec lookup,
# which is what makes the "bad codecs" available by name.
codecs.register(search_function)

View File

@ -0,0 +1,156 @@
# coding: utf-8
r"""
Decodes single-byte encodings, filling their "holes" in the same messy way that
everyone else does.
A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.
Python, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.
Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.
These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. It defines a sloppy version of many
single-byte encodings with holes. (There is no need for a sloppy version of
an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)
The following encodings will become defined:
- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)
Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.
Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
the rest are rather uncommon.
Here are some examples, using `ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:
>>> from ftfy import explain_unicode
>>> some_bytes = b'\x80\x81\x82'
>>> explain_unicode(some_bytes.decode('latin-1'))
U+0080 \x80 [Cc] <unknown>
U+0081 \x81 [Cc] <unknown>
U+0082 \x82 [Cc] <unknown>
>>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
U+20AC [Sc] EURO SIGN
U+FFFD <EFBFBD> [So] REPLACEMENT CHARACTER
U+201A [Ps] SINGLE LOW-9 QUOTATION MARK
>>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
U+20AC [Sc] EURO SIGN
U+0081 \x81 [Cc] <unknown>
U+201A [Ps] SINGLE LOW-9 QUOTATION MARK
"""
from __future__ import unicode_literals
import codecs
from encodings import normalize_encoding
# What a decoder in 'replace' mode produces for a byte it cannot map.
REPLACEMENT_CHAR = '\ufffd'


def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Every byte the real codec can decode keeps its usual meaning; every byte
    it cannot decode maps to the Unicode character with the same number, as
    it would in Latin-1.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes and returns them
    bundled in a `codecs.CodecInfo` named 'sloppy-' + `encoding`.
    """
    # All 256 possible byte values, in order.
    every_byte = bytearray(range(256))

    # Start from the Latin-1 reading of every byte...
    table_chars = list(every_byte.decode('latin-1'))

    # ...then overwrite each position the real encoding can decode. Bytes it
    # leaves unassigned come back as the replacement character, and those
    # positions keep their Latin-1 value.
    strict_chars = every_byte.decode(encoding, errors='replace')
    for index, strict_char in enumerate(strict_chars):
        if strict_char == REPLACEMENT_CHAR:
            continue
        table_chars[index] = strict_char

    # The data structures that tell the charmap functions how to decode and
    # encode in this sloppy encoding.
    decoding_table = ''.join(table_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # The class boilerplate below mirrors the standard library's
    # `encodings.cp1252` module.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            encoded, _length = codecs.charmap_encode(
                input, self.errors, encoding_table
            )
            return encoded

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            decoded, _length = codecs.charmap_decode(
                input, self.errors, decoding_table
            )
            return decoded

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    stateless = Codec()
    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=stateless.encode,
        decode=stateless.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
#
# CODECS maps the normalized sloppy name (as produced by `normalize_encoding`,
# e.g. 'sloppy_windows_1252') to the codec built by `make_sloppy_codec`.
CODECS = {}

# The encodings to wrap: the Windows code pages 1250-1258 under both their
# 'windows-NNNN' and 'cpNNNN' names, a handful of ISO-8859 parts, and cp874.
INCOMPLETE_ENCODINGS = (
    ['windows-%s' % num for num in range(1250, 1259)] +
    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
)

# Build every sloppy codec eagerly, when this module is imported.
for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)

View File

@ -0,0 +1,281 @@
r"""
This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
decode text that's been encoded with a popular non-standard version of UTF-8.
This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
codepoint 0.
This is particularly relevant in Python 3, which provides no other way of
decoding CESU-8 or Java's encoding. [1]
The easiest way to use the codec is to simply import `ftfy.bad_codecs`:
>>> import ftfy.bad_codecs
>>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
>>> print(repr(result).lstrip('u'))
'here comes a null! \x00'
The codec does not at all enforce "correct" CESU-8. For example, the Unicode
Consortium's not-quite-standard describing CESU-8 requires that there is only
one possible encoding of any character, so it does not allow mixing of valid
UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
decoder does.
Characters in the Basic Multilingual Plane still have only one encoding. This
codec still enforces the rule, within the BMP, that characters must appear in
their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
instead of just `0x00`, may be used to encode the null character `U+0000`, like
in Java.
If you encode with this codec, you get legitimate UTF-8. Decoding with this
codec and then re-encoding is not idempotent, although encoding and then
decoding is. So this module won't produce CESU-8 for you. Look for that
functionality in the sister module, "Breaks Text For You", coming approximately
never.
[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
decode the bytes (incorrectly), then encode them, then decode them again, using
UTF-8 as the codec every time.
"""
from __future__ import unicode_literals
from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
IncrementalEncoder as UTF8IncrementalEncoder)
import re
import codecs
# The canonical name this codec reports in its CodecInfo.
NAME = 'utf-8-variants'

# This regular expression matches all possible six-byte CESU-8 sequences:
# a three-byte sequence starting 0xed 0xa0-0xaf, immediately followed by a
# three-byte sequence starting 0xed 0xb0-0xbf.
CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')
class IncrementalDecoder(UTF8IncrementalDecoder):
    """
    An incremental decoder that extends Python's built-in UTF-8 decoder.

    This decoder needs to take in bytes, possibly arriving in a stream, and
    output the correctly decoded text. The general strategy for doing this
    is to fall back on the real UTF-8 decoder whenever possible, because
    the real UTF-8 decoder is way optimized, but to call specialized methods
    we define here for the cases the real decoder isn't expecting.
    """

    def _buffer_decode(self, input, errors, final):
        """
        Decode bytes that may be arriving in a stream, following the Codecs
        API.

        `input` is the incoming sequence of bytes. `errors` tells us how to
        handle errors, though we delegate all error-handling cases to the real
        UTF-8 decoder to ensure correct behavior. `final` indicates whether
        this is the end of the sequence, in which case we should raise an
        error given incomplete input.

        Returns as much decoded text as possible, and the number of bytes
        consumed.
        """
        # decoded_segments are the pieces of text we have decoded so far,
        # and position is our current position in the byte string. (Bytes
        # before this position have been consumed, and bytes after it have
        # yet to be decoded.)
        decoded_segments = []
        position = 0
        while True:
            # Use _buffer_decode_step to decode a segment of text.
            decoded, consumed = self._buffer_decode_step(
                input[position:],
                errors,
                final
            )
            if consumed == 0:
                # Either there's nothing left to decode, or we need to wait
                # for more input. Either way, we're done for now.
                break

            # Append the decoded text to the list, and update our position.
            decoded_segments.append(decoded)
            position += consumed

        if final:
            # _buffer_decode_step must consume all the bytes when `final` is
            # true. This is an internal invariant, not input validation.
            assert position == len(input)

        return ''.join(decoded_segments), position

    def _buffer_decode_step(self, input, errors, final):
        """
        There are three possibilities for each decoding step:

        - Decode as much real UTF-8 as possible.
        - Decode a six-byte CESU-8 sequence at the current position.
        - Decode a Java-style null at the current position.

        This method figures out which step is appropriate, and does it.
        It returns the decoded text and the number of bytes consumed.
        """
        # Get a reference to the superclass method that we'll be using for
        # most of the real work.
        sup = UTF8IncrementalDecoder._buffer_decode

        # Find the next byte position that indicates a variant of UTF-8.
        # CESU-8 sequences always start with 0xed, and Java nulls always
        # start with 0xc0. (0xc0 is impossible in real UTF-8; 0xed also
        # starts some ordinary three-byte characters, which
        # _buffer_decode_surrogates hands back to the real decoder.)
        cutoff1 = input.find(b'\xed')
        cutoff2 = input.find(b'\xc0')

        # Set `cutoff` to whichever cutoff comes first.
        if cutoff1 != -1 and cutoff2 != -1:
            cutoff = min(cutoff1, cutoff2)
        elif cutoff1 != -1:
            cutoff = cutoff1
        elif cutoff2 != -1:
            cutoff = cutoff2
        else:
            # The entire input can be decoded as UTF-8, so just do so.
            return sup(input, errors, final)

        if cutoff1 == 0:
            # Decode a possible six-byte sequence starting with 0xed.
            return self._buffer_decode_surrogates(sup, input, errors, final)
        elif cutoff2 == 0:
            # Decode a possible two-byte sequence, 0xc0 0x80.
            return self._buffer_decode_null(sup, input, errors, final)
        else:
            # Decode the bytes up until the next weird thing as UTF-8.
            # Set final=True because 0xc0 and 0xed don't make sense in the
            # middle of a sequence, in any variant.
            return sup(input[:cutoff], errors, True)

    @staticmethod
    def _buffer_decode_null(sup, input, errors, final):
        """
        Decode the bytes 0xc0 0x80 as U+0000, like Java does.

        `input` starts with 0xc0. Returns the decoded text and the number of
        bytes consumed; consumes nothing when the byte after 0xc0 has not
        arrived yet.
        """
        nextbyte = input[1:2]
        if nextbyte == b'':
            if final:
                # We found 0xc0 at the end of the stream, which is an error.
                # Delegate to the superclass method to handle that error.
                return sup(input, errors, final)
            else:
                # We found 0xc0 and we don't know what comes next, so consume
                # no bytes and wait.
                return '', 0
        elif nextbyte == b'\x80':
            # We found the usual 0xc0 0x80 sequence, so decode it and consume
            # two bytes.
            return '\u0000', 2
        else:
            # We found 0xc0 followed by something else, which is an error.
            # Whatever should happen is equivalent to what happens when the
            # superclass is given just the byte 0xc0, with final=True.
            return sup(b'\xc0', errors, True)

    @staticmethod
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.

        `input` starts with 0xed. Returns the decoded text and the number of
        bytes consumed; consumes nothing when fewer than six bytes are
        available and more input may still arrive.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error.
                if PYTHON2 and len(input) >= 3:
                    # We can't trust Python 2 to raise an error when it's
                    # asked to decode a surrogate, so let's force the issue.
                    input = mangle_surrogates(input)
                # Hand over only the three-byte sequence that 0xed
                # introduces. The remaining bytes go back through
                # _buffer_decode_step, so a Java null (0xc0 0x80) that
                # follows is still decoded instead of being rejected by the
                # plain UTF-8 decoder.
                return sup(input[:3], errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass to decode as usual -- except
                # for working around the Python 2 discrepancy as before.
                if PYTHON2:
                    input = mangle_surrogates(input)
                return sup(input[:3], errors, False)
def mangle_surrogates(bytestring):
    """
    Replace UTF-8-encoded surrogates at the start of `bytestring` with bytes
    that are guaranteed not to decode. Only does anything on Python 2.

    When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
    it as an error (which it is). In 'replace' mode, it will decode as three
    replacement characters. But Python 2 will just output the surrogate
    codepoint.

    To ensure consistency between Python 2 and Python 3, and protect
    downstream applications from malformed strings, we turn each surrogate
    sequence in the run at the start of the string into the bytes `ff ff ff`,
    which we're *sure* won't decode, and which turn into three replacement
    characters in 'replace' mode. Everything after that run is returned
    untouched.
    """
    if not PYTHON2:
        # On Python 3, nothing needs to be done.
        return bytestring

    mangled_prefix = []
    remainder = bytestring
    # Walk over the leading three-byte sequences for as long as each one
    # decodes to a surrogate codepoint.
    while remainder.startswith(b'\xed') and len(remainder) >= 3:
        decoded = remainder[:3].decode('utf-8', 'replace')
        if not ('\ud800' <= decoded <= '\udfff'):
            break
        mangled_prefix.append(b'\xff\xff\xff')
        remainder = remainder[3:]
    return b''.join(mangled_prefix) + remainder
# The encoder is identical to UTF-8.
IncrementalEncoder = UTF8IncrementalEncoder


# Everything below here is boilerplate that matches the modules in the
# built-in `encodings` package.
def encode(input, errors='strict'):
    """
    Stateless encoder: encode all of `input` as standard UTF-8.

    Returns the encoded bytes and the number of characters consumed.
    """
    return IncrementalEncoder(errors).encode(input, final=True), len(input)


def decode(input, errors='strict'):
    """
    Stateless decoder: decode all of `input` as 'utf-8-variants'.

    Returns the decoded text and the number of bytes consumed. The input is
    always treated as complete (final=True).
    """
    return IncrementalDecoder(errors).decode(input, final=True), len(input)


class StreamWriter(codecs.StreamWriter):
    # `encode` is a plain Python function, so storing it directly on the class
    # would turn it into a bound method: `write()` would then call
    # encode(self, data, errors) and fail with a TypeError. `staticmethod`
    # keeps the (input, errors) signature that codecs.StreamWriter relies on.
    encode = staticmethod(encode)


class StreamReader(codecs.StreamReader):
    # Same reasoning as StreamWriter.encode: keep `decode` unbound.
    decode = staticmethod(decode)
# The bundle handed to Python's codec registry for the 'utf-8-variants'
# codec: the stateless functions, then the incremental and stream classes.
CODEC_INFO = codecs.CodecInfo(
    encode=encode,
    decode=decode,
    streamreader=StreamReader,
    streamwriter=StreamWriter,
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
    name=NAME,
)

144
lib/ftfy/badness.py Normal file
View File

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""
Heuristics to determine whether re-encoding text is actually making it
more reasonable.
"""
from __future__ import unicode_literals
from ftfy.chardata import chars_to_classes
import re
import unicodedata
# The following regex uses the mapping of character classes to ASCII
# characters defined in chardata.py and build_data.py:
#
# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
# ' ' = Whitespace (the key is a literal space character)
# o = Other
def _make_weirdness_regex():
"""
Creates a list of regexes that match 'weird' character sequences.
The more matches there are, the weirder the text is.
"""
groups = []
# Match lowercase letters that are followed by non-ASCII uppercase letters
groups.append('lA')
# Match diacritical marks, except when they modify a non-cased letter or
# another mark.
#
# You wouldn't put a diacritical mark on a digit or a space, for example.
# You might put it on a Latin letter, but in that case there will almost
# always be a pre-composed version, and we normalize to pre-composed
# versions first. The cases that can't be pre-composed tend to be in
# large scripts without case, which are in class C.
groups.append('[^CM]M')
# Match non-Latin characters adjacent to Latin characters.
#
# This is a simplification from ftfy version 2, which compared all
# adjacent scripts. However, the ambiguities we need to resolve come from
# encodings designed to represent Latin characters.
groups.append('[Ll][AaC]')
groups.append('[AaC][Ll]')
# Match C1 control characters, which are almost always the result of
# decoding Latin-1 that was meant to be Windows-1252.
groups.append('X')
# Match private use and unassigned characters.
groups.append('P')
groups.append('_')
# Match adjacent characters from any different pair of these categories:
# - Modifier marks (M)
# - Letter modifiers (m)
# - Miscellaneous numbers (N)
# - Symbols (0123)
exclusive_categories = 'MmN0123'
for cat1 in exclusive_categories:
others_range = ''.join(c for c in exclusive_categories if c != cat1)
groups.append('{cat1}[{others_range}]'.format(
cat1=cat1, others_range=others_range
))
regex = '|'.join('({0})'.format(group) for group in groups)
return re.compile(regex)
WEIRDNESS_RE = _make_weirdness_regex()
# A few characters are common ending punctuation that can show up at the end
# of a mojibake sequence. It's plausible that such a character could appear
# after an accented capital letter, for example, so we'll want to add a
# slight preference to leave these characters alone.
#
# The character class holds the ellipsis, the em and en dashes, and the
# right-hand (closing) single, double, and angle quotation marks.
#
# The match ends with a + so that we only give the bonus once for a
# consecutive sequence of these characters: each *run* is one match, and
# `sequence_weirdness` subtracts one point per match.
ENDING_PUNCT_RE = re.compile(
    '['
    '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
    '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
    '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
    '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
    ']+'
)
def sequence_weirdness(text):
    """
    Score how many unexpected characters or character sequences a text has.

    This metric is used to decide whether text should be re-decoded or left
    as it is.

    The text is first normalized to NFC, so that penalties for diacritical
    marks don't apply to characters that have a pre-composed form.

    Each of the following counts as one instance of weirdness:

    - A Latin lowercase letter followed by a non-Latin uppercase letter
    - Non-Latin characters next to Latin characters
    - Un-combined diacritical marks, unless they're stacking on non-cased
      letters (in languages that do that kind of thing a lot) or on other
      marks
    - Control characters
    - Private use and unassigned characters
    - Adjacent symbols from any different pair of these categories:

        - Modifier marks
        - Letter modifiers
        - Non-digit numbers
        - Symbols (including math and currency)

    Returns twice the number of weird instances, minus one for every run of
    common ending punctuation matched by `ENDING_PUNCT_RE`.
    """
    normalized = unicodedata.normalize('NFC', text)

    # The weirdness regex works on class letters, not on the characters.
    weird_matches = WEIRDNESS_RE.findall(chars_to_classes(normalized))

    # The punctuation regex works on the (normalized) characters themselves.
    punct_runs = ENDING_PUNCT_RE.findall(normalized)

    return 2 * len(weird_matches) - len(punct_runs)
def text_cost(text):
    """
    Return an overall cost for `text`; lower is better.

    Weirder is worse, but all else being equal, shorter strings are better:
    the cost is the "weirdness" (see :func:`sequence_weirdness`) plus the
    length of the text.
    """
    weirdness = sequence_weirdness(text)
    length = len(text)
    return weirdness + length

111
lib/ftfy/build_data.py Normal file
View File

@ -0,0 +1,111 @@
"""
A script to make the char_classes.dat file.
This never needs to run in normal usage. It needs to be run if the character
classes we care about change, or if a new version of Python supports a new
Unicode standard and we want it to affect our string decoding.
The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
You can certainly use it in earlier versions. This simply makes sure that we
get consistent results from running ftfy on different versions of Python.
The file will be written to the current directory.
"""
from __future__ import unicode_literals
import unicodedata
import sys
import zlib
if sys.hexversion >= 0x03000000:
unichr = chr
# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
# ' ' (a space) = Whitespace
# o = Other
def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    The file holds one ASCII class letter per Unicode code point (U+0000
    through U+10FFFF), compressed with zlib.

    If you run this, run it in Python 3.3 or later. It will run in earlier
    versions, but you won't get the current Unicode standard, leading to
    inconsistent behavior. To protect against this, running this in the
    wrong version of Python will raise a RuntimeError unless you pass
    `do_it_anyway=True`.

    Raises ValueError if a letter has a general category other than
    Lu, Lt, Ll, Lo, or Lm.
    """
    if sys.hexversion < 0x03030000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.3 or later."
        )

    cclasses = [None] * 0x110000
    for codepoint in range(0x0, 0x110000):
        char = unichr(codepoint)
        category = unicodedata.category(char)

        if category.startswith('L'):  # letters
            # Pass a default so that a letter whose name is missing from
            # this Python's Unicode database is treated as non-Latin instead
            # of aborting the whole build with a ValueError.
            is_latin = unicodedata.name(char, '').startswith('LATIN')
            if is_latin and codepoint < 0x200:
                if category == 'Lu':
                    cclasses[codepoint] = 'L'
                else:
                    cclasses[codepoint] = 'l'
            else:  # non-Latin letter, or close enough
                if category == 'Lu' or category == 'Lt':
                    cclasses[codepoint] = 'A'
                elif category == 'Ll':
                    cclasses[codepoint] = 'a'
                elif category == 'Lo':
                    cclasses[codepoint] = 'C'
                elif category == 'Lm':
                    cclasses[codepoint] = 'm'
                else:
                    raise ValueError('got some weird kind of letter')
        elif category.startswith('M'):  # marks
            cclasses[codepoint] = 'M'
        elif category == 'No':
            cclasses[codepoint] = 'N'
        elif category == 'Sm':
            cclasses[codepoint] = '0'
        elif category == 'Sc':
            cclasses[codepoint] = '1'
        elif category == 'Sk':
            cclasses[codepoint] = '2'
        elif category == 'So':
            cclasses[codepoint] = '3'
        elif category == 'Cn':
            cclasses[codepoint] = '_'
        elif category == 'Cc':
            cclasses[codepoint] = 'X'
        elif category == 'Cs':
            cclasses[codepoint] = 'S'
        elif category == 'Co':
            cclasses[codepoint] = 'P'
        elif category.startswith('Z'):
            cclasses[codepoint] = ' '
        else:
            cclasses[codepoint] = 'o'

    # Tab, line feed, form feed and carriage return are category Cc, but we
    # want them classified as whitespace rather than as control characters.
    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '

    # The context manager closes the file even if compressing or writing
    # fails part-way through.
    with open('char_classes.dat', 'wb') as out:
        out.write(zlib.compress(''.join(cclasses).encode('ascii')))
if __name__ == '__main__':
make_char_data_file()

BIN
lib/ftfy/char_classes.dat Normal file

Binary file not shown.

81
lib/ftfy/chardata.py Normal file
View File

@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""
from __future__ import unicode_literals
import re
import zlib
from pkg_resources import resource_string
from ftfy.compatibility import unichr
# These are the five encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
'latin-1',
'sloppy-windows-1252',
'macroman',
'cp437',
'sloppy-windows-1251',
]
def _build_regexes():
    """
    Build the table of regexes used to test whether a string could be
    represented in a given encoding.

    The result maps an encoding name to a compiled regex that matches only
    strings made entirely of characters available in that encoding. The
    simplest entry is the 'ascii' detector, which just determines whether
    all characters are between U+0000 and U+007F. There is one more entry
    for each name in CHARMAP_ENCODINGS.
    """
    # Start with the regex that matches ASCII text.
    regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    # The bytes 0x80-0xff, obtained by encoding U+0080..U+00FF as Latin-1.
    # This does not depend on the encoding, so it is built only once.
    high_bytes = ''.join(unichr(i) for i in range(128, 256)).encode('latin-1')

    for encoding in CHARMAP_ENCODINGS:
        # What those 128 bytes mean in this particular character set.
        charlist = high_bytes.decode(encoding)

        # The character class is the ASCII range followed by the decodings
        # of bytes 0x80-0xff. (This uses the fact that all regex special
        # characters are ASCII, and therefore won't appear in `charlist`.)
        regexes[encoding] = re.compile('^[\x00-\x7f{}]*$'.format(charlist))

    return regexes


ENCODING_REGEXES = _build_regexes()
def possible_encoding(text, encoding):
    """
    Check whether `text` could have been decoded from the single-byte
    encoding named `encoding`.

    In other words, check whether it can be encoded in that encoding,
    possibly sloppily. `encoding` must be a key of ENCODING_REGEXES;
    any other name raises KeyError.

    Returns True or False.
    """
    detector = ENCODING_REGEXES[encoding]
    return detector.match(text) is not None
# The character-class table used by `chars_to_classes`, loaded at import time
# from the zlib-compressed resource 'char_classes.dat' that ships alongside
# this module and decoded as ASCII. See build_data.py for how that file is
# generated.
CHAR_CLASS_STRING = zlib.decompress(
    resource_string(__name__, 'char_classes.dat')
).decode('ascii')
def chars_to_classes(string):
    """
    Replace every Unicode character in `string` with the letter that names
    its character class.

    The lookup table is CHAR_CLASS_STRING. See build_data.py for where this
    data comes from and what each class letter means.
    """
    class_letters = string.translate(CHAR_CLASS_STRING)
    return class_letters
# A translate mapping (code point -> None, meaning "delete") that strips
# control characters which have no business appearing in text:
#
#   - every C0 control character (U+0000 to U+001F) except the ones that
#     represent whitespace
#   - DELETE (U+007F), which `ftfy.fixes.remove_control_chars` documents as
#     removed and which would otherwise be left in place
CONTROL_CHARS = dict.fromkeys(range(32))
CONTROL_CHARS[0x7f] = None

# Keep the whitespace control characters (tab, line feed, form feed and
# carriage return) by dropping them from the mapping.
for char in '\t\n\f\r':
    del CONTROL_CHARS[ord(char)]

34
lib/ftfy/cli.py Normal file
View File

@ -0,0 +1,34 @@
"""
A simple command-line utility for fixing text found in a file.
Because files do not come with their encoding marked, it first runs the file
through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
"""
from ftfy import fix_file
import sys
ENCODE_STDOUT = (sys.hexversion < 0x03000000)
def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)

    Takes one command-line argument, the name of the file to fix, and writes
    each line produced by `fix_file` to standard output. When ENCODE_STDOUT
    is true, lines are encoded as UTF-8 before being written.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='file to transcode')
    args = parser.parse_args()

    # Open the input in a `with` block so that it is closed even if fixing
    # or writing a line raises. The block encloses the whole loop, so the
    # file stays open for as long as `fix_file` is still being iterated.
    with open(args.filename) as infile:
        for line in fix_file(infile):
            if ENCODE_STDOUT:
                sys.stdout.write(line.encode('utf-8'))
            else:
                sys.stdout.write(line)
if __name__ == '__main__':
main()

79
lib/ftfy/compatibility.py Normal file
View File

@ -0,0 +1,79 @@
"""
Makes some function names and behavior consistent between Python 2 and
Python 3, and also between narrow and wide builds.
"""
from __future__ import unicode_literals
import sys
import re
import unicodedata
# Give the names `entities`, `unichr`, and `xrange` the same meaning on
# Python 2 and Python 3, and record which major version we are running on.
if sys.hexversion >= 0x03000000:
    # Python 3: the entity tables live in `html.entities`, and `chr` and
    # `range` stand in for `unichr` and `xrange`.
    from html import entities
    unichr = chr
    xrange = range
    PYTHON2 = False
else:
    # Python 2: the entity tables live in `htmlentitydefs`. The builtins
    # are re-bound to module-level names so they can be imported from here.
    import htmlentitydefs as entities
    unichr = unichr
    xrange = xrange
    PYTHON2 = True

# Expose the entity module under its Python 2 name on both versions.
htmlentitydefs = entities

# True on Python 3.4 and later.
PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)
def _narrow_unichr_workaround(codepoint):
"""
A replacement for unichr() on narrow builds of Python. This will get
us the narrow representation of an astral character, which will be
a string of length two, containing two UTF-16 surrogates.
"""
escaped = b'\\U%08x' % codepoint
return escaped.decode('unicode-escape')
# Choose, for this build of Python, the regex that matches the characters
# U+100000 to U+10FFFF, and on narrow builds also replace `unichr`.
if sys.maxunicode < 0x10000:
    # Narrow build: the builtin unichr() can't produce astral characters,
    # so use the surrogate-pair workaround instead.
    unichr = _narrow_unichr_workaround
    # In a narrow build of Python, we can't write a regex involving astral
    # characters. If we want to write the regex:
    #
    # [\U00100000-\U0010ffff]
    #
    # The actual string that defines it quietly turns into:
    #
    # [\udbc0\udc00-\udbff\udfff]
    #
    # And now the range operator only applies to the middle two characters.
    # It looks like a range that's going backwards from \udc00 to \udbff,
    # which is an error.
    #
    # What we can do instead is rewrite the expression to be _about_ the two
    # surrogates that make up the astral characters, instead of the characters
    # themselves. This would be wrong on a wide build, but it works on a
    # narrow build.
    UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
else:
    # Wide build: astral characters are single code points, so the range
    # can be written directly.
    UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')
def bytes_to_ints(bytestring):
    """
    Turn a bytestring into a sequence of integers, whatever version of
    Python this is.

    On Python 3 a 'bytes' object already _is_ a sequence of integers, so it
    is returned as it is. On Python 2 a new list of byte values is built.
    """
    if not PYTHON2:
        return bytestring
    return [ord(byte) for byte in bytestring]
def is_printable(char):
    """
    Report whether `char` is printable.

    str.isprintable() is new in Python 3, and it's useful in
    `explain_unicode`. On Python 2 we make a crude approximation instead:
    a character counts as printable unless its Unicode general category
    starts with 'C'.
    """
    if not PYTHON2:
        return char.isprintable()
    category = unicodedata.category(char)
    return not category.startswith('C')

473
lib/ftfy/fixes.py Normal file
View File

@ -0,0 +1,473 @@
# -*- coding: utf-8 -*-
"""
This module contains the individual fixes that the main fix_text function
can perform.
"""
from __future__ import unicode_literals
from ftfy.chardata import (possible_encoding,
CHARMAP_ENCODINGS, CONTROL_CHARS)
from ftfy.badness import text_cost
from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
import re
import sys
import codecs
# The message of the UnicodeError that `fix_one_step_and_explain` raises when
# it is handed bytes instead of Unicode text. The %d in the HOWTO link is
# filled in with the major version of the running Python.
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
ftfy is designed to fix problems that were introduced by handling Unicode
incorrectly. It might be able to fix the bytes you just handed it, but the
fact that you just gave a pile of bytes to a function that fixes text means
that your code is *also* handling Unicode incorrectly.
ftfy takes Unicode text as input. You should take these bytes and decode
them from the encoding you think they are in. If you're not sure what encoding
they're in:
- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
ftfy.guess_bytes. As the name implies, this may not always be accurate.
If you're confused by this, please read the Python Unicode HOWTO:
http://docs.python.org/%d/howto/unicode.html
""" % sys.version_info[0]
def fix_text_encoding(text):
    r"""
    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.

    Something you will find all over the place, in real-world text, is text
    that's mistakenly encoded as utf-8, decoded in some ugly format like
    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.

    This causes your perfectly good Unicode-aware code to end up with garbage
    text because someone else (or maybe "someone else") made a mistake.

    This function looks for the evidence of that having happened and fixes it.
    It determines whether it should replace nonsense sequences of single-byte
    characters that were really meant to be UTF-8 characters, and if so, turns
    them into the correctly-encoded Unicode character that they were meant to
    represent.

    The input to the function must be Unicode. If you don't have Unicode text,
    you're not using the right tool to solve your problem.

    .. note::
        The following examples are written using unmarked literal strings,
        but they are Unicode text. In Python 2 we have "unicode_literals"
        turned on, and in Python 3 this is always the case.

    ftfy decodes text that looks like it was decoded incorrectly. It leaves
    alone text that doesn't.

        >>> print(fix_text_encoding('Ãºnico'))
        único

        >>> print(fix_text_encoding('This text is fine already :þ'))
        This text is fine already :þ

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

        >>> print(fix_text_encoding('This â€” should be an em dash'))
        This — should be an em dash

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    0x81 that have no mapping in Windows. This is a string that Python's
    standard `.encode` and `.decode` methods cannot correct.

        >>> print(fix_text_encoding('This text is sad .â\x81”.'))
        This text is sad .⁔.

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

        >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
        not such a fan of Charlotte Brontë…”

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoded, and expecting the encoding to
    be consistent:

        >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
        AHÅ™, the new sofa from IKEA®

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

        >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
        This text was never UTF-8 at all…

    The best version of the text is found using
    :func:`ftfy.badness.text_cost`.
    """
    # Only the fixed text is wanted here; the plan that explains how it was
    # fixed is discarded.
    return fix_encoding_and_explain(text)[0]
def fix_encoding_and_explain(text):
    """
    Re-decode text that has been decoded incorrectly, and also return a
    "plan" indicating all the steps required to fix it.

    Single fixing steps are applied repeatedly until one of them leaves the
    text unchanged. Of all the versions seen along the way (including the
    original), the one with the lowest cost is kept.

    Returns a tuple of (best text, plan), where the plan is a list of
    (operation, encoding) tuples. To fix similar text in the same way,
    without having to detect anything, you can use the ``apply_plan``
    function.
    """
    best_version, best_plan = text, []
    best_cost = text_cost(text)

    current = text
    steps_taken = []
    while True:
        previous = current
        current, new_steps = fix_one_step_and_explain(current)
        steps_taken.extend(new_steps)

        cost = text_cost(current)

        # Add a penalty if we used a particularly obsolete encoding. The
        # result is that we won't use these encodings unless they can
        # successfully replace multiple characters.
        used_obsolete_encoding = (
            ('encode', 'macroman') in steps_taken
            or ('encode', 'cp437') in steps_taken
        )
        if used_obsolete_encoding:
            cost += 2

        # We need pretty solid evidence to decode from Windows-1251
        # (Cyrillic).
        if ('encode', 'sloppy-windows-1251') in steps_taken:
            cost += 5

        # Only a strictly cheaper version replaces the best one so far.
        if cost < best_cost:
            best_cost = cost
            best_version = current
            best_plan = list(steps_taken)

        # A step that changes nothing means there is nothing left to try.
        if current == previous:
            return best_version, best_plan
def fix_one_step_and_explain(text):
    """
    Perform a single step of re-decoding text that's been decoded incorrectly.

    Returns a tuple of (text, plan): the decoded text, plus a "plan" for how
    to reproduce what was done, as a list of (operation, encoding) tuples.
    The plan is empty when the text is returned unchanged.

    Raises UnicodeError if `text` is bytes rather than Unicode text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
                    decoding = 'utf-8-variants'
                fixed = encoded_bytes.decode(decoding)
                steps = [('encode', encoding), ('decode', decoding)]
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        # The names collected above come from CHARMAP_ENCODINGS, which lists
        # the Windows-1252 codec as 'sloppy-windows-1252'. Testing for the
        # plain name 'windows-1252' could never succeed, which left this
        # branch unreachable.
        if 'sloppy-windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # Those cases are somewhat rare, and impossible to solve without false
    # positives. If you're in one of these situations, you should try using
    # the `ftfy.guess_bytes` function.

    # Return the text unchanged; the plan is empty.
    return text, []
def apply_plan(text, plan):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, encoding), where
    `operation` is either 'encode' or 'decode', and `encoding` is an encoding
    name such as 'utf-8' or 'latin-1'.

    Because only text can be encoded, and only bytes can be decoded, the plan
    should alternate 'encode' and 'decode' steps, or else this function will
    encounter an error.

    Returns the result of the last step, or `text` itself for an empty plan.
    Raises ValueError for an operation other than 'encode' or 'decode'.
    """
    result = text
    for step in plan:
        operation, encoding = step
        if operation not in ('encode', 'decode'):
            raise ValueError("Unknown plan step: %s" % operation)
        # The operation name is also the name of the method to call.
        result = getattr(result, operation)(encoding)
    return result
HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")


def unescape_html(text):
    """
    Decode all three types of HTML entities/character references: named
    entities, decimal character references, and hexadecimal character
    references. Anything that can't be decoded is left as it is.

    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
    to it for efficiency: it won't match entities longer than 8 characters,
    because there are no valid entities like that.

        >>> print(unescape_html('&lt;tag&gt;'))
        <tag>
    """
    def fixup(match):
        """
        Replace one matched HTML entity with the character it represents,
        if possible.
        """
        entity = match.group(0)

        if entity.startswith("&#"):
            # Numeric character reference: hexadecimal after "&#x",
            # decimal otherwise.
            try:
                if entity.startswith("&#x"):
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except ValueError:
                # Not a number, or not a valid code point: leave as is.
                return entity

        # Named entity: look the name up between the '&' and the ';'.
        codepoint = htmlentitydefs.name2codepoint.get(entity[1:-1])
        if codepoint is None:
            return entity  # unknown name: leave as is
        return unichr(codepoint)

    return HTML_ENTITY_RE.sub(fixup, text)
ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    A sequence is an ESC character, a '[', any number of digits and
    semicolons, and one ASCII letter.

        >>> print(remove_terminal_escapes(
        ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
        ... ))
        I'm blue, da ba dee da ba doo...
    """
    without_escapes = ANSI_RE.sub('', text)
    return without_escapes
SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    Characters U+201C to U+201F become the ASCII double quote, and
    characters U+2018 to U+201B become the ASCII apostrophe.

        >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
        "here's a test"
    """
    doubles_straightened = DOUBLE_QUOTE_RE.sub('"', text)
    return SINGLE_QUOTE_RE.sub("'", doubles_straightened)
def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    Each of the following sequences is replaced by the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication
      protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive
      by misguided software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
      defined by Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly
      not what you meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

        >>> print(fix_line_breaks(
        ...     "This string is made of two things:\u2029"
        ...     "1. Unicode\u2028"
        ...     "2. Spite"
        ... ))
        This string is made of two things:
        1. Unicode
        2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

        >>> def eprint(text):
        ...     print(text.encode('unicode-escape').decode('ascii'))

        >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
        Content-type: text/plain\n\nHi.

        >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
        This is how Microsoft \n trolls Mac users

        >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
        What is this \n I don't even
    """
    # CRLF has to come before the lone CR, so that a CRLF pair turns into
    # one newline rather than two.
    for line_break in ('\r\n', '\r', '\u2028', '\u2029', '\u0085'):
        text = text.replace(line_break, '\n')
    return text
def remove_control_chars(text):
    """
    Remove all control characters except for the important ones.

    The characters to delete are exactly the code points that the
    `CONTROL_CHARS` translation table (from `ftfy.chardata`) maps to None.
    That table is built from the C0 control characters:

    - U+0000 to U+0008
    - U+000B
    - U+000E to U+001F

    NOTE(review): U+007F is removed only if `CONTROL_CHARS` has an entry for
    it -- confirm against the table's definition.

    It leaves alone these characters that are commonly used for formatting:

    - TAB (U+0009)
    - LF (U+000A)
    - FF (U+000C)
    - CR (U+000D)
    """
    cleaned = text.translate(CONTROL_CHARS)
    return cleaned
def remove_bom(text):
    r"""
    Remove a left-over byte-order mark.

    Every U+FEFF character at the very start of the text is stripped; one
    that appears anywhere else is left alone.

        >>> print(remove_bom("\ufeffWhere do you want to go today?"))
        Where do you want to go today?
    """
    byte_order_mark = unichr(0xfeff)
    return text.lstrip(byte_order_mark)
def remove_unsafe_private_use(text):
    r"""
    Remove every character from Supplementary Private Use Area B.

    Python 3.3's Unicode support isn't perfect, and in fact there are certain
    string operations that will crash some versions of it with a SystemError:
    http://bugs.python.org/issue18183

    The best solution is to remove all characters from Supplementary Private
    Use Area B, using a regex that is known not to crash given those
    characters.

    These are the characters from U+100000 to U+10FFFF. It's sad to lose an
    entire plane of Unicode, but on the other hand, these characters are not
    assigned and never will be. If you get one of these characters and don't
    know what its purpose is, its purpose is probably to crash your code.

    If you were using these for actual private use, this might be inconvenient.
    You can turn off this fixer, of course, but I kind of encourage using
    Supplementary Private Use Area A instead.

        >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
        💩

    This fixer is off by default in Python 3.4 or later. (The bug is actually
    fixed in 3.3.3 and 2.7.6, but I don't want the default behavior to change
    based on a micro version upgrade of Python.)
    """
    safe_text = UNSAFE_PRIVATE_USE_RE.sub('', text)
    return safe_text
# A regex that matches the escape sequences that are valid in Python string
# literals. It is written in verbose mode, so the whitespace and #-comments
# inside the pattern are not part of what it matches.
ESCAPE_SEQUENCE_RE = re.compile(r'''
( \\U........ # 8-digit hex escapes
| \\u.... # 4-digit hex escapes
| \\x.. # 2-digit hex escapes
| \\[0-7]{1,3} # Octal escapes
| \\N\{[^}]+\} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
)''', re.UNICODE | re.VERBOSE)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

        >>> factoid = '\\u20a1 is the currency symbol for the colón.'
        >>> print(factoid[1:])
        u20a1 is the currency symbol for the colón.
        >>> print(decode_escapes(factoid))
        ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.

    This fix cannot be automatically applied by the `ftfy.fix_text` function,
    because escaped text is not necessarily a mistake, and there is no way
    to distinguish text that's supposed to be escaped from text that isn't.
    """
    # Hand only the matched escape sequence to the codec; everything else in
    # the text is left exactly as it is.
    return ESCAPE_SEQUENCE_RE.sub(
        lambda match: codecs.decode(match.group(0), 'unicode-escape'),
        text
    )

View File

@ -0,0 +1,39 @@
"""
This file defines a general method for evaluating ftfy using data that arrives
in a stream. A concrete implementation of it is found in `twitter_tester.py`.
"""
from __future__ import print_function, unicode_literals
from ftfy.fixes import fix_text_encoding
from ftfy.chardata import possible_encoding
class StreamTester:
    """
    Take in a sequence of texts, and show the ones that will be changed by
    ftfy. This will also periodically show updates, such as the proportion of
    texts that changed.
    """
    def __init__(self):
        # How many texts have been checked, and how many of them ftfy changed.
        self.count = 0
        self.num_fixed = 0

    def check_ftfy(self, text):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.

        Every call counts toward the periodic status output: a dot after
        each 100 texts, and a summary line after each 10000.
        """
        self.count += 1

        # Text that is entirely ASCII is not run through the fixer at all.
        if possible_encoding(text, 'ascii'):
            fixed = text
        else:
            fixed = fix_text_encoding(text)

        if fixed != text:
            # possibly filter common bots before printing
            print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                text=text, fixed=fixed
            ))
            self.num_fixed += 1

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))

View File

@ -0,0 +1,73 @@
# coding: utf-8
"""
Do what is necessary to authenticate this tester as a Twitter "app", using
somebody's Twitter account.
"""
from __future__ import unicode_literals
import os
AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')
def get_auth():
    """
    Build the OAuth credentials that let this tester talk to Twitter.

    Twitter has some bizarre requirements about how to authorize an "app" to
    use its API.

    The user of the app has to log in to get a secret token. That's fine. But
    the app itself has its own "consumer secret" token. The app has to know it,
    and the user of the app has to not know it.

    This is, of course, impossible. It's equivalent to DRM. Your computer can't
    *really* make use of secret information while hiding the same information
    from you.

    The threat appears to be that, if you have this super-sekrit token, you can
    impersonate the app while doing something different. Well, of course you
    can do that, because you *have the source code* and you can change it to do
    what you want. You still have to log in as a particular user who has a
    token that's actually secret, you know.

    Even developers of closed-source applications that use the Twitter API are
    unsure what to do, for good reason. These "secrets" are not secret in any
    cryptographic sense. A bit of Googling shows that the secret tokens for
    every popular Twitter app are already posted on the Web.

    Twitter wants us to pretend this string can be kept secret, and hide this
    secret behind a fig leaf like everybody else does. So that's what we've
    done.

    The user's token is read from AUTH_TOKEN_PATH when that file exists.
    Otherwise the directory for it is created if needed and `oauth_dance` is
    run to obtain a token. Returns an `OAuth` object.
    """
    from twitter.oauth import OAuth
    from twitter import oauth_dance, read_token_file

    def unhide(secret):
        """
        Do something mysterious and exactly as secure as every other Twitter
        app.
        """
        return ''.join(chr(ord(char) - 0x2800) for char in secret)

    fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
    consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'
    consumer_secret = unhide(fig_leaf)

    if not os.path.exists(AUTH_TOKEN_PATH):
        # No saved token yet: make sure its directory exists, then send the
        # user through the interactive authorization.
        authdir = os.path.dirname(AUTH_TOKEN_PATH)
        if not os.path.exists(authdir):
            os.makedirs(authdir)
        token, token_secret = oauth_dance(
            app_name='ftfy-tester',
            consumer_key=consumer_key,
            consumer_secret=consumer_secret,
            token_filename=AUTH_TOKEN_PATH
        )
    else:
        token, token_secret = read_token_file(AUTH_TOKEN_PATH)

    return OAuth(
        token=token,
        token_secret=token_secret,
        consumer_key=consumer_key,
        consumer_secret=consumer_secret
    )

View File

@ -0,0 +1,89 @@
"""
Implements a StreamTester that runs over Twitter data. See the class
docstring.
This module is written for Python 3 only. The __future__ imports you see here
are just to let Python 2 scan the file without crashing with a SyntaxError.
"""
from __future__ import print_function, unicode_literals
import os
from collections import defaultdict
from ftfy.streamtester import StreamTester
class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (defined in `__init__.py`) to
    evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

    - Success: the process takes in mojibake and outputs correct text.
    - False positive: the process takes in correct text, and outputs
      mojibake. Every false positive should be considered a bug, and
      reported on GitHub if it isn't already.
    - Confusion: the process takes in mojibake and outputs different
      mojibake. Not a great outcome, but not as dire as a false
      positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """
    # Directory that the per-language tweet logs are written to.
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        # Tweet texts collected since the last save, keyed by the language
        # code of the tweeting user.
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.

        Appends the collected lines to one file per language in OUTPUT_DIR
        (creating the directory if needed), one tweet per line, and then
        clears the collection.
        """
        if not os.path.exists(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            # Tweets contain arbitrary Unicode, so write UTF-8 explicitly
            # instead of relying on the locale's default encoding. The
            # `with` block closes the file even if a write fails.
            with open(fullname, 'a', encoding='utf-8') as langfile:
                for line in lines:
                    print(line.replace('\n', ' '), file=langfile)
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.
        """
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth
        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            if 'text' in tweet:
                self.check_ftfy(tweet['text'])
                if 'user' in tweet:
                    lang = tweet['user'].get('lang', 'NONE')
                    self.lines_by_lang[lang].append(tweet['text'])
                # Flush the collected tweets to disk once per 10000 texts.
                if self.count % 10000 == 100:
                    self.save_files()
def main():
    """
    Command-line entry point: create a TwitterTester and run it over the
    live Twitter stream forever, or at least until the stream drops.
    """
    TwitterTester().run_sample()
if __name__ == '__main__':
main()

View File

@ -18,22 +18,23 @@
import os
from sickbeard import logger
import sickbeard
from sickbeard import logger
import ftfy
import ftfy.bad_codecs
# This module tries to deal with the seemingly random behavior of Python when converting between Unicode and
# UTF-8 encodings. It tries to just use Unicode, but if that fails it tries forcing the value to UTF-8. Any
# function which returns something should always return Unicode.
def fixStupidEncodings(x, silent=False):
if type(x) == str:
if type(x) in [str, unicode]:
try:
return x.decode(sickbeard.SYS_ENCODING)
return ftfy.fix_text(u'' + x).decode(sickbeard.SYS_ENCODING)
except UnicodeDecodeError:
logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
return None
elif type(x) == unicode:
return x
else:
logger.log(
u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
@ -49,12 +50,12 @@ def fixListEncodings(x):
def callPeopleStupid(x):
try:
return x.encode(sickbeard.SYS_ENCODING)
return ftfy.fix_text(x).encode(sickbeard.SYS_ENCODING)
except (UnicodeEncodeError, UnicodeDecodeError):
logger.log(
u"YOUR COMPUTER SUCKS! Your data is being corrupted by a bad locale/encoding setting. Report this error on the forums or IRC please: " + repr(
x) + ", " + sickbeard.SYS_ENCODING, logger.ERROR)
return x.encode(sickbeard.SYS_ENCODING, 'ignore')
return ftfy.fix_text(x).encode(sickbeard.SYS_ENCODING, 'ignore')
def ek(func, *args, **kwargs):
if os.name == 'nt':

View File

@ -26,6 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
from sickbeard.history import dateFormat
from sickbeard.common import Quality
from sickbeard.common import WANTED, FAILED
from encodingKludge import fixStupidEncodings
def prepareFailedName(release):
@ -36,9 +37,7 @@ def prepareFailedName(release):
fixed = fixed.rpartition(".")[0]
fixed = re.sub("[\.\-\+\ ]", "_", fixed)
if not isinstance(fixed, unicode):
fixed = unicode(fixed, 'utf-8', 'replace')
fixed = fixStupidEncodings(fixed)
return fixed

View File

@ -20,6 +20,7 @@ import db
import datetime
from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
from encodingKludge import fixStupidEncodings
dateFormat = "%Y%m%d%H%M%S"
@ -27,9 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"
def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
logDate = datetime.datetime.today().strftime(dateFormat)
if not isinstance(resource, unicode):
resource = unicode(resource, 'utf-8', 'replace')
resource = fixStupidEncodings(resource)
myDB = db.DBConnection()
myDB.action(

View File

@ -29,6 +29,7 @@ import sickbeard
from sickbeard import logger, common
from sickbeard import db
from encodingKludge import fixStupidEncodings
from sickbeard.exceptions import ex
@ -50,7 +51,7 @@ class EmailNotifier:
ep_name: The name of the episode that was snatched
title: The title of the notification (optional)
"""
ep_name = ep_name.encode('utf-8', 'replace')
ep_name = fixStupidEncodings(ep_name)
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
show = self._parseEp(ep_name)
@ -85,7 +86,7 @@ class EmailNotifier:
ep_name: The name of the episode that was downloaded
title: The title of the notification (optional)
"""
ep_name = ep_name.encode('utf-8', 'replace')
ep_name = fixStupidEncodings(ep_name)
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
show = self._parseEp(ep_name)
@ -120,7 +121,7 @@ class EmailNotifier:
ep_name: The name of the episode that was downloaded
lang: Subtitle language wanted
"""
ep_name = ep_name.encode('utf-8', 'replace')
ep_name = fixStupidEncodings(ep_name)
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
show = self._parseEp(ep_name)
@ -197,7 +198,7 @@ class EmailNotifier:
return False
def _parseEp(self, ep_name):
ep_name = ep_name.encode('utf-8', 'replace')
ep_name = fixStupidEncodings(ep_name)
sep = " - "
titles = ep_name.split(sep)

View File

@ -23,13 +23,14 @@ import xml.etree.cElementTree as etree
import xml.etree
import re
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
from sickbeard import logger, classes, helpers
from sickbeard.common import Quality
from sickbeard import encodingKludge as ek
from sickbeard.exceptions import ex
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
from encodingKludge import fixStupidEncodings
def getSeasonNZBs(name, urlData, season):
try:
@ -84,7 +85,7 @@ def createNZBString(fileElements, xmlns):
for curFile in fileElements:
rootElement.append(stripNS(curFile, xmlns))
return xml.etree.ElementTree.tostring(rootElement, 'utf-8', 'replace')
return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))
def saveNZB(nzbName, nzbString):

View File

@ -20,13 +20,14 @@ import re
import time
import threading
import datetime
import sickbeard
from lib import adba
import sickbeard
import adba
from sickbeard import helpers
from sickbeard import name_cache
from sickbeard import logger
from sickbeard import db
from encodingKludge import fixStupidEncodings
exception_dict = {}
anidb_exception_dict = {}
@ -233,8 +234,7 @@ def retrieve_exceptions():
# if this exception isn't already in the DB then add it
if cur_exception not in existing_exceptions:
if not isinstance(cur_exception, unicode):
cur_exception = unicode(cur_exception, 'utf-8', 'replace')
cur_exception = fixStupidEncodings(cur_exception)
myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
[cur_indexer_id, cur_exception, curSeason])
@ -267,9 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
exceptionsCache[indexer_id][season] = scene_exceptions
for cur_exception in scene_exceptions:
if not isinstance(cur_exception, unicode):
cur_exception = unicode(cur_exception, 'utf-8', 'replace')
cur_exception = fixStupidEncodings(cur_exception)
myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
[indexer_id, cur_exception, season])

View File

@ -20,19 +20,20 @@ from __future__ import with_statement
import time
import datetime
import itertools
import sickbeard
from sickbeard import db
from sickbeard import logger
from sickbeard.common import Quality
from sickbeard import helpers, show_name_helpers
from sickbeard.exceptions import MultipleShowObjectsException
from sickbeard.exceptions import AuthException
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
from sickbeard.rssfeeds import RSSFeeds
from sickbeard import clients
import itertools
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
from encodingKludge import fixStupidEncodings
class CacheDBConnection(db.DBConnection):
def __init__(self, providerName):
@ -262,8 +263,7 @@ class TVCache():
# get quality of release
quality = parse_result.quality
if not isinstance(name, unicode):
name = unicode(name, 'utf-8', 'replace')
name = fixStupidEncodings(name)
# get release group
release_group = parse_result.release_group

View File

@ -64,8 +64,8 @@ from browser import WebFileBrowser
from lib.dateutil import tz
from lib.unrar2 import RarFile
from lib import subliminal
from trakt import TraktCall
from lib import adba, subliminal
from lib.trakt import TraktCall
try:
import json
@ -77,7 +77,6 @@ try:
except ImportError:
import xml.etree.ElementTree as etree
from lib import adba
from Cheetah.Template import Template
from tornado.web import RequestHandler, HTTPError, asynchronous
@ -3289,7 +3288,7 @@ class ErrorLogs(MainHandler):
for x in reversed(data):
x = x.decode('utf-8', 'replace')
x = ek.fixStupidEncodings(x)
match = re.match(regex, x)
if match: