Mirror of https://github.com/moparisthebest/SickRage (synced 2024-11-04 08:25:04 -05:00)
Removed FTFY due to Python 2.6 compatibility issues.
Re-coded the encodingKludge encode/decode helpers for unicode <-> UTF-8.
parent 468af14dfd · commit 360c3afa08
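The replacement encode/decode helpers themselves are not part of this diff. As a rough, hedged sketch of the unicode <-> UTF-8 kludge the commit message describes (the function names and fallback behavior here are assumptions, not SickRage's actual encodingKludge API):

    # Illustrative sketch only -- not the code from this commit.
    def to_unicode(value, encoding='utf-8'):
        # Decode bytes to unicode; fall back to replacement characters
        # rather than raising on malformed input.
        if isinstance(value, bytes):
            return value.decode(encoding, 'replace')
        return value

    def to_utf8(value):
        # Encode unicode to UTF-8 bytes; pass byte strings through as-is.
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')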
ftfy/__init__.py
@@ -1,351 +0,0 @@
# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

from __future__ import unicode_literals

# See the docstring for ftfy.bad_codecs to see what we're doing here.
import ftfy.bad_codecs
ftfy.bad_codecs.ok()

from ftfy import fixes
from ftfy.fixes import fix_text_encoding
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
import unicodedata
import warnings


def fix_text(text,
             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True,
             max_decode_length=2**16):
    r"""
    Given Unicode text as input, make its representation consistent and
    possibly less broken.

    Let's start with some examples:

        >>> print(fix_text('ünicode'))
        ünicode

        >>> print(fix_text('Broken text&hellip; it&#x2019;s flubberific!'))
        Broken text... it's flubberific!

        >>> print(fix_text('HTML entities &lt;3'))
        HTML entities <3

        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
        <em>HTML entities &lt;3</em>

        >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
        ...                'doo&#133;\033[0m'))
        I'm blue, da ba dee da ba doo...

        >>> # This example string starts with a byte-order mark, even if
        >>> # you can't see it on the Web.
        >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
        Party like
        it's 1999!

        >>> len(fix_text('ﬁ' * 100000))
        200000

        >>> len(fix_text(''))
        0

    Based on the options you provide, ftfy applies these steps in order:

    - If `remove_unsafe_private_use` is True, it removes a range of private-use
      characters that could trigger a Python bug. The bug is fixed in
      the most recent versions of Python, so this will default to False
      starting on Python 3.4.
    - If `fix_entities` is True, replace HTML entities with their equivalent
      characters. If it's "auto" (the default), then consider replacing HTML
      entities, but don't do so in text where you have seen a pair of actual
      angle brackets (that's probably actually HTML and you shouldn't mess
      with the entities).
    - If `remove_terminal_escapes` is True, remove sequences of bytes that are
      instructions for Unix terminals, such as the codes that make text appear
      in different colors.
    - If `fix_encoding` is True, look for common mistakes that come from
      encoding or decoding Unicode text incorrectly, and fix them if they are
      reasonably fixable. See `fix_text_encoding` for details.
    - If `normalization` is not None, apply the specified form of Unicode
      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
      The default, 'NFKC', applies the following relevant transformations:

      - C: Combine characters and diacritics that are written using separate
        code points, such as converting "e" plus an acute accent modifier
        into "é", or converting "ka" (か) plus a dakuten into the
        single character "ga" (が).
      - K: Replace characters that are functionally equivalent with the most
        common form. For example, half-width katakana will be replaced with
        full-width versions, full-width Roman characters will be replaced with
        ASCII characters, ellipsis characters will be replaced with three
        periods, and the ligature 'ﬂ' will be replaced with 'fl'.

    - If `uncurl_quotes` is True, replace various curly quotation marks with
      plain-ASCII straight quotes.
    - If `fix_line_breaks` is true, convert all line breaks to Unix style
      (CRLF and CR line breaks become LF line breaks).
    - If `remove_control_chars` is true, remove all C0 control characters
      except the common useful ones: TAB, CR, LF, and FF. (CR characters
      may have already been removed by the `fix_line_breaks` step.)
    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
    - If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&amp;", for example, not "&".

    `fix_text` will work one line at a time, with the possibility that some
    lines are in different encodings. When it encounters lines longer than
    `max_decode_length`, it will not run the `fix_encoding` step, to avoid
    unbounded slowdowns.

    If you are certain your entire text is in the same encoding (though that
    encoding is possibly flawed), and do not mind performing operations on
    the whole text at once, use `fix_text_segment`.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find('\n', pos) + 1
        fix_encoding_this_time = fix_encoding
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > max_decode_length:
            fix_encoding_this_time = False

        substring = text[pos:textbreak]

        if fix_entities == 'auto' and '<' in substring and '>' in substring:
            # we see angle brackets together; this could be HTML
            fix_entities = False

        out.append(
            fix_text_segment(
                substring,
                remove_unsafe_private_use=remove_unsafe_private_use,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=fix_encoding_this_time,
                normalization=normalization,
                uncurl_quotes=uncurl_quotes,
                fix_line_breaks=fix_line_breaks,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom
            )
        )
        pos = textbreak

    return ''.join(out)

ftfy = fix_text


def fix_file(input_file,
             remove_unsafe_private_use=True,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True):
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then unfortunately, we have to guess what encoding it is. We'll try
    a few common encodings, but we make no promises. See the `guess_bytes`
    function for how this is done.

    The output is a stream of fixed lines of text.
    """
    entities = fix_entities
    for line in input_file:
        if isinstance(line, bytes):
            line, encoding = guess_bytes(line)
        if fix_entities == 'auto' and '<' in line and '>' in line:
            entities = False
        yield fix_text_segment(
            line,
            remove_unsafe_private_use=remove_unsafe_private_use,
            fix_entities=entities,
            remove_terminal_escapes=remove_terminal_escapes,
            fix_encoding=fix_encoding,
            normalization=normalization,
            uncurl_quotes=uncurl_quotes,
            fix_line_breaks=fix_line_breaks,
            remove_control_chars=remove_control_chars,
            remove_bom=remove_bom
        )


def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text


def guess_bytes(bstring):
    """
    If you have some bytes in an unknown encoding, here's a reasonable
    strategy for decoding them, by trying a few common encodings that
    can be distinguished from each other.

    This is not a magic bullet. If the bytes are coming from some MySQL
    database with the "character set" set to ISO Elbonian, this won't figure
    it out. Perhaps more relevantly, this currently doesn't try East Asian
    encodings.

    The encodings we try are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global de facto standard
    - "utf-8-variants", because it's what people actually implement when they
      think they're doing UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding
    """
    if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
        return bstring.decode('utf-16'), 'utf-16'

    byteset = set(bytes(bstring))
    byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'

    try:
        if byte_ed in byteset or byte_c0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode('utf-8-variants'), 'utf-8-variants'
        else:
            return bstring.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass

    if byte_CR in bstring and byte_LF not in bstring:
        return bstring.decode('macroman'), 'macroman'
    else:
        return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'


def explain_unicode(text):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 (       [Ps] LEFT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 °       [So] DEGREE SIGN
    U+25A1 □       [So] WHITE SQUARE
    U+00B0 °       [So] DEGREE SIGN
    U+0029 )       [Pe] RIGHT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020         [Zs] SPACE
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if is_printable(char):
            display = char
        else:
            display = char.encode('unicode-escape').decode('ascii')
        print('U+{code:04X} {display:<7} [{category}] {name}'.format(
            display=display,
            code=ord(char),
            category=unicodedata.category(char),
            name=unicodedata.name(char, '<unknown>')
        ))


def fix_bad_encoding(text):
    """
    Kept for compatibility with previous versions of ftfy.
    """
    warnings.warn(
        'fix_bad_encoding is now known as fix_text_encoding',
        DeprecationWarning
    )
    return fix_text_encoding(text)
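`guess_bytes` has no doctest of its own; a brief illustrative call, following the strategy described in its docstring (plain UTF-8 input, so the second branch applies):

    # Illustrative use of guess_bytes (not part of the original file).
    from ftfy import guess_bytes
    text, encoding = guess_bytes(b'\xe2\x80\x9cquoted\xe2\x80\x9d')
    print(encoding)   # 'utf-8'
    print(text)       # '“quoted”'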
ftfy/bad_codecs/__init__.py
@@ -1,94 +0,0 @@
# coding: utf-8
r"""
Give Python the ability to decode some common, flawed encodings.

Python does not want you to be sloppy with your text. Its encoders and decoders
("codecs") follow the relevant standards whenever possible, which means that
when you get text that *doesn't* follow those standards, you'll probably fail
to decode it. Or you might succeed at decoding it for implementation-specific
reasons, which is perhaps worse.

There are some encodings out there that Python wishes didn't exist, which are
widely used outside of Python:

- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
  ever-popular CESU-8 and "Java modified UTF-8".
- "Sloppy" versions of character map encodings, where bytes that don't map to
  anything will instead map to the Unicode character with the same number.

Simply importing this module, or in fact any part of the `ftfy` package, will
make these new "bad codecs" available to Python through the standard Codecs
API. You never have to actually call any functions inside `ftfy.bad_codecs`.

However, if you want to call something because your code checker insists on it,
you can call ``ftfy.bad_codecs.ok()``.

A quick example of decoding text that's encoded in CESU-8:

    >>> import ftfy.bad_codecs
    >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
    😍
"""
from __future__ import unicode_literals
from encodings import normalize_encoding
import codecs

_CACHE = {}

# Define some aliases for 'utf-8-variants'. All hyphens get turned into
# underscores, because of `normalize_encoding`.
UTF8_VAR_NAMES = (
    'utf_8_variants', 'utf8_variants',
    'utf_8_variant', 'utf8_variant',
    'utf_8_var', 'utf8_var',
    'cesu_8', 'cesu8',
    'java_utf_8', 'java_utf8'
)


def search_function(encoding):
    """
    Register our "bad codecs" with Python's codecs API. This involves adding
    a search function that takes in an encoding name, and returns a codec
    for that encoding if it knows one, or None if it doesn't.

    The encodings this will match are:

    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
      where the non-sloppy version is an encoding that leaves some bytes
      unmapped to characters.
    - The 'utf-8-variants' encoding, which has the several aliases seen
      above.
    """
    if encoding in _CACHE:
        return _CACHE[encoding]

    norm_encoding = normalize_encoding(encoding)
    codec = None
    if norm_encoding in UTF8_VAR_NAMES:
        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
        codec = CODEC_INFO
    elif norm_encoding.startswith('sloppy_'):
        from ftfy.bad_codecs.sloppy import CODECS
        codec = CODECS.get(norm_encoding)

    if codec is not None:
        _CACHE[encoding] = codec

    return codec


def ok():
    """
    A feel-good function that gives you something to call after importing
    this package.

    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
    and appear not to use it. It doesn't know that you're using it when
    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
    encodings.
    """
    pass


codecs.register(search_function)
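Once the module is imported, the registered search function resolves the new names through the standard codec registry; a small illustrative check (not part of the original file):

    import codecs
    import ftfy.bad_codecs  # registers the search function on import

    print(codecs.lookup('utf-8-variants').name)           # 'utf-8-variants'
    print(b'\x80\x81\x82'.decode('sloppy-windows-1252'))  # € \x81 ‚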
ftfy/bad_codecs/sloppy.py
@@ -1,156 +0,0 @@
# coding: utf-8
r"""
Decodes single-byte encodings, filling their "holes" in the same messy way that
everyone else does.

A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.

Python, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.

Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.

These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. This module defines a sloppy version of
many single-byte encodings with holes. (There is no need for a sloppy version
of an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)

The following encodings will become defined:

- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)

Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.

Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
the rest are rather uncommon.

Here are some examples, using `ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:

    >>> from ftfy import explain_unicode
    >>> some_bytes = b'\x80\x81\x82'
    >>> explain_unicode(some_bytes.decode('latin-1'))
    U+0080 \x80    [Cc] <unknown>
    U+0081 \x81    [Cc] <unknown>
    U+0082 \x82    [Cc] <unknown>

    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
    U+20AC €       [Sc] EURO SIGN
    U+FFFD �       [So] REPLACEMENT CHARACTER
    U+201A ‚       [Ps] SINGLE LOW-9 QUOTATION MARK

    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
    U+20AC €       [Sc] EURO SIGN
    U+0081 \x81    [Cc] <unknown>
    U+201A ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
"""
from __future__ import unicode_literals
import codecs
from encodings import normalize_encoding

REPLACEMENT_CHAR = '\ufffd'


def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.
    """
    # Make an array of all 256 possible bytes.
    all_bytes = bytearray(range(256))

    # Get a list of what they would decode to in Latin-1.
    sloppy_chars = list(all_bytes.decode('latin-1'))

    # Get a list of what they decode to in the given encoding. Use the
    # replacement character for unassigned bytes.
    decoded_chars = all_bytes.decode(encoding, 'replace')

    # Update the sloppy_chars list. Each byte that was successfully decoded
    # gets its decoded value in the list. The unassigned bytes are left as
    # they are, which gives their decoding in Latin-1.
    for i, char in enumerate(decoded_chars):
        if char != REPLACEMENT_CHAR:
            sloppy_chars[i] = char

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = ''.join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except I made it follow pep8.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )

# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
CODECS = {}
INCOMPLETE_ENCODINGS = (
    ['windows-%s' % num for num in range(1250, 1259)] +
    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
)

for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)
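Because every unmapped byte borrows its Latin-1 meaning, a sloppy codec decodes any byte string at all, and re-encoding restores the original bytes; a short illustration (not part of the original file):

    import ftfy.bad_codecs  # make the sloppy codecs available

    raw = bytes(bytearray(range(256)))
    text = raw.decode('sloppy-windows-1252')           # never raises
    assert text.encode('sloppy-windows-1252') == raw   # lossless round trip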
ftfy/bad_codecs/utf8_variants.py
@@ -1,281 +0,0 @@
r"""
This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
decode text that's been encoded with a popular non-standard version of UTF-8.
This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
codepoint 0.

This is particularly relevant in Python 3, which provides no other way of
decoding CESU-8 or Java's encoding. [1]

The easiest way to use the codec is to simply import `ftfy.bad_codecs`:

    >>> import ftfy.bad_codecs
    >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
    >>> print(repr(result).lstrip('u'))
    'here comes a null! \x00'

The codec does not at all enforce "correct" CESU-8. For example, the Unicode
Consortium's not-quite-standard describing CESU-8 requires that there is only
one possible encoding of any character, so it does not allow mixing of valid
UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
decoder does.

Characters in the Basic Multilingual Plane still have only one encoding. This
codec still enforces the rule, within the BMP, that characters must appear in
their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
instead of just `0x00`, may be used to encode the null character `U+0000`, like
in Java.

If you encode with this codec, you get legitimate UTF-8. Decoding with this
codec and then re-encoding is not idempotent, although encoding and then
decoding is. So this module won't produce CESU-8 for you. Look for that
functionality in the sister module, "Breaks Text For You", coming approximately
never.

[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
decode the bytes (incorrectly), then encode them, then decode them again, using
UTF-8 as the codec every time.
"""

from __future__ import unicode_literals
from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
                             IncrementalEncoder as UTF8IncrementalEncoder)
import re
import codecs

NAME = 'utf-8-variants'
# This regular expression matches all possible six-byte CESU-8 sequences.
CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')


class IncrementalDecoder(UTF8IncrementalDecoder):
    """
    An incremental decoder that extends Python's built-in UTF-8 decoder.

    This decoder needs to take in bytes, possibly arriving in a stream, and
    output the correctly decoded text. The general strategy for doing this
    is to fall back on the real UTF-8 decoder whenever possible, because
    the real UTF-8 decoder is way optimized, but to call specialized methods
    we define here for the cases the real decoder isn't expecting.
    """
    def _buffer_decode(self, input, errors, final):
        """
        Decode bytes that may be arriving in a stream, following the Codecs
        API.

        `input` is the incoming sequence of bytes. `errors` tells us how to
        handle errors, though we delegate all error-handling cases to the real
        UTF-8 decoder to ensure correct behavior. `final` indicates whether
        this is the end of the sequence, in which case we should raise an
        error given incomplete input.

        Returns as much decoded text as possible, and the number of bytes
        consumed.
        """
        # decoded_segments are the pieces of text we have decoded so far,
        # and position is our current position in the byte string. (Bytes
        # before this position have been consumed, and bytes after it have
        # yet to be decoded.)
        decoded_segments = []
        position = 0
        while True:
            # Use _buffer_decode_step to decode a segment of text.
            decoded, consumed = self._buffer_decode_step(
                input[position:],
                errors,
                final
            )
            if consumed == 0:
                # Either there's nothing left to decode, or we need to wait
                # for more input. Either way, we're done for now.
                break

            # Append the decoded text to the list, and update our position.
            decoded_segments.append(decoded)
            position += consumed

        if final:
            # _buffer_decode_step must consume all the bytes when `final` is
            # true.
            assert position == len(input)

        return ''.join(decoded_segments), position

    def _buffer_decode_step(self, input, errors, final):
        """
        There are three possibilities for each decoding step:

        - Decode as much real UTF-8 as possible.
        - Decode a six-byte CESU-8 sequence at the current position.
        - Decode a Java-style null at the current position.

        This method figures out which step is appropriate, and does it.
        """
        # Get a reference to the superclass method that we'll be using for
        # most of the real work.
        sup = UTF8IncrementalDecoder._buffer_decode

        # Find the next byte position that indicates a variant of UTF-8.
        # CESU-8 sequences always start with 0xed, and Java nulls always
        # start with 0xc0, both of which are conveniently impossible in
        # real UTF-8.
        cutoff1 = input.find(b'\xed')
        cutoff2 = input.find(b'\xc0')

        # Set `cutoff` to whichever cutoff comes first.
        if cutoff1 != -1 and cutoff2 != -1:
            cutoff = min(cutoff1, cutoff2)
        elif cutoff1 != -1:
            cutoff = cutoff1
        elif cutoff2 != -1:
            cutoff = cutoff2
        else:
            # The entire input can be decoded as UTF-8, so just do so.
            return sup(input, errors, final)

        if cutoff1 == 0:
            # Decode a possible six-byte sequence starting with 0xed.
            return self._buffer_decode_surrogates(sup, input, errors, final)
        elif cutoff2 == 0:
            # Decode a possible two-byte sequence, 0xc0 0x80.
            return self._buffer_decode_null(sup, input, errors, final)
        else:
            # Decode the bytes up until the next weird thing as UTF-8.
            # Set final=True because 0xc0 and 0xed don't make sense in the
            # middle of a sequence, in any variant.
            return sup(input[:cutoff], errors, True)

    @staticmethod
    def _buffer_decode_null(sup, input, errors, final):
        """
        Decode the bytes 0xc0 0x80 as U+0000, like Java does.
        """
        nextbyte = input[1:2]
        if nextbyte == b'':
            if final:
                # We found 0xc0 at the end of the stream, which is an error.
                # Delegate to the superclass method to handle that error.
                return sup(input, errors, final)
            else:
                # We found 0xc0 and we don't know what comes next, so consume
                # no bytes and wait.
                return '', 0
        elif nextbyte == b'\x80':
            # We found the usual 0xc0 0x80 sequence, so decode it and consume
            # two bytes.
            return '\u0000', 2
        else:
            # We found 0xc0 followed by something else, which is an error.
            # Whatever should happen is equivalent to what happens when the
            # superclass is given just the byte 0xc0, with final=True.
            return sup(b'\xc0', errors, True)

    @staticmethod
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

            11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error.
                if PYTHON2 and len(input) >= 3:
                    # We can't trust Python 2 to raise an error when it's
                    # asked to decode a surrogate, so let's force the issue.
                    input = mangle_surrogates(input)
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass to decode as usual -- except
                # for working around the Python 2 discrepancy as before.
                if PYTHON2:
                    input = mangle_surrogates(input)
                return sup(input[:3], errors, False)


def mangle_surrogates(bytestring):
    """
    When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
    it as an error (which it is). In 'replace' mode, it will decode as three
    replacement characters. But Python 2 will just output the surrogate
    codepoint.

    To ensure consistency between Python 2 and Python 3, and protect downstream
    applications from malformed strings, we turn surrogate sequences at the
    start of the string into the bytes `ff ff ff`, which we're *sure* won't
    decode, and which turn into three replacement characters in 'replace' mode.
    """
    if PYTHON2:
        if bytestring.startswith(b'\xed') and len(bytestring) >= 3:
            decoded = bytestring[:3].decode('utf-8', 'replace')
            if '\ud800' <= decoded <= '\udfff':
                return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:])
        return bytestring
    else:
        # On Python 3, nothing needs to be done.
        return bytestring

# The encoder is identical to UTF-8.
IncrementalEncoder = UTF8IncrementalEncoder


# Everything below here is boilerplate that matches the modules in the
# built-in `encodings` package.
def encode(input, errors='strict'):
    return IncrementalEncoder(errors).encode(input, final=True), len(input)


def decode(input, errors='strict'):
    return IncrementalDecoder(errors).decode(input, final=True), len(input)


class StreamWriter(codecs.StreamWriter):
    encode = encode


class StreamReader(codecs.StreamReader):
    decode = decode


CODEC_INFO = codecs.CodecInfo(
    name=NAME,
    encode=encode,
    decode=decode,
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
    streamreader=StreamReader,
    streamwriter=StreamWriter,
)
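As a worked instance of the surrogate arithmetic in `_buffer_decode_surrogates`, the six bytes from the module docstring carry the 20-bit value for U+1F60D:

    # ed a0 bd ed b8 8d -> U+1F60D, following the bit layout shown above.
    bytenums = [0xed, 0xa0, 0xbd, 0xed, 0xb8, 0x8d]
    codepoint = (((bytenums[1] & 0x0f) << 16) +
                 ((bytenums[2] & 0x3f) << 10) +
                 ((bytenums[4] & 0x0f) << 6) +
                 (bytenums[5] & 0x3f) +
                 0x10000)
    assert codepoint == 0x1F60D  # the emoji in the docstring example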
ftfy/badness.py
@@ -1,144 +0,0 @@
# -*- coding: utf-8 -*-
"""
Heuristics to determine whether re-encoding text is actually making it
more reasonable.
"""

from __future__ import unicode_literals
from ftfy.chardata import chars_to_classes
import re
import unicodedata

# The following regex uses the mapping of character classes to ASCII
# characters defined in chardata.py and build_data.py:
#
# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
#   = Whitespace
# o = Other


def _make_weirdness_regex():
    """
    Creates a list of regexes that match 'weird' character sequences.
    The more matches there are, the weirder the text is.
    """
    groups = []

    # Match lowercase letters that are followed by non-ASCII uppercase letters
    groups.append('lA')

    # Match diacritical marks, except when they modify a non-cased letter or
    # another mark.
    #
    # You wouldn't put a diacritical mark on a digit or a space, for example.
    # You might put it on a Latin letter, but in that case there will almost
    # always be a pre-composed version, and we normalize to pre-composed
    # versions first. The cases that can't be pre-composed tend to be in
    # large scripts without case, which are in class C.
    groups.append('[^CM]M')

    # Match non-Latin characters adjacent to Latin characters.
    #
    # This is a simplification from ftfy version 2, which compared all
    # adjacent scripts. However, the ambiguities we need to resolve come from
    # encodings designed to represent Latin characters.
    groups.append('[Ll][AaC]')
    groups.append('[AaC][Ll]')

    # Match C1 control characters, which are almost always the result of
    # decoding Latin-1 that was meant to be Windows-1252.
    groups.append('X')

    # Match private use and unassigned characters.
    groups.append('P')
    groups.append('_')

    # Match adjacent characters from any different pair of these categories:
    # - Modifier marks (M)
    # - Letter modifiers (m)
    # - Miscellaneous numbers (N)
    # - Symbols (0123)

    exclusive_categories = 'MmN0123'
    for cat1 in exclusive_categories:
        others_range = ''.join(c for c in exclusive_categories if c != cat1)
        groups.append('{cat1}[{others_range}]'.format(
            cat1=cat1, others_range=others_range
        ))
    regex = '|'.join('({0})'.format(group) for group in groups)
    return re.compile(regex)

WEIRDNESS_RE = _make_weirdness_regex()

# A few characters are common ending punctuation that can show up at the end
# of a mojibake sequence. It's plausible that such a character could appear
# after an accented capital letter, for example, so we'll want to add a
# slight preference to leave these characters alone.
#
# The match ends with a + so that we only give the bonus once for a
# consecutive sequence of these characters.
ENDING_PUNCT_RE = re.compile(
    '['
    '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
    '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
    '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
    '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
    ']+'
)


def sequence_weirdness(text):
    """
    Determine how often a text has unexpected characters or sequences of
    characters. This metric is used to disambiguate when text should be
    re-decoded or left as is.

    We start by normalizing text in NFC form, so that penalties for
    diacritical marks don't apply to characters that know what to do with
    them.

    The following things are deemed weird:

    - Lowercase letters followed by non-ASCII uppercase letters
    - Non-Latin characters next to Latin characters
    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
      characters (in languages that do that kind of thing a lot) or other
      marks
    - C1 control characters
    - Adjacent symbols from any different pair of these categories:

      - Modifier marks
      - Letter modifiers
      - Non-digit numbers
      - Symbols (including math and currency)

    The return value is the number of instances of weirdness.
    """
    text2 = unicodedata.normalize('NFC', text)
    weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
    punct_discount = len(ENDING_PUNCT_RE.findall(text2))
    return weirdness * 2 - punct_discount


def text_cost(text):
    """
    An overall cost function for text. Weirder is worse, but all else being
    equal, shorter strings are better.

    The overall cost is measured as the "weirdness" (see
    :func:`sequence_weirdness`) plus the length.
    """
    return sequence_weirdness(text) + len(text)
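A hedged illustration of how these functions are meant to be used; exact scores depend on the character-class data, so only the comparisons are asserted:

    from ftfy.badness import text_cost

    # The C1 control character U+0085 (a Windows-1252 ellipsis read as
    # Latin-1) counts as weirdness; the intended '…' gets a punctuation
    # discount instead.
    assert text_cost('caf\x85') > text_cost('caf…')

    # Mojibake is also longer than the text it was meant to be, so the
    # length term alone often favors the fixed version.
    assert text_cost('Ã©tude') > text_cost('étude')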
ftfy/build_data.py
@@ -1,111 +0,0 @@
"""
A script to make the char_classes.dat file.

This never needs to run in normal usage. It needs to be run if the character
classes we care about change, or if a new version of Python supports a new
Unicode standard and we want it to affect our string decoding.

The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
You can certainly use it in earlier versions. This simply makes sure that we
get consistent results from running ftfy on different versions of Python.

The file will be written to the current directory.
"""
from __future__ import unicode_literals
import unicodedata
import sys
import zlib
if sys.hexversion >= 0x03000000:
    unichr = chr

# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
#   = Whitespace
# o = Other


def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    If you run this, run it in Python 3.3 or later. It will run in earlier
    versions, but you won't get the current Unicode standard, leading to
    inconsistent behavior. To protect against this, running this in the
    wrong version of Python will raise an error unless you pass
    `do_it_anyway=True`.
    """
    if sys.hexversion < 0x03030000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.3 or later."
        )

    cclasses = [None] * 0x110000
    for codepoint in range(0x0, 0x110000):
        char = unichr(codepoint)
        category = unicodedata.category(char)

        if category.startswith('L'):  # letters
            is_latin = unicodedata.name(char).startswith('LATIN')
            if is_latin and codepoint < 0x200:
                if category == 'Lu':
                    cclasses[codepoint] = 'L'
                else:
                    cclasses[codepoint] = 'l'
            else:  # non-Latin letter, or close enough
                if category == 'Lu' or category == 'Lt':
                    cclasses[codepoint] = 'A'
                elif category == 'Ll':
                    cclasses[codepoint] = 'a'
                elif category == 'Lo':
                    cclasses[codepoint] = 'C'
                elif category == 'Lm':
                    cclasses[codepoint] = 'm'
                else:
                    raise ValueError('got some weird kind of letter')
        elif category.startswith('M'):  # marks
            cclasses[codepoint] = 'M'
        elif category == 'No':
            cclasses[codepoint] = 'N'
        elif category == 'Sm':
            cclasses[codepoint] = '0'
        elif category == 'Sc':
            cclasses[codepoint] = '1'
        elif category == 'Sk':
            cclasses[codepoint] = '2'
        elif category == 'So':
            cclasses[codepoint] = '3'
        elif category == 'Cn':
            cclasses[codepoint] = '_'
        elif category == 'Cc':
            cclasses[codepoint] = 'X'
        elif category == 'Cs':
            cclasses[codepoint] = 'S'
        elif category == 'Co':
            cclasses[codepoint] = 'P'
        elif category.startswith('Z'):
            cclasses[codepoint] = ' '
        else:
            cclasses[codepoint] = 'o'

    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
    out = open('char_classes.dat', 'wb')
    out.write(zlib.compress(''.join(cclasses).encode('ascii')))
    out.close()

if __name__ == '__main__':
    make_char_data_file()
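An illustrative spot-check of the classification rules above (not part of the original script); each Unicode category shown is the one the script maps to the single-letter class named in the comment:

    import unicodedata

    assert unicodedata.category('A') == 'Lu'       # Latin capital -> 'L'
    assert unicodedata.category('Я') == 'Lu'       # non-Latin capital -> 'A'
    assert unicodedata.category('€') == 'Sc'       # currency symbol -> '1'
    assert unicodedata.category('\u00ad') == 'Cf'  # falls through to 'o'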
ftfy/char_classes.dat: Binary file not shown.
ftfy/chardata.py
@@ -1,81 +0,0 @@
# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""

from __future__ import unicode_literals
import re
import zlib
from pkg_resources import resource_string
from ftfy.compatibility import unichr

# These are the five encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    'latin-1',
    'sloppy-windows-1252',
    'macroman',
    'cp437',
    'sloppy-windows-1251',
]


def _build_regexes():
    """
    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        latin1table = ''.join(unichr(i) for i in range(128, 256))
        charlist = latin1table.encode('latin-1').decode(encoding)

        # Build a regex from the ASCII range, followed by the decodings of
        # bytes 0x80-0xff in this character set. (This uses the fact that all
        # regex special characters are ASCII, and therefore won't appear in the
        # string.)
        regex = '^[\x00-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
ENCODING_REGEXES = _build_regexes()


def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))


CHAR_CLASS_STRING = zlib.decompress(
    resource_string(__name__, 'char_classes.dat')
).decode('ascii')

def chars_to_classes(string):
    """
    Convert each Unicode character to a letter indicating which of many
    classes it's in.

    See build_data.py for where this data comes from and what it means.
    """
    return string.translate(CHAR_CLASS_STRING)


# A translate mapping that will strip all C0 control characters except
# those that represent whitespace.
CONTROL_CHARS = {}
for i in range(32):
    CONTROL_CHARS[i] = None

# Map whitespace control characters to themselves.
for char in '\t\n\f\r':
    del CONTROL_CHARS[ord(char)]
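An illustrative use of `possible_encoding` (not part of the original file):

    from ftfy.chardata import possible_encoding

    assert possible_encoding('plain ASCII', 'ascii')
    assert possible_encoding('café', 'latin-1')      # é is representable
    assert not possible_encoding('文字', 'latin-1')  # CJK text is not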
ftfy/cli.py
@@ -1,34 +0,0 @@
"""
A simple command-line utility for fixing text found in a file.

Because files do not come with their encoding marked, it first runs the file
through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
"""
from ftfy import fix_file

import sys
ENCODE_STDOUT = (sys.hexversion < 0x03000000)


def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='file to transcode')

    args = parser.parse_args()

    file = open(args.filename)
    for line in fix_file(file):
        if ENCODE_STDOUT:
            sys.stdout.write(line.encode('utf-8'))
        else:
            sys.stdout.write(line)


if __name__ == '__main__':
    main()
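With the package importable, the module runs directly as a script via the `__main__` guard above; an illustrative invocation:

    python -m ftfy.cli broken_file.txt > fixed_file.txt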
@ -1,79 +0,0 @@
"""
Makes some function names and behavior consistent between Python 2 and
Python 3, and also between narrow and wide builds.
"""
from __future__ import unicode_literals
import sys
import re
import unicodedata

if sys.hexversion >= 0x03000000:
    from html import entities
    unichr = chr
    xrange = range
    PYTHON2 = False
else:
    import htmlentitydefs as entities
    unichr = unichr
    xrange = xrange
    PYTHON2 = True
htmlentitydefs = entities

PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)


def _narrow_unichr_workaround(codepoint):
    """
    A replacement for unichr() on narrow builds of Python. This will get
    us the narrow representation of an astral character, which will be
    a string of length two, containing two UTF-16 surrogates.
    """
    escaped = b'\\U%08x' % codepoint
    return escaped.decode('unicode-escape')


if sys.maxunicode < 0x10000:
    unichr = _narrow_unichr_workaround
    # In a narrow build of Python, we can't write a regex involving astral
    # characters. If we want to write the regex:
    #
    #     [\U00100000-\U0010ffff]
    #
    # The actual string that defines it quietly turns into:
    #
    #     [\udbc0\udc00-\udbff\udfff]
    #
    # And now the range operator only applies to the middle two characters.
    # It looks like a range that's going backwards from \dc00 to \dbff,
    # which is an error.
    #
    # What we can do instead is rewrite the expression to be _about_ the two
    # surrogates that make up the astral characters, instead of the characters
    # themselves. This would be wrong on a wide build, but it works on a
    # narrow build.
    UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
else:
    UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')


def bytes_to_ints(bytestring):
    """
    No matter what version of Python this is, make a sequence of integers from
    a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a
    sequence of integers.
    """
    if PYTHON2:
        return [ord(b) for b in bytestring]
    else:
        return bytestring


def is_printable(char):
    """
    str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so
    let's make a crude approximation in Python 2.
    """
    if PYTHON2:
        return not unicodedata.category(char).startswith('C')
    else:
        return char.isprintable()
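The narrow-build workaround above leans on the 'unicode-escape' codec; a short sketch of what it produces (the codepoint is an arbitrary astral character chosen for illustration):

escaped = b'\\U%08x' % 0x1F4A9            # -> b'\\U0001f4a9'
char = escaped.decode('unicode-escape')   # the character PILE OF POO
print(len(char))   # 1 on a wide build; 2 (a UTF-16 surrogate pair) on a narrow build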
@ -1,473 +0,0 @@
# -*- coding: utf-8 -*-
"""
This module contains the individual fixes that the main fix_text function
can perform.
"""

from __future__ import unicode_literals
from ftfy.chardata import (possible_encoding,
                           CHARMAP_ENCODINGS, CONTROL_CHARS)
from ftfy.badness import text_cost
from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
import re
import sys
import codecs


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems that were introduced by handling Unicode
incorrectly. It might be able to fix the bytes you just handed it, but the
fact that you just gave a pile of bytes to a function that fixes text means
that your code is *also* handling Unicode incorrectly.

ftfy takes Unicode text as input. You should take these bytes and decode
them from the encoding you think they are in. If you're not sure what encoding
they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

If you're confused by this, please read the Python Unicode HOWTO:

    http://docs.python.org/%d/howto/unicode.html
""" % sys.version_info[0]


def fix_text_encoding(text):
    r"""
    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.

    Something you will find all over the place, in real-world text, is text
    that's mistakenly encoded as utf-8, decoded in some ugly format like
    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.

    This causes your perfectly good Unicode-aware code to end up with garbage
    text because someone else (or maybe "someone else") made a mistake.

    This function looks for the evidence of that having happened and fixes it.
    It determines whether it should replace nonsense sequences of single-byte
    characters that were really meant to be UTF-8 characters, and if so, turns
    them into the correctly-encoded Unicode character that they were meant to
    represent.

    The input to the function must be Unicode. If you don't have Unicode text,
    you're not using the right tool to solve your problem.

    .. note::
        The following examples are written using unmarked literal strings,
        but they are Unicode text. In Python 2 we have "unicode_literals"
        turned on, and in Python 3 this is always the case.

    ftfy decodes text that looks like it was decoded incorrectly. It leaves
    alone text that doesn't.

    >>> print(fix_text_encoding('único'))
    único

    >>> print(fix_text_encoding('This text is fine already :þ'))
    This text is fine already :þ

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

    >>> print(fix_text_encoding('This — should be an em dash'))
    This — should be an em dash

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    0x81 that have no mapping in Windows. This is a string that Python's
    standard `.encode` and `.decode` methods cannot correct.

    >>> print(fix_text_encoding('This text is sad .â\x81”.'))
    This text is sad .⁔.

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

    >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
    not such a fan of Charlotte Brontë…”

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoded, and expecting the encoding to
    be consistent:

    >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
    AHÅ™, the new sofa from IKEA®

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

    >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
    This text was never UTF-8 at all…

    The best version of the text is found using
    :func:`ftfy.badness.text_cost`.
    """
    text, _plan = fix_encoding_and_explain(text)
    return text


def fix_encoding_and_explain(text):
    """
    Re-decodes text that has been decoded incorrectly, and also returns a
    "plan" indicating all the steps required to fix it.

    To fix similar text in the same way, without having to detect anything,
    you can use the ``apply_plan`` function.
    """
    best_version = text
    best_cost = text_cost(text)
    best_plan = []
    plan_so_far = []
    while True:
        prevtext = text
        text, plan = fix_one_step_and_explain(text)
        plan_so_far.extend(plan)
        cost = text_cost(text)

        # Add a penalty if we used a particularly obsolete encoding. The result
        # is that we won't use these encodings unless they can successfully
        # replace multiple characters.
        if ('encode', 'macroman') in plan_so_far or\
           ('encode', 'cp437') in plan_so_far:
            cost += 2

        # We need pretty solid evidence to decode from Windows-1251 (Cyrillic).
        if ('encode', 'sloppy-windows-1251') in plan_so_far:
            cost += 5

        if cost < best_cost:
            best_cost = cost
            best_version = text
            best_plan = list(plan_so_far)
        if text == prevtext:
            return best_version, best_plan


def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded incorrectly.

    Returns the decoded text, plus a "plan" for how to reproduce what it
    did.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
                    decoding = 'utf-8-variants'
                fixed = encoded_bytes.decode(decoding)
                steps = [('encode', encoding), ('decode', decoding)]
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # Those cases are somewhat rare, and impossible to solve without false
    # positives. If you're in one of these situations, you should try using
    # the `ftfy.guess_bytes` function.

    # Return the text unchanged; the plan is empty.
    return text, []


def apply_plan(text, plan):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, encoding), where
    `operation` is either 'encode' or 'decode', and `encoding` is an encoding
    name such as 'utf-8' or 'latin-1'.

    Because only text can be encoded, and only bytes can be decoded, the plan
    should alternate 'encode' and 'decode' steps, or else this function will
    encounter an error.
    """
    obj = text
    for operation, encoding in plan:
        if operation == 'encode':
            obj = obj.encode(encoding)
        elif operation == 'decode':
            obj = obj.decode(encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj


HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")


def unescape_html(text):
    """
    Decode all three types of HTML entities/character references.

    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
    to it for efficiency: it won't match entities longer than 8 characters,
    because there are no valid entities like that.

    >>> print(unescape_html('&lt;tag&gt;'))
    <tag>
    """
    def fixup(match):
        """
        Replace one matched HTML entity with the character it represents,
        if possible.
        """
        text = match.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return HTML_ENTITY_RE.sub(fixup, text)


ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub('', text)


SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication
      protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive
      by misguided software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
      defined by Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly
      not what you meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return text.replace('\r\n', '\n').replace('\r', '\n')\
               .replace('\u2028', '\n').replace('\u2029', '\n')\
               .replace('\u0085', '\n')


def remove_control_chars(text):
    """
    Remove all control characters except for the important ones.

    This removes characters in these ranges:

    - U+0000 to U+0008
    - U+000B
    - U+000E to U+001F
    - U+007F

    It leaves alone these characters that are commonly used for formatting:

    - TAB (U+0009)
    - LF (U+000A)
    - FF (U+000C)
    - CR (U+000D)
    """
    return text.translate(CONTROL_CHARS)


def remove_bom(text):
    r"""
    Remove a left-over byte-order mark.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))


def remove_unsafe_private_use(text):
    r"""
    Python 3.3's Unicode support isn't perfect, and in fact there are certain
    string operations that will crash some versions of it with a SystemError:
    http://bugs.python.org/issue18183

    The best solution is to remove all characters from Supplementary Private
    Use Area B, using a regex that is known not to crash given those
    characters.

    These are the characters from U+100000 to U+10FFFF. It's sad to lose an
    entire plane of Unicode, but on the other hand, these characters are not
    assigned and never will be. If you get one of these characters and don't
    know what its purpose is, its purpose is probably to crash your code.

    If you were using these for actual private use, this might be inconvenient.
    You can turn off this fixer, of course, but I kind of encourage using
    Supplementary Private Use Area A instead.

    >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
    💩

    This fixer is off by default in Python 3.4 or later. (The bug is actually
    fixed in 3.3.3 and 2.7.6, but I don't want the default behavior to change
    based on a micro version upgrade of Python.)
    """
    return UNSAFE_PRIVATE_USE_RE.sub('', text)


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )''', re.UNICODE | re.VERBOSE)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.

    This fix cannot be automatically applied by the `ftfy.fix_text` function,
    because escaped text is not necessarily a mistake, and there is no way
    to distinguish text that's supposed to be escaped from text that isn't.
    """
    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), 'unicode-escape')

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
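For reference, the plan machinery above was typically used like this; the mojibake sample is invented, and the printed plan depends on ftfy's heuristics:

from ftfy.fixes import fix_encoding_and_explain, apply_plan

fixed, plan = fix_encoding_and_explain(u'This â€” should be an em dash')
print(fixed)   # the repaired text, with a real em dash
print(plan)    # e.g. [('encode', 'sloppy-windows-1252'), ('decode', 'utf-8')]

# Replay the same steps on similar text without re-running detection:
print(apply_plan(u'more â€” of the same', plan))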
@ -1,39 +0,0 @@
"""
This file defines a general method for evaluating ftfy using data that arrives
in a stream. A concrete implementation of it is found in `twitter_tester.py`.
"""
from __future__ import print_function, unicode_literals
from ftfy.fixes import fix_text_encoding
from ftfy.chardata import possible_encoding


class StreamTester:
    """
    Take in a sequence of texts, and show the ones that will be changed by
    ftfy. This will also periodically show updates, such as the proportion of
    texts that changed.
    """
    def __init__(self):
        self.num_fixed = 0
        self.count = 0

    def check_ftfy(self, text):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.
        """
        self.count += 1
        if not possible_encoding(text, 'ascii'):
            fixed = fix_text_encoding(text)
            if text != fixed:
                # possibly filter common bots before printing
                print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                    text=text, fixed=fixed
                ))
                self.num_fixed += 1

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
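A minimal sketch of driving the tester by hand (Python 3; the sample strings are invented, and the live Twitter feed below does the same thing at scale):

from ftfy.streamtester import StreamTester

tester = StreamTester()
for text in [u'plain ascii', u'broken â€œquotesâ€']:
    tester.check_ftfy(text)   # prints any text that ftfy would change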
@ -1,73 +0,0 @@
# coding: utf-8
"""
Do what is necessary to authenticate this tester as a Twitter "app", using
somebody's Twitter account.
"""
from __future__ import unicode_literals
import os


AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')


def get_auth():
    """
    Twitter has some bizarre requirements about how to authorize an "app" to
    use its API.

    The user of the app has to log in to get a secret token. That's fine. But
    the app itself has its own "consumer secret" token. The app has to know it,
    and the user of the app has to not know it.

    This is, of course, impossible. It's equivalent to DRM. Your computer can't
    *really* make use of secret information while hiding the same information
    from you.

    The threat appears to be that, if you have this super-sekrit token, you can
    impersonate the app while doing something different. Well, of course you
    can do that, because you *have the source code* and you can change it to do
    what you want. You still have to log in as a particular user who has a
    token that's actually secret, you know.

    Even developers of closed-source applications that use the Twitter API are
    unsure what to do, for good reason. These "secrets" are not secret in any
    cryptographic sense. A bit of Googling shows that the secret tokens for
    every popular Twitter app are already posted on the Web.

    Twitter wants us to pretend this string can be kept secret, and hide this
    secret behind a fig leaf like everybody else does. So that's what we've
    done.
    """

    from twitter.oauth import OAuth
    from twitter import oauth_dance, read_token_file

    def unhide(secret):
        """
        Do something mysterious and exactly as secure as every other Twitter
        app.
        """
        return ''.join([chr(ord(c) - 0x2800) for c in secret])

    fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
    consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'

    if os.path.exists(AUTH_TOKEN_PATH):
        token, token_secret = read_token_file(AUTH_TOKEN_PATH)
    else:
        authdir = os.path.dirname(AUTH_TOKEN_PATH)
        if not os.path.exists(authdir):
            os.makedirs(authdir)
        token, token_secret = oauth_dance(
            app_name='ftfy-tester',
            consumer_key=consumer_key,
            consumer_secret=unhide(fig_leaf),
            token_filename=AUTH_TOKEN_PATH
        )

    return OAuth(
        token=token,
        token_secret=token_secret,
        consumer_key=consumer_key,
        consumer_secret=unhide(fig_leaf)
    )
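For reference, the fig-leaf obfuscation above is just a fixed offset into the Braille Patterns block; a sketch with an invented secret (Python 3, where chr covers the full Unicode range):

def hide(secret):
    # Shift each ASCII character up by 0x2800, landing in the Braille block.
    return ''.join([chr(ord(c) + 0x2800) for c in secret])

def unhide(secret):
    # The inverse shift, as used in the module above.
    return ''.join([chr(ord(c) - 0x2800) for c in secret])

print(hide('hunter2'))           # a run of Braille patterns
print(unhide(hide('hunter2')))   # -> 'hunter2'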
@ -1,89 +0,0 @@
"""
Implements a StreamTester that runs over Twitter data. See the class
docstring.

This module is written for Python 3 only. The __future__ imports you see here
are just to let Python 2 scan the file without crashing with a SyntaxError.
"""
from __future__ import print_function, unicode_literals
import os
from collections import defaultdict
from ftfy.streamtester import StreamTester


class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (defined in `__init__.py`) to
    evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

    - Success: the process takes in mojibake and outputs correct text.
    - False positive: the process takes in correct text, and outputs
      mojibake. Every false positive should be considered a bug, and
      reported on GitHub if it isn't already.
    - Confusion: the process takes in mojibake and outputs different
      mojibake. Not a great outcome, but not as dire as a false
      positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.
        """
        if not os.path.exists(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            langfile = open(fullname, 'a')
            for line in lines:
                print(line.replace('\n', ' '), file=langfile)
            langfile.close()
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.
        """
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth
        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            if 'text' in tweet:
                self.check_ftfy(tweet['text'])
                if 'user' in tweet:
                    lang = tweet['user'].get('lang', 'NONE')
                    self.lines_by_lang[lang].append(tweet['text'])
                if self.count % 10000 == 100:
                    self.save_files()


def main():
    """
    When run from the command line, this script connects to the Twitter stream
    and runs the TwitterTester on it forever. Or at least until the stream
    drops.
    """
    tester = TwitterTester()
    tester.run_sample()


if __name__ == '__main__':
    main()
@ -17,53 +17,71 @@
 # along with SickRage. If not, see <http://www.gnu.org/licenses/>.

 import os
+import traceback

 import sickbeard
 from sickbeard import logger

-import ftfy
-import ftfy.bad_codecs
+import six
+import chardet


 # This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8
 # encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions
 # which return something should always return unicode.

-def fixStupidEncodings(x, silent=False):
-    if type(x) == str:
-        try:
-            return str(ftfy.fix_text(u'' + x)).decode(sickbeard.SYS_ENCODING)
-        except UnicodeDecodeError:
-            logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
-            return x
-        except UnicodeEncodeError:
-            logger.log(u"Unable to encode value: " + repr(x), logger.ERROR)
-            return x
-    elif type(x) == unicode:
-        return x
-    else:
-        logger.log(
-            u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
-            logger.DEBUG if silent else logger.ERROR)
+def toUnicode(x):
+    try:
+        if isinstance(x, unicode):
+            return x
+        else:
+            try:
+                return six.text_type(x)
+            except:
+                try:
+                    if chardet.detect(x).get('encoding') == 'utf-8':
+                        return x.decode('utf-8')
+                    if isinstance(x, str):
+                        try:
+                            return x.decode(sickbeard.SYS_ENCODING)
+                        except UnicodeDecodeError:
+                            raise
+                    return x
+                except:
+                    raise
+    except:
+        logger.log('Unable to decode value "%s..." : %s ' % (repr(x)[:20], traceback.format_exc()), logger.WARNING)
+        ascii_text = str(x).encode('string_escape')
+        return toUnicode(ascii_text)
+
+
+def ss(x):
+    u_x = toUnicode(x)
+
+    try:
+        return u_x.encode(sickbeard.SYS_ENCODING)
+    except Exception as e:
+        logger.log('Failed ss encoding char, force UTF8: %s' % e, logger.WARNING)
+        try:
+            return u_x.encode(sickbeard.SYS_ENCODING, 'replace')
+        except:
+            return u_x.encode('utf-8', 'replace')
+

 def fixListEncodings(x):
-    if type(x) != list and type(x) != tuple:
+    if not isinstance(x, (list, tuple)):
         return x
     else:
-        return filter(lambda x: x != None, map(fixStupidEncodings, x))
+        return filter(lambda x: x != None, map(toUnicode, x))


 def ek(func, *args, **kwargs):
     if os.name == 'nt':
         result = func(*args, **kwargs)
     else:
-        result = func(
-            *[fixStupidEncodings(x).encode(sickbeard.SYS_ENCODING) if type(x) in (str, unicode) else x for x in args],
-            **kwargs)
+        result = func(*[ss(x) if isinstance(x, (str, unicode)) else x for x in args], **kwargs)

-    if type(result) in (list, tuple):
+    if isinstance(result, (list, tuple)):
         return fixListEncodings(result)
-    elif type(result) == str:
-        return fixStupidEncodings(result)
+    elif isinstance(result, str):
+        return toUnicode(result)
     else:
         return result
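For reference, the intended call pattern for the reworked kludge; the path and show title below are invented:

from sickbeard import encodingKludge as ek
import os

title = ek.toUnicode('Show Name')    # bytes or unicode in, unicode out
syspath = ek.ss(u'/tmp/Show Name')   # unicode in, SYS_ENCODING bytes out
files = ek.ek(os.listdir, u'/tmp')   # wrap an os call: args pass through ss(),
                                     # results come back through toUnicode()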
@ -16,7 +16,7 @@
 # You should have received a copy of the GNU General Public License
 # along with SickRage. If not, see <http://www.gnu.org/licenses/>.

-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 def ex(e):
     """
@ -32,11 +32,11 @@ def ex(e):

         if arg is not None:
             if isinstance(arg, (str, unicode)):
-                fixed_arg = fixStupidEncodings(arg, True)
+                fixed_arg = toUnicode(arg, True)

             else:
                 try:
-                    fixed_arg = u"error " + fixStupidEncodings(str(arg), True)
+                    fixed_arg = u"error " + toUnicode(str(arg), True)

                 except:
                     fixed_arg = None
@ -26,7 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
 from sickbeard.history import dateFormat
 from sickbeard.common import Quality
 from sickbeard.common import WANTED, FAILED
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 def prepareFailedName(release):
     """Standardizes release name for failed DB"""
@ -36,7 +36,7 @@ def prepareFailedName(release):
     fixed = fixed.rpartition(".")[0]

     fixed = re.sub("[\.\-\+\ ]", "_", fixed)
-    fixed = fixStupidEncodings(fixed)
+    fixed = toUnicode(fixed)

     return fixed

@ -20,7 +20,7 @@ import db
 import datetime

 from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 dateFormat = "%Y%m%d%H%M%S"
@ -28,7 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"

 def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
     logDate = datetime.datetime.today().strftime(dateFormat)
-    resource = fixStupidEncodings(resource)
+    resource = toUnicode(resource)

     myDB = db.DBConnection()
     myDB.action(
@ -29,7 +29,7 @@ import sickbeard
|
|||||||
|
|
||||||
from sickbeard import logger, common
|
from sickbeard import logger, common
|
||||||
from sickbeard import db
|
from sickbeard import db
|
||||||
from sickbeard.encodingKludge import fixStupidEncodings
|
from sickbeard.encodingKludge import toUnicode
|
||||||
from sickbeard.exceptions import ex
|
from sickbeard.exceptions import ex
|
||||||
|
|
||||||
|
|
||||||
@ -51,7 +51,7 @@ class EmailNotifier:
|
|||||||
ep_name: The name of the episode that was snatched
|
ep_name: The name of the episode that was snatched
|
||||||
title: The title of the notification (optional)
|
title: The title of the notification (optional)
|
||||||
"""
|
"""
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
|
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
|
||||||
show = self._parseEp(ep_name)
|
show = self._parseEp(ep_name)
|
||||||
@ -86,7 +86,7 @@ class EmailNotifier:
|
|||||||
ep_name: The name of the episode that was downloaded
|
ep_name: The name of the episode that was downloaded
|
||||||
title: The title of the notification (optional)
|
title: The title of the notification (optional)
|
||||||
"""
|
"""
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
|
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
|
||||||
show = self._parseEp(ep_name)
|
show = self._parseEp(ep_name)
|
||||||
@ -121,7 +121,7 @@ class EmailNotifier:
|
|||||||
ep_name: The name of the episode that was downloaded
|
ep_name: The name of the episode that was downloaded
|
||||||
lang: Subtitle language wanted
|
lang: Subtitle language wanted
|
||||||
"""
|
"""
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
|
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
|
||||||
show = self._parseEp(ep_name)
|
show = self._parseEp(ep_name)
|
||||||
@ -198,7 +198,7 @@ class EmailNotifier:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def _parseEp(self, ep_name):
|
def _parseEp(self, ep_name):
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
sep = " - "
|
sep = " - "
|
||||||
titles = ep_name.split(sep)
|
titles = ep_name.split(sep)
|
||||||
|
@ -25,7 +25,7 @@ import sickbeard
 from sickbeard import logger
 from sickbeard import common
 from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 from sickbeard.notifiers.xbmc import XBMCNotifier

@ -26,7 +26,7 @@ import sickbeard
 from sickbeard import logger
 from sickbeard import common
 from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 try:
@ -236,9 +236,9 @@ class XBMCNotifier:
             base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
             authheader = "Basic %s" % base64string
             req.add_header("Authorization", authheader)
-            logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
         else:
-            logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)

         response = urllib2.urlopen(req)
         result = response.read().decode(sickbeard.SYS_ENCODING)
@ -248,7 +248,7 @@ class XBMCNotifier:
             return result

         except (urllib2.URLError, IOError), e:
-            logger.log(u"Warning: Couldn't contact XBMC HTTP at " + fixStupidEncodings(url) + " " + ex(e),
+            logger.log(u"Warning: Couldn't contact XBMC HTTP at " + toUnicode(url) + " " + ex(e),
                        logger.WARNING)
             return False

@ -379,9 +379,9 @@ class XBMCNotifier:
             base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
             authheader = "Basic %s" % base64string
             req.add_header("Authorization", authheader)
-            logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
         else:
-            logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)

         try:
             response = urllib2.urlopen(req)
@ -401,7 +401,7 @@ class XBMCNotifier:
             return False

         except IOError, e:
-            logger.log(u"Warning: Couldn't contact XBMC JSON API at " + fixStupidEncodings(url) + " " + ex(e),
+            logger.log(u"Warning: Couldn't contact XBMC JSON API at " + toUnicode(url) + " " + ex(e),
                        logger.WARNING)
             return False

@ -29,7 +29,7 @@ from sickbeard import encodingKludge as ek
 from sickbeard.exceptions import ex

 from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 def getSeasonNZBs(name, urlData, season):
@ -85,7 +85,7 @@ def createNZBString(fileElements, xmlns):
     for curFile in fileElements:
         rootElement.append(stripNS(curFile, xmlns))

-    return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))
+    return xml.etree.ElementTree.tostring(toUnicode(rootElement))


 def saveNZB(nzbName, nzbString):
@ -27,7 +27,7 @@ from sickbeard import helpers
 from sickbeard import name_cache
 from sickbeard import logger
 from sickbeard import db
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 exception_dict = {}
 anidb_exception_dict = {}
@ -234,7 +234,7 @@ def retrieve_exceptions():
                # if this exception isn't already in the DB then add it
                if cur_exception not in existing_exceptions:

-                    cur_exception = fixStupidEncodings(cur_exception)
+                    cur_exception = toUnicode(cur_exception)

                    myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                                [cur_indexer_id, cur_exception, curSeason])
@ -267,7 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
     exceptionsCache[indexer_id][season] = scene_exceptions

     for cur_exception in scene_exceptions:
-        cur_exception = fixStupidEncodings(cur_exception)
+        cur_exception = toUnicode(cur_exception)

         myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                     [indexer_id, cur_exception, season])
@ -234,7 +234,7 @@ def isGoodResult(name, show, log=True, season=-1):

     all_show_names = allPossibleShowNames(show, season=season)
     showNames = map(sanitizeSceneName, all_show_names) + all_show_names
-    showNames += map(unidecode, all_show_names)
+    showNames += map(ek.toUnicode, all_show_names)

     for curName in set(showNames):
         if not show.is_anime:
@ -33,7 +33,7 @@ from sickbeard.exceptions import AuthException
 from sickbeard.rssfeeds import RSSFeeds
 from sickbeard import clients
 from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 class CacheDBConnection(db.DBConnection):
     def __init__(self, providerName):
@ -263,7 +263,7 @@ class TVCache():
         # get quality of release
         quality = parse_result.quality

-        name = fixStupidEncodings(name)
+        name = toUnicode(name)

         # get release group
         release_group = parse_result.release_group
@ -3288,7 +3288,7 @@ class ErrorLogs(MainHandler):

         for x in reversed(data):

-            x = ek.fixStupidEncodings(x)
+            x = ek.toUnicode(x)
             match = re.match(regex, x)

             if match:
@ -18,23 +18,27 @@
 # You should have received a copy of the GNU General Public License
 # along with SickRage. If not, see <http://www.gnu.org/licenses/>.

-if __name__ == "__main__":
-    import glob
-    import unittest
-    import sys
-
-    test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
-    module_strings = [file_string[0:len(file_string) - 3] for file_string in test_file_strings]
-    suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in module_strings]
-    testSuite = unittest.TestSuite(suites)
-
-    print "=================="
-    print "STARTING - ALL TESTS"
-    print "=================="
-    print "this will include"
-    for includedfiles in test_file_strings:
-        print "- " + includedfiles
-
-    text_runner = unittest.TextTestRunner().run(testSuite)
-    if not text_runner.wasSuccessful():
-        sys.exit(-1)
+import glob
+import unittest
+import sys
+
+
+class AllTests(unittest.TestCase):
+    def setUp(self):
+        self.test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
+        self.module_strings = [file_string[0:len(file_string) - 3] for file_string in self.test_file_strings]
+        self.suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in self.module_strings]
+        self.testSuite = unittest.TestSuite(self.suites)
+
+    def testAll(self):
+        print "=================="
+        print "STARTING - ALL TESTS"
+        print "=================="
+        for includedfiles in self.test_file_strings:
+            print "- " + includedfiles
+
+        text_runner = unittest.TextTestRunner().run(self.testSuite)
+        if not text_runner.wasSuccessful():
+            sys.exit(-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
@ -8,7 +8,6 @@ sys.path.append(os.path.abspath('../lib'))

 from sickbeard import common

-
 class QualityTests(unittest.TestCase):

     # TODO: repack / proper ? air-by-date ? season rip? multi-ep?
@ -51,7 +51,6 @@ EPISODE = 2
 FILENAME = u"show name - s0" + str(SEASON) + "e0" + str(EPISODE) + ".mkv"
 FILEDIR = os.path.join(TESTDIR, SHOWNAME)
 FILEPATH = os.path.join(FILEDIR, FILENAME)

 SHOWDIR = os.path.join(TESTDIR, SHOWNAME + " final")
-
 #sickbeard.logger.sb_log_instance = sickbeard.logger.SBRotatingLogHandler(os.path.join(TESTDIR, 'sickbeard.log'), sickbeard.logger.NUM_LOGS, sickbeard.logger.LOG_SIZE)