Removed FTFY, python 2.6 compatibility issues.

Re-coded encodingKludge encode/decode for unicode <-> utf-8
2024-12-12 02:52:20 -05:00 · 2014-11-25 17:22:31 -08:00 · 2014-11-25 17:22:31 -08:00 · 360c3afa08
commit 360c3afa08
parent 468af14dfd
29 changed files with 95 additions and 2080 deletions
--- a/lib/ftfy/__init__.py
+++ b/lib/ftfy/__init__.py
@@ -1,351 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-ftfy: fixes text for you
-
-This is a module for making text less broken. See the `fix_text` function
-for more information.
-"""
-
-from __future__ import unicode_literals
-
-# See the docstring for ftfy.bad_codecs to see what we're doing here.
-import ftfy.bad_codecs
-ftfy.bad_codecs.ok()
-
-from ftfy import fixes
-from ftfy.fixes import fix_text_encoding
-from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
-import unicodedata
-import warnings
-
-
-def fix_text(text,
-             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
-             fix_entities='auto',
-             remove_terminal_escapes=True,
-             fix_encoding=True,
-             normalization='NFKC',
-             uncurl_quotes=True,
-             fix_line_breaks=True,
-             remove_control_chars=True,
-             remove_bom=True,
-             max_decode_length=2**16):
-    r"""
-    Given Unicode text as input, make its representation consistent and
-    possibly less broken.
-
-    Let's start with some examples:
-
-        >>> print(fix_text('uÌˆnicode'))
-        ünicode
-
-        >>> print(fix_text('Broken text&hellip; it&#x2019;s ﬂubberiﬁc!'))
-        Broken text... it's flubberific!
-
-        >>> print(fix_text('HTML entities &lt;3'))
-        HTML entities <3
-
-        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
-        <em>HTML entities &lt;3</em>
-
-        >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
-        ...               'doo&#133;\033[0m'))
-        I'm blue, da ba dee da ba doo...
-
-        >>> # This example string starts with a byte-order mark, even if
-        >>> # you can't see it on the Web.
-        >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
-        Party like
-        it's 1999!
-
-        >>> len(fix_text('ﬁ' * 100000))
-        200000
-
-        >>> len(fix_text(''))
-        0
-
-    Based on the options you provide, ftfy applies these steps in order:
-
-    - If `remove_unsafe_private_use` is True, it removes a range of private-use
-      characters that could trigger a Python bug. The bug is fixed in
-      the most recent versions of Python, so this will default to False
-      starting on Python 3.4.
-    - If `fix_entities` is True, replace HTML entities with their equivalent
-      characters. If it's "auto" (the default), then consider replacing HTML
-      entities, but don't do so in text where you have seen a pair of actual
-      angle brackets (that's probably actually HTML and you shouldn't mess
-      with the entities).
-    - If `remove_terminal_escapes` is True, remove sequences of bytes that are
-      instructions for Unix terminals, such as the codes that make text appear
-      in different colors.
-    - If `fix_encoding` is True, look for common mistakes that come from
-      encoding or decoding Unicode text incorrectly, and fix them if they are
-      reasonably fixable. See `fix_text_encoding` for details.
-    - If `normalization` is not None, apply the specified form of Unicode
-      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
-      The default, 'NFKC', applies the following relevant transformations:
-
-      - C: Combine characters and diacritics that are written using separate
-        code points, such as converting "e" plus an acute accent modifier
-        into "é", or converting "ka" (か) plus a dakuten into the
-        single character "ga" (が).
-      - K: Replace characters that are functionally equivalent with the most
-        common form. For example, half-width katakana will be replaced with
-        full-width versions, full-width Roman characters will be replaced with
-        ASCII characters, ellipsis characters will be replaced with three
-        periods, and the ligature 'ﬂ' will be replaced with 'fl'.
-
-    - If `uncurl_quotes` is True, replace various curly quotation marks with
-      plain-ASCII straight quotes.
-    - If `fix_line_breaks` is true, convert all line breaks to Unix style
-      (CRLF and CR line breaks become LF line breaks).
-    - If `fix_control_characters` is true, remove all C0 control characters
-      except the common useful ones: TAB, CR, LF, and FF. (CR characters
-      may have already been removed by the `fix_line_breaks` step.)
-    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
-    - If anything was changed, repeat all the steps, so that the function is
-      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".
-
-    `fix_text` will work one line at a time, with the possibility that some
-    lines are in different encodings. When it encounters lines longer than
-    `max_decode_length`, it will not run the `fix_encoding` step, to avoid
-    unbounded slowdowns.
-
-    If you are certain your entire text is in the same encoding (though that
-    encoding is possibly flawed), and do not mind performing operations on
-    the whole text at once, use `fix_text_segment`.
-    """
-    if isinstance(text, bytes):
-        raise UnicodeError(fixes.BYTES_ERROR_TEXT)
-
-    out = []
-    pos = 0
-    while pos < len(text):
-        textbreak = text.find('\n', pos) + 1
-        fix_encoding_this_time = fix_encoding
-        if textbreak == 0:
-            textbreak = len(text)
-        if (textbreak - pos) > max_decode_length:
-            fix_encoding_this_time = False
-
-        substring = text[pos:textbreak]
-
-        if fix_entities == 'auto' and '<' in substring and '>' in substring:
-            # we see angle brackets together; this could be HTML
-            fix_entities = False
-
-        out.append(
-            fix_text_segment(
-                substring,
-                remove_unsafe_private_use=remove_unsafe_private_use,
-                fix_entities=fix_entities,
-                remove_terminal_escapes=remove_terminal_escapes,
-                fix_encoding=fix_encoding_this_time,
-                normalization=normalization,
-                uncurl_quotes=uncurl_quotes,
-                fix_line_breaks=fix_line_breaks,
-                remove_control_chars=remove_control_chars,
-                remove_bom=remove_bom
-            )
-        )
-        pos = textbreak
-
-    return ''.join(out)
-
-ftfy = fix_text
-
-
-def fix_file(input_file,
-             remove_unsafe_private_use=True,
-             fix_entities='auto',
-             remove_terminal_escapes=True,
-             fix_encoding=True,
-             normalization='NFKC',
-             uncurl_quotes=True,
-             fix_line_breaks=True,
-             remove_control_chars=True,
-             remove_bom=True):
-    """
-    Fix text that is found in a file.
-
-    If the file is being read as Unicode text, use that. If it's being read as
-    bytes, then unfortunately, we have to guess what encoding it is. We'll try
-    a few common encodings, but we make no promises. See the `guess_bytes`
-    function for how this is done.
-
-    The output is a stream of fixed lines of text.
-    """
-    entities = fix_entities
-    for line in input_file:
-        if isinstance(line, bytes):
-            line, encoding = guess_bytes(line)
-        if fix_entities == 'auto' and '<' in line and '>' in line:
-            entities = False
-        yield fix_text_segment(
-            line,
-            remove_unsafe_private_use=remove_unsafe_private_use,
-            fix_entities=entities,
-            remove_terminal_escapes=remove_terminal_escapes,
-            fix_encoding=fix_encoding,
-            normalization=normalization,
-            uncurl_quotes=uncurl_quotes,
-            fix_line_breaks=fix_line_breaks,
-            remove_control_chars=remove_control_chars,
-            remove_bom=remove_bom
-        )
-
-
-def fix_text_segment(text,
-                     remove_unsafe_private_use=True,
-                     fix_entities='auto',
-                     remove_terminal_escapes=True,
-                     fix_encoding=True,
-                     normalization='NFKC',
-                     uncurl_quotes=True,
-                     fix_line_breaks=True,
-                     remove_control_chars=True,
-                     remove_bom=True):
-    """
-    Apply fixes to text in a single chunk. This could be a line of text
-    within a larger run of `fix_text`, or it could be a larger amount
-    of text that you are certain is all in the same encoding.
-
-    See `fix_text` for a description of the parameters.
-    """
-    if isinstance(text, bytes):
-        raise UnicodeError(fixes.BYTES_ERROR_TEXT)
-
-    if fix_entities == 'auto' and '<' in text and '>' in text:
-        fix_entities = False
-    while True:
-        origtext = text
-        if remove_unsafe_private_use:
-            text = fixes.remove_unsafe_private_use(text)
-        if fix_entities:
-            text = fixes.unescape_html(text)
-        if remove_terminal_escapes:
-            text = fixes.remove_terminal_escapes(text)
-        if fix_encoding:
-            text = fixes.fix_text_encoding(text)
-        if normalization is not None:
-            text = unicodedata.normalize(normalization, text)
-        if uncurl_quotes:
-            text = fixes.uncurl_quotes(text)
-        if fix_line_breaks:
-            text = fixes.fix_line_breaks(text)
-        if remove_control_chars:
-            text = fixes.remove_control_chars(text)
-        if remove_bom:
-            text = fixes.remove_bom(text)
-        if text == origtext:
-            return text
-
-
-def guess_bytes(bstring):
-    """
-    If you have some bytes in an unknown encoding, here's a reasonable
-    strategy for decoding them, by trying a few common encodings that
-    can be distinguished from each other.
-
-    This is not a magic bullet. If the bytes are coming from some MySQL
-    database with the "character set" set to ISO Elbonian, this won't figure
-    it out. Perhaps more relevantly, this currently doesn't try East Asian
-    encodings.
-
-    The encodings we try are:
-
-    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
-      like nothing else
-    - UTF-8, because it's the global de facto standard
-    - "utf-8-variants", because it's what people actually implement when they
-      think they're doing UTF-8
-    - MacRoman, because Microsoft Office thinks it's still a thing, and it
-      can be distinguished by its line breaks. (If there are no line breaks in
-      the string, though, you're out of luck.)
-    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
-      single-byte encoding
-    """
-    if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
-        return bstring.decode('utf-16'), 'utf-16'
-
-    byteset = set(bytes(bstring))
-    byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'
-
-    try:
-        if byte_ed in byteset or byte_c0 in byteset:
-            # Byte 0xed can be used to encode a range of codepoints that
-            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
-            # so when we see 0xed, it's very likely we're being asked to
-            # decode CESU-8, the variant that encodes UTF-16 surrogates
-            # instead of the original characters themselves.
-            #
-            # This will occasionally trigger on standard UTF-8, as there
-            # are some Korean characters that also use byte 0xed, but that's
-            # not harmful.
-            #
-            # Byte 0xc0 is impossible because, numerically, it would only
-            # encode characters lower than U+0040. Those already have
-            # single-byte representations, and UTF-8 requires using the
-            # shortest possible representation. However, Java hides the null
-            # codepoint, U+0000, in a non-standard longer representation -- it
-            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
-            # will never appear in the encoded bytes.
-            #
-            # The 'utf-8-variants' decoder can handle both of these cases, as
-            # well as standard UTF-8, at the cost of a bit of speed.
-            return bstring.decode('utf-8-variants'), 'utf-8-variants'
-        else:
-            return bstring.decode('utf-8'), 'utf-8'
-    except UnicodeDecodeError:
-        pass
-
-    if byte_CR in bstring and byte_LF not in bstring:
-        return bstring.decode('macroman'), 'macroman'
-    else:
-        return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
-
-
-def explain_unicode(text):
-    """
-    A utility method that's useful for debugging mysterious Unicode.
-
-    It breaks down a string, showing you for each codepoint its number in
-    hexadecimal, its glyph, its category in the Unicode standard, and its name
-    in the Unicode standard.
-
-        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
-        U+0028  (       [Ps] LEFT PARENTHESIS
-        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
-        U+00B0  °       [So] DEGREE SIGN
-        U+25A1  □       [So] WHITE SQUARE
-        U+00B0  °       [So] DEGREE SIGN
-        U+0029  )       [Pe] RIGHT PARENTHESIS
-        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
-        U+FE35  ︵       [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
-        U+0020          [Zs] SPACE
-        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
-        U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
-        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
-    """
-    for char in text:
-        if is_printable(char):
-            display = char
-        else:
-            display = char.encode('unicode-escape').decode('ascii')
-        print('U+{code:04X}  {display:<7} [{category}] {name}'.format(
-            display=display,
-            code=ord(char),
-            category=unicodedata.category(char),
-            name=unicodedata.name(char, '<unknown>')
-        ))
-
-
-def fix_bad_encoding(text):
-    """
-    Kept for compatibility with previous versions of ftfy.
-    """
-    warnings.warn(
-        'fix_bad_encoding is now known as fix_text_encoding',
-        DeprecationWarning
-    )
-    return fix_text_encoding(text)
--- a/lib/ftfy/bad_codecs/__init__.py
+++ b/lib/ftfy/bad_codecs/__init__.py
@@ -1,94 +0,0 @@
-# coding: utf-8
-r"""
-Give Python the ability to decode some common, flawed encodings.
-
-Python does not want you to be sloppy with your text. Its encoders and decoders
-("codecs") follow the relevant standards whenever possible, which means that
-when you get text that *doesn't* follow those standards, you'll probably fail
-to decode it. Or you might succeed at decoding it for implementation-specific
-reasons, which is perhaps worse.
-
-There are some encodings out there that Python wishes didn't exist, which are
-widely used outside of Python:
-
-- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
-  ever-popular CESU-8 and "Java modified UTF-8".
-- "Sloppy" versions of character map encodings, where bytes that don't map to
-  anything will instead map to the Unicode character with the same number.
-
-Simply importing this module, or in fact any part of the `ftfy` package, will
-make these new "bad codecs" available to Python through the standard Codecs
-API. You never have to actually call any functions inside `ftfy.bad_codecs`.
-
-However, if you want to call something because your code checker insists on it,
-you can call ``ftfy.bad_codecs.ok()``.
-
-A quick example of decoding text that's encoded in CESU-8:
-
-    >>> import ftfy.bad_codecs
-    >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
-    😍
-"""
-from __future__ import unicode_literals
-from encodings import normalize_encoding
-import codecs
-
-_CACHE = {}
-
-# Define some aliases for 'utf-8-variants'. All hyphens get turned into
-# underscores, because of `normalize_encoding`.
-UTF8_VAR_NAMES = (
-    'utf_8_variants', 'utf8_variants',
-    'utf_8_variant', 'utf8_variant',
-    'utf_8_var', 'utf8_var',
-    'cesu_8', 'cesu8',
-    'java_utf_8', 'java_utf8'
-)
-
-
-def search_function(encoding):
-    """
-    Register our "bad codecs" with Python's codecs API. This involves adding
-    a search function that takes in an encoding name, and returns a codec
-    for that encoding if it knows one, or None if it doesn't.
-
-    The encodings this will match are:
-
-    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
-      where the non-sloppy version is an encoding that leaves some bytes
-      unmapped to characters.
-    - The 'utf-8-variants' encoding, which has the several aliases seen
-      above.
-    """
-    if encoding in _CACHE:
-        return _CACHE[encoding]
-
-    norm_encoding = normalize_encoding(encoding)
-    codec = None
-    if norm_encoding in UTF8_VAR_NAMES:
-        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
-        codec = CODEC_INFO
-    elif norm_encoding.startswith('sloppy_'):
-        from ftfy.bad_codecs.sloppy import CODECS
-        codec = CODECS.get(norm_encoding)
-
-    if codec is not None:
-        _CACHE[encoding] = codec
-
-    return codec
-
-
-def ok():
-    """
-    A feel-good function that gives you something to call after importing
-    this package.
-
-    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
-    and appear not to use it. It doesn't know that you're using it when
-    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
-    encodings.
-    """
-    pass
-
-
-codecs.register(search_function)
--- a/lib/ftfy/bad_codecs/sloppy.py
+++ b/lib/ftfy/bad_codecs/sloppy.py
@@ -1,156 +0,0 @@
-# coding: utf-8
-r"""
-Decodes single-byte encodings, filling their "holes" in the same messy way that
-everyone else does.
-
-A single-byte encoding maps each byte to a Unicode character, except that some
-bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
-example, bytes 0x81 and 0x8D, among others, have no meaning.
-
-Python, wanting to preserve some sense of decorum, will handle these bytes
-as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
-different from each other. It just hasn't defined what they are in terms of
-Unicode.
-
-Software that has to interoperate with Windows-1252 and Unicode -- such as all
-the common Web browsers -- will pick some Unicode characters for them to map
-to, and the characters they pick are the Unicode characters with the same
-numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
-resulting characters tend to fall into a range of Unicode that's set aside for
-obselete Latin-1 control characters anyway.
-
-These sloppy codecs let Python do the same thing, thus interoperating with
-other software that works this way. It defines a sloppy version of many
-single-byte encodings with holes. (There is no need for a sloppy version of
-an encoding without holes: for example, there is no such thing as
-sloppy-iso-8859-2 or sloppy-macroman.)
-
-The following encodings will become defined:
-
-- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
-- sloppy-windows-1251 (Cyrillic)
-- sloppy-windows-1252 (Western European, based on Latin-1)
-- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
-- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
-- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
-- sloppy-windows-1256 (Arabic)
-- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
-- sloppy-windows-1258 (Vietnamese)
-- sloppy-cp874 (Thai, based on ISO-8859-11)
-- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
-- sloppy-iso-8859-6 (different Arabic)
-- sloppy-iso-8859-7 (Greek)
-- sloppy-iso-8859-8 (Hebrew)
-- sloppy-iso-8859-11 (Thai)
-
-Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
-defined.
-
-Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
-the rest are rather uncommon.
-
-Here are some examples, using `ftfy.explain_unicode` to illustrate how
-sloppy-windows-1252 merges Windows-1252 with Latin-1:
-
-    >>> from ftfy import explain_unicode
-    >>> some_bytes = b'\x80\x81\x82'
-    >>> explain_unicode(some_bytes.decode('latin-1'))
-    U+0080  \x80    [Cc] <unknown>
-    U+0081  \x81    [Cc] <unknown>
-    U+0082  \x82    [Cc] <unknown>
-
-    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
-    U+20AC  €       [Sc] EURO SIGN
-    U+FFFD  �       [So] REPLACEMENT CHARACTER
-    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
-
-    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
-    U+20AC  €       [Sc] EURO SIGN
-    U+0081  \x81    [Cc] <unknown>
-    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
-"""
-from __future__ import unicode_literals
-import codecs
-from encodings import normalize_encoding
-
-REPLACEMENT_CHAR = '\ufffd'
-
-
-def make_sloppy_codec(encoding):
-    """
-    Take a codec name, and return a 'sloppy' version of that codec that can
-    encode and decode the unassigned bytes in that encoding.
-
-    Single-byte encodings in the standard library are defined using some
-    boilerplate classes surrounding the functions that do the actual work,
-    `codecs.charmap_decode` and `charmap_encode`. This function, given an
-    encoding name, *defines* those boilerplate classes.
-    """
-    # Make an array of all 256 possible bytes.
-    all_bytes = bytearray(range(256))
-
-    # Get a list of what they would decode to in Latin-1.
-    sloppy_chars = list(all_bytes.decode('latin-1'))
-
-    # Get a list of what they decode to in the given encoding. Use the
-    # replacement character for unassigned bytes.
-    decoded_chars = all_bytes.decode(encoding, 'replace')
-
-    # Update the sloppy_chars list. Each byte that was successfully decoded
-    # gets its decoded value in the list. The unassigned bytes are left as
-    # they are, which gives their decoding in Latin-1.
-    for i, char in enumerate(decoded_chars):
-        if char != REPLACEMENT_CHAR:
-            sloppy_chars[i] = char
-
-    # Create the data structures that tell the charmap methods how to encode
-    # and decode in this sloppy encoding.
-    decoding_table = ''.join(sloppy_chars)
-    encoding_table = codecs.charmap_build(decoding_table)
-
-    # Now produce all the class boilerplate. Look at the Python source for
-    # `encodings.cp1252` for comparison; this is almost exactly the same,
-    # except I made it follow pep8.
-    class Codec(codecs.Codec):
-        def encode(self, input, errors='strict'):
-            return codecs.charmap_encode(input, errors, encoding_table)
-
-        def decode(self, input, errors='strict'):
-            return codecs.charmap_decode(input, errors, decoding_table)
-
-    class IncrementalEncoder(codecs.IncrementalEncoder):
-        def encode(self, input, final=False):
-            return codecs.charmap_encode(input, self.errors, encoding_table)[0]
-
-    class IncrementalDecoder(codecs.IncrementalDecoder):
-        def decode(self, input, final=False):
-            return codecs.charmap_decode(input, self.errors, decoding_table)[0]
-
-    class StreamWriter(Codec, codecs.StreamWriter):
-        pass
-
-    class StreamReader(Codec, codecs.StreamReader):
-        pass
-
-    return codecs.CodecInfo(
-        name='sloppy-' + encoding,
-        encode=Codec().encode,
-        decode=Codec().decode,
-        incrementalencoder=IncrementalEncoder,
-        incrementaldecoder=IncrementalDecoder,
-        streamreader=StreamReader,
-        streamwriter=StreamWriter,
-    )
-
-# Define a codec for each incomplete encoding. The resulting CODECS dictionary
-# can be used by the main module of ftfy.bad_codecs.
-CODECS = {}
-INCOMPLETE_ENCODINGS = (
-    ['windows-%s' % num for num in range(1250, 1259)] +
-    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
-    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
-)
-
-for _encoding in INCOMPLETE_ENCODINGS:
-    _new_name = normalize_encoding('sloppy-' + _encoding)
-    CODECS[_new_name] = make_sloppy_codec(_encoding)
--- a/lib/ftfy/bad_codecs/utf8_variants.py
+++ b/lib/ftfy/bad_codecs/utf8_variants.py
@@ -1,281 +0,0 @@
-r"""
-This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
-decode text that's been encoded with a popular non-standard version of UTF-8.
-This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
-UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
-codepoint 0.
-
-This is particularly relevant in Python 3, which provides no other way of
-decoding CESU-8 or Java's encoding. [1]
-
-The easiest way to use the codec is to simply import `ftfy.bad_codecs`:
-
-    >>> import ftfy.bad_codecs
-    >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
-    >>> print(repr(result).lstrip('u'))
-    'here comes a null! \x00'
-
-The codec does not at all enforce "correct" CESU-8. For example, the Unicode
-Consortium's not-quite-standard describing CESU-8 requires that there is only
-one possible encoding of any character, so it does not allow mixing of valid
-UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
-decoder does.
-
-Characters in the Basic Multilingual Plane still have only one encoding. This
-codec still enforces the rule, within the BMP, that characters must appear in
-their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
-instead of just `0x00`, may be used to encode the null character `U+0000`, like
-in Java.
-
-If you encode with this codec, you get legitimate UTF-8. Decoding with this
-codec and then re-encoding is not idempotent, although encoding and then
-decoding is. So this module won't produce CESU-8 for you. Look for that
-functionality in the sister module, "Breaks Text For You", coming approximately
-never.
-
-[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
-decode the bytes (incorrectly), then encode them, then decode them again, using
-UTF-8 as the codec every time.
-"""
-
-from __future__ import unicode_literals
-from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
-from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
-                             IncrementalEncoder as UTF8IncrementalEncoder)
-import re
-import codecs
-
-NAME = 'utf-8-variants'
-# This regular expression matches all possible six-byte CESU-8 sequences.
-CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')
-
-
-class IncrementalDecoder(UTF8IncrementalDecoder):
-    """
-    An incremental decoder that extends Python's built-in UTF-8 decoder.
-
-    This encoder needs to take in bytes, possibly arriving in a stream, and
-    output the correctly decoded text. The general strategy for doing this
-    is to fall back on the real UTF-8 decoder whenever possible, because
-    the real UTF-8 decoder is way optimized, but to call specialized methods
-    we define here for the cases the real encoder isn't expecting.
-    """
-    def _buffer_decode(self, input, errors, final):
-        """
-        Decode bytes that may be arriving in a stream, following the Codecs
-        API.
-
-        `input` is the incoming sequence of bytes. `errors` tells us how to
-        handle errors, though we delegate all error-handling cases to the real
-        UTF-8 decoder to ensure correct behavior. `final` indicates whether
-        this is the end of the sequence, in which case we should raise an
-        error given incomplete input.
-
-        Returns as much decoded text as possible, and the number of bytes
-        consumed.
-        """
-        # decoded_segments are the pieces of text we have decoded so far,
-        # and position is our current position in the byte string. (Bytes
-        # before this position have been consumed, and bytes after it have
-        # yet to be decoded.)
-        decoded_segments = []
-        position = 0
-        while True:
-            # Use _buffer_decode_step to decode a segment of text.
-            decoded, consumed = self._buffer_decode_step(
-                input[position:],
-                errors,
-                final
-            )
-            if consumed == 0:
-                # Either there's nothing left to decode, or we need to wait
-                # for more input. Either way, we're done for now.
-                break
-
-            # Append the decoded text to the list, and update our position.
-            decoded_segments.append(decoded)
-            position += consumed
-
-        if final:
-            # _buffer_decode_step must consume all the bytes when `final` is
-            # true.
-            assert position == len(input)
-
-        return ''.join(decoded_segments), position
-
-    def _buffer_decode_step(self, input, errors, final):
-        """
-        There are three possibilities for each decoding step:
-
-        - Decode as much real UTF-8 as possible.
-        - Decode a six-byte CESU-8 sequence at the current position.
-        - Decode a Java-style null at the current position.
-
-        This method figures out which step is appropriate, and does it.
-        """
-        # Get a reference to the superclass method that we'll be using for
-        # most of the real work.
-        sup = UTF8IncrementalDecoder._buffer_decode
-
-        # Find the next byte position that indicates a variant of UTF-8.
-        # CESU-8 sequences always start with 0xed, and Java nulls always
-        # start with 0xc0, both of which are conveniently impossible in
-        # real UTF-8.
-        cutoff1 = input.find(b'\xed')
-        cutoff2 = input.find(b'\xc0')
-
-        # Set `cutoff` to whichever cutoff comes first.
-        if cutoff1 != -1 and cutoff2 != -1:
-            cutoff = min(cutoff1, cutoff2)
-        elif cutoff1 != -1:
-            cutoff = cutoff1
-        elif cutoff2 != -1:
-            cutoff = cutoff2
-        else:
-            # The entire input can be decoded as UTF-8, so just do so.
-            return sup(input, errors, final)
-
-        if cutoff1 == 0:
-            # Decode a possible six-byte sequence starting with 0xed.
-            return self._buffer_decode_surrogates(sup, input, errors, final)
-        elif cutoff2 == 0:
-            # Decode a possible two-byte sequence, 0xc0 0x80.
-            return self._buffer_decode_null(sup, input, errors, final)
-        else:
-            # Decode the bytes up until the next weird thing as UTF-8.
-            # Set final=True because 0xc0 and 0xed don't make sense in the
-            # middle of a sequence, in any variant.
-            return sup(input[:cutoff], errors, True)
-
-    @staticmethod
-    def _buffer_decode_null(sup, input, errors, final):
-        """
-        Decode the bytes 0xc0 0x80 as U+0000, like Java does.
-        """
-        nextbyte = input[1:2]
-        if nextbyte == b'':
-            if final:
-                # We found 0xc0 at the end of the stream, which is an error.
-                # Delegate to the superclass method to handle that error.
-                return sup(input, errors, final)
-            else:
-                # We found 0xc0 and we don't know what comes next, so consume
-                # no bytes and wait.
-                return '', 0
-        elif nextbyte == b'\x80':
-            # We found the usual 0xc0 0x80 sequence, so decode it and consume
-            # two bytes.
-            return '\u0000', 2
-        else:
-            # We found 0xc0 followed by something else, which is an error.
-            # Whatever should happen is equivalent to what happens when the
-            # superclass is given just the byte 0xc0, with final=True.
-            return sup(b'\xc0', errors, True)
-
-    @staticmethod
-    def _buffer_decode_surrogates(sup, input, errors, final):
-        """
-        When we have improperly encoded surrogates, we can still see the
-        bits that they were meant to represent.
-
-        The surrogates were meant to encode a 20-bit number, to which we
-        add 0x10000 to get a codepoint. That 20-bit number now appears in
-        this form:
-
-          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
-
-        The CESU8_RE above matches byte sequences of this form. Then we need
-        to extract the bits and assemble a codepoint number from them.
-        """
-        if len(input) < 6:
-            if final:
-                # We found 0xed near the end of the stream, and there aren't
-                # six bytes to decode. Delegate to the superclass method to
-                # handle it as normal UTF-8. It might be a Hangul character
-                # or an error.
-                if PYTHON2 and len(input) >= 3:
-                    # We can't trust Python 2 to raise an error when it's
-                    # asked to decode a surrogate, so let's force the issue.
-                    input = mangle_surrogates(input)
-                return sup(input, errors, final)
-            else:
-                # We found 0xed, the stream isn't over yet, and we don't know
-                # enough of the following bytes to decode anything, so consume
-                # zero bytes and wait.
-                return '', 0
-        else:
-            if CESU8_RE.match(input):
-                # If this is a CESU-8 sequence, do some math to pull out
-                # the intended 20-bit value, and consume six bytes.
-                bytenums = bytes_to_ints(input[:6])
-                codepoint = (
-                    ((bytenums[1] & 0x0f) << 16) +
-                    ((bytenums[2] & 0x3f) << 10) +
-                    ((bytenums[4] & 0x0f) << 6) +
-                    (bytenums[5] & 0x3f) +
-                    0x10000
-                )
-                return unichr(codepoint), 6
-            else:
-                # This looked like a CESU-8 sequence, but it wasn't one.
-                # 0xed indicates the start of a three-byte sequence, so give
-                # three bytes to the superclass to decode as usual -- except
-                # for working around the Python 2 discrepancy as before.
-                if PYTHON2:
-                    input = mangle_surrogates(input)
-                return sup(input[:3], errors, False)
-
-
-def mangle_surrogates(bytestring):
-    """
-    When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
-    it as an error (which it is). In 'replace' mode, it will decode as three
-    replacement characters. But Python 2 will just output the surrogate
-    codepoint.
-
-    To ensure consistency between Python 2 and Python 3, and protect downstream
-    applications from malformed strings, we turn surrogate sequences at the
-    start of the string into the bytes `ff ff ff`, which we're *sure* won't
-    decode, and which turn into three replacement characters in 'replace' mode.
-    """
-    if PYTHON2:
-        if bytestring.startswith(b'\xed') and len(bytestring) >= 3:
-            decoded = bytestring[:3].decode('utf-8', 'replace')
-            if '\ud800' <= decoded <= '\udfff':
-                return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:])
-        return bytestring
-    else:
-        # On Python 3, nothing needs to be done.
-        return bytestring
-
-# The encoder is identical to UTF-8.
-IncrementalEncoder = UTF8IncrementalEncoder
-
-
-# Everything below here is boilerplate that matches the modules in the
-# built-in `encodings` package.
-def encode(input, errors='strict'):
-    return IncrementalEncoder(errors).encode(input, final=True), len(input)
-
-
-def decode(input, errors='strict'):
-    return IncrementalDecoder(errors).decode(input, final=True), len(input)
-
-
-class StreamWriter(codecs.StreamWriter):
-    encode = encode
-
-
-class StreamReader(codecs.StreamReader):
-    decode = decode
-
-
-CODEC_INFO = codecs.CodecInfo(
-    name=NAME,
-    encode=encode,
-    decode=decode,
-    incrementalencoder=IncrementalEncoder,
-    incrementaldecoder=IncrementalDecoder,
-    streamreader=StreamReader,
-    streamwriter=StreamWriter,
-)
--- a/lib/ftfy/badness.py
+++ b/lib/ftfy/badness.py
@ -1,144 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Heuristics to determine whether re-encoding text is actually making it
-more reasonable.
-"""
-
-from __future__ import unicode_literals
-from ftfy.chardata import chars_to_classes
-import re
-import unicodedata
-
-# The following regex uses the mapping of character classes to ASCII
-# characters defined in chardata.py and build_data.py:
-#
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# 0 = Math symbol (Sm)
-# 1 = Currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-#   = Whitespace
-# o = Other
-
-
-def _make_weirdness_regex():
-    """
-    Creates a list of regexes that match 'weird' character sequences.
-    The more matches there are, the weirder the text is.
-    """
-    groups = []
-
-    # Match lowercase letters that are followed by non-ASCII uppercase letters
-    groups.append('lA')
-
-    # Match diacritical marks, except when they modify a non-cased letter or
-    # another mark.
-    #
-    # You wouldn't put a diacritical mark on a digit or a space, for example.
-    # You might put it on a Latin letter, but in that case there will almost
-    # always be a pre-composed version, and we normalize to pre-composed
-    # versions first. The cases that can't be pre-composed tend to be in
-    # large scripts without case, which are in class C.
-    groups.append('[^CM]M')
-
-    # Match non-Latin characters adjacent to Latin characters.
-    #
-    # This is a simplification from ftfy version 2, which compared all
-    # adjacent scripts. However, the ambiguities we need to resolve come from
-    # encodings designed to represent Latin characters.
-    groups.append('[Ll][AaC]')
-    groups.append('[AaC][Ll]')
-
-    # Match C1 control characters, which are almost always the result of
-    # decoding Latin-1 that was meant to be Windows-1252.
-    groups.append('X')
-
-    # Match private use and unassigned characters.
-    groups.append('P')
-    groups.append('_')
-
-    # Match adjacent characters from any different pair of these categories:
-    # - Modifier marks (M)
-    # - Letter modifiers (m)
-    # - Miscellaneous numbers (N)
-    # - Symbols (0123)
-
-    exclusive_categories = 'MmN0123'
-    for cat1 in exclusive_categories:
-        others_range = ''.join(c for c in exclusive_categories if c != cat1)
-        groups.append('{cat1}[{others_range}]'.format(
-            cat1=cat1, others_range=others_range
-        ))
-    regex = '|'.join('({0})'.format(group) for group in groups)
-    return re.compile(regex)
-
-WEIRDNESS_RE = _make_weirdness_regex()
-
-# A few characters are common ending punctuation that can show up at the end
-# of a mojibake sequence. It's plausible that such a character could appear
-# after an accented capital letter, for example, so we'll want to add a
-# slight preference to leave these characters alone.
-#
-# The match ends with a + so that we only give the bonus once for a
-# consecutive sequence of these characters.
-ENDING_PUNCT_RE = re.compile(
-    '['
-    '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
-    '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
-    '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
-    '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
-    ']+'
-)
-
-def sequence_weirdness(text):
-    """
-    Determine how often a text has unexpected characters or sequences of
-    characters. This metric is used to disambiguate when text should be
-    re-decoded or left as is.
-
-    We start by normalizing text in NFC form, so that penalties for
-    diacritical marks don't apply to characters that know what to do with
-    them.
-
-    The following things are deemed weird:
-
-    - Lowercase letters followed by non-ASCII uppercase letters
-    - Non-Latin characters next to Latin characters
-    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
-      characters (in languages that do that kind of thing a lot) or other
-      marks
-    - C1 control characters
-    - Adjacent symbols from any different pair of these categories:
-
-        - Modifier marks
-        - Letter modifiers
-        - Non-digit numbers
-        - Symbols (including math and currency)
-
-    The return value is the number of instances of weirdness.
-    """
-    text2 = unicodedata.normalize('NFC', text)
-    weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
-    punct_discount = len(ENDING_PUNCT_RE.findall(text2))
-    return weirdness * 2 - punct_discount
-
-
-def text_cost(text):
-    """
-    An overall cost function for text. Weirder is worse, but all else being
-    equal, shorter strings are better.
-
-    The overall cost is measured as the "weirdness" (see
-    :func:`sequence_weirdness`) plus the length.
-    """
-    return sequence_weirdness(text) + len(text)
--- a/lib/ftfy/build_data.py
+++ b/lib/ftfy/build_data.py
@ -1,111 +0,0 @@
-"""
-A script to make the char_classes.dat file.
-
-This never needs to run in normal usage. It needs to be run if the character
-classes we care about change, or if a new version of Python supports a new
-Unicode standard and we want it to affect our string decoding.
-
-The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
-You can certainly use it in earlier versions. This simply makes sure that we
-get consistent results from running ftfy on different versions of Python.
-
-The file will be written to the current directory.
-"""
-from __future__ import unicode_literals
-import unicodedata
-import sys
-import zlib
-if sys.hexversion >= 0x03000000:
-    unichr = chr
-
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# P = Private use (Co)
-# 0 = Math symbol (Sm)
-# 1 = Currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-#   = Whitespace
-# o = Other
-
-
-def make_char_data_file(do_it_anyway=False):
-    """
-    Build the compressed data file 'char_classes.dat' and write it to the
-    current directory.
-
-    If you run this, run it in Python 3.3 or later. It will run in earlier
-    versions, but you won't get the current Unicode standard, leading to
-    inconsistent behavior. To protect against this, running this in the
-    wrong version of Python will raise an error unless you pass
-    `do_it_anyway=True`.
-    """
-    if sys.hexversion < 0x03030000 and not do_it_anyway:
-        raise RuntimeError(
-            "This function should be run in Python 3.3 or later."
-        )
-
-    cclasses = [None] * 0x110000
-    for codepoint in range(0x0, 0x110000):
-        char = unichr(codepoint)
-        category = unicodedata.category(char)
-
-        if category.startswith('L'):  # letters
-            is_latin = unicodedata.name(char).startswith('LATIN')
-            if is_latin and codepoint < 0x200:
-                if category == 'Lu':
-                    cclasses[codepoint] = 'L'
-                else:
-                    cclasses[codepoint] = 'l'
-            else:  # non-Latin letter, or close enough
-                if category == 'Lu' or category == 'Lt':
-                    cclasses[codepoint] = 'A'
-                elif category == 'Ll':
-                    cclasses[codepoint] = 'a'
-                elif category == 'Lo':
-                    cclasses[codepoint] = 'C'
-                elif category == 'Lm':
-                    cclasses[codepoint] = 'm'
-                else:
-                    raise ValueError('got some weird kind of letter')
-        elif category.startswith('M'):  # marks
-            cclasses[codepoint] = 'M'
-        elif category == 'No':
-            cclasses[codepoint] = 'N'
-        elif category == 'Sm':
-            cclasses[codepoint] = '0'
-        elif category == 'Sc':
-            cclasses[codepoint] = '1'
-        elif category == 'Sk':
-            cclasses[codepoint] = '2'
-        elif category == 'So':
-            cclasses[codepoint] = '3'
-        elif category == 'Cn':
-            cclasses[codepoint] = '_'
-        elif category == 'Cc':
-            cclasses[codepoint] = 'X'
-        elif category == 'Cs':
-            cclasses[codepoint] = 'S'
-        elif category == 'Co':
-            cclasses[codepoint] = 'P'
-        elif category.startswith('Z'):
-            cclasses[codepoint] = ' '
-        else:
-            cclasses[codepoint] = 'o'
-
-    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
-    out = open('char_classes.dat', 'wb')
-    out.write(zlib.compress(''.join(cclasses).encode('ascii')))
-    out.close()
-
-if __name__ == '__main__':
-    make_char_data_file()
--- a/lib/ftfy/char_classes.dat
+++ b/lib/ftfy/char_classes.dat
--- a/lib/ftfy/chardata.py
+++ b/lib/ftfy/chardata.py
@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-This gives other modules access to the gritty details about characters and the
-encodings that use them.
-"""
-
-from __future__ import unicode_literals
-import re
-import zlib
-from pkg_resources import resource_string
-from ftfy.compatibility import unichr
-
-# These are the five encodings we will try to fix in ftfy, in the
-# order that they should be tried.
-CHARMAP_ENCODINGS = [
-    'latin-1',
-    'sloppy-windows-1252',
-    'macroman',
-    'cp437',
-    'sloppy-windows-1251',
-]
-
-
-def _build_regexes():
-    """
-    ENCODING_REGEXES contain reasonably fast ways to detect if we
-    could represent a given string in a given encoding. The simplest one is
-    the 'ascii' detector, which of course just determines if all characters
-    are between U+0000 and U+007F.
-    """
-    # Define a regex that matches ASCII text.
-    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
-
-    for encoding in CHARMAP_ENCODINGS:
-        latin1table = ''.join(unichr(i) for i in range(128, 256))
-        charlist = latin1table.encode('latin-1').decode(encoding)
-
-        # Build a regex from the ASCII range, followed by the decodings of
-        # bytes 0x80-0xff in this character set. (This uses the fact that all
-        # regex special characters are ASCII, and therefore won't appear in the
-        # string.)
-        regex = '^[\x00-\x7f{0}]*$'.format(charlist)
-        encoding_regexes[encoding] = re.compile(regex)
-    return encoding_regexes
-ENCODING_REGEXES = _build_regexes()
-
-
-def possible_encoding(text, encoding):
-    """
-    Given text and a single-byte encoding, check whether that text could have
-    been decoded from that single-byte encoding.
-
-    In other words, check whether it can be encoded in that encoding, possibly
-    sloppily.
-    """
-    return bool(ENCODING_REGEXES[encoding].match(text))
-
-
-CHAR_CLASS_STRING = zlib.decompress(
-    resource_string(__name__, 'char_classes.dat')
-).decode('ascii')
-
-def chars_to_classes(string):
-    """
-    Convert each Unicode character to a letter indicating which of many
-    classes it's in.
-
-    See build_data.py for where this data comes from and what it means.
-    """
-    return string.translate(CHAR_CLASS_STRING)
-
-
-# A translate mapping that will strip all C0 control characters except
-# those that represent whitespace.
-CONTROL_CHARS = {}
-for i in range(32):
-    CONTROL_CHARS[i] = None
-
-# Map whitespace control characters to themselves.
-for char in '\t\n\f\r':
-    del CONTROL_CHARS[ord(char)]
--- a/lib/ftfy/cli.py
+++ b/lib/ftfy/cli.py
@ -1,34 +0,0 @@
-"""
-A simple command-line utility for fixing text found in a file.
-
-Because files do not come with their encoding marked, it first runs the file
-through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
-"""
-from ftfy import fix_file
-
-import sys
-ENCODE_STDOUT = (sys.hexversion < 0x03000000)
-
-
-def main():
-    """
-    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
-    the 'argparse' module.)
-    """
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('filename', help='file to transcode')
-
-    args = parser.parse_args()
-
-    file = open(args.filename)
-    for line in fix_file(file):
-        if ENCODE_STDOUT:
-            sys.stdout.write(line.encode('utf-8'))
-        else:
-            sys.stdout.write(line)
-
-
-if __name__ == '__main__':
-    main()
--- a/lib/ftfy/compatibility.py
+++ b/lib/ftfy/compatibility.py
@ -1,79 +0,0 @@
-"""
-Makes some function names and behavior consistent between Python 2 and
-Python 3, and also between narrow and wide builds.
-"""
-from __future__ import unicode_literals
-import sys
-import re
-import unicodedata
-
-if sys.hexversion >= 0x03000000:
-    from html import entities
-    unichr = chr
-    xrange = range
-    PYTHON2 = False
-else:
-    import htmlentitydefs as entities
-    unichr = unichr
-    xrange = xrange
-    PYTHON2 = True
-htmlentitydefs = entities
-
-PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)
-
-
-def _narrow_unichr_workaround(codepoint):
-    """
-    A replacement for unichr() on narrow builds of Python. This will get
-    us the narrow representation of an astral character, which will be
-    a string of length two, containing two UTF-16 surrogates.
-    """
-    escaped = b'\\U%08x' % codepoint
-    return escaped.decode('unicode-escape')
-
-
-if sys.maxunicode < 0x10000:
-    unichr = _narrow_unichr_workaround
-    # In a narrow build of Python, we can't write a regex involving astral
-    # characters. If we want to write the regex:
-    #
-    #   [\U00100000-\U0010ffff]
-    #
-    # The actual string that defines it quietly turns into:
-    #
-    #   [\udbc0\udc00-\udbff\udfff]
-    #
-    # And now the range operator only applies to the middle two characters.
-    # It looks like a range that's going backwards from \dc00 to \dbff,
-    # which is an error.
-    #
-    # What we can do instead is rewrite the expression to be _about_ the two
-    # surrogates that make up the astral characters, instead of the characters
-    # themselves. This would be wrong on a wide build, but it works on a
-    # narrow build.
-    UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
-else:
-    UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')
-
-
-def bytes_to_ints(bytestring):
-    """
-    No matter what version of Python this is, make a sequence of integers from
-    a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a
-    sequence of integers.
-    """
-    if PYTHON2:
-        return [ord(b) for b in bytestring]
-    else:
-        return bytestring
-
-
-def is_printable(char):
-    """
-    str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so
-    let's make a crude approximation in Python 2.
-    """
-    if PYTHON2:
-        return not unicodedata.category(char).startswith('C')
-    else:
-        return char.isprintable()
--- a/lib/ftfy/fixes.py
+++ b/lib/ftfy/fixes.py
@ -1,473 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-This module contains the individual fixes that the main fix_text function
-can perform.
-"""
-
-from __future__ import unicode_literals
-from ftfy.chardata import (possible_encoding,
-                           CHARMAP_ENCODINGS, CONTROL_CHARS)
-from ftfy.badness import text_cost
-from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
-import re
-import sys
-import codecs
-
-
-BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
-
-ftfy is designed to fix problems that were introduced by handling Unicode
-incorrectly. It might be able to fix the bytes you just handed it, but the
-fact that you just gave a pile of bytes to a function that fixes text means
-that your code is *also* handling Unicode incorrectly.
-
-ftfy takes Unicode text as input. You should take these bytes and decode
-them from the encoding you think they are in. If you're not sure what encoding
-they're in:
-
- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
-  ftfy.guess_bytes. As the name implies, this may not always be accurate.
-
-If you're confused by this, please read the Python Unicode HOWTO:
-
-    http://docs.python.org/%d/howto/unicode.html
-""" % sys.version_info[0]
-
-
-def fix_text_encoding(text):
-    r"""
-    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.
-
-    Something you will find all over the place, in real-world text, is text
-    that's mistakenly encoded as utf-8, decoded in some ugly format like
-    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
-
-    This causes your perfectly good Unicode-aware code to end up with garbage
-    text because someone else (or maybe "someone else") made a mistake.
-
-    This function looks for the evidence of that having happened and fixes it.
-    It determines whether it should replace nonsense sequences of single-byte
-    characters that were really meant to be UTF-8 characters, and if so, turns
-    them into the correctly-encoded Unicode character that they were meant to
-    represent.
-
-    The input to the function must be Unicode. If you don't have Unicode text,
-    you're not using the right tool to solve your problem.
-
-    .. note::
-        The following examples are written using unmarked literal strings,
-        but they are Unicode text. In Python 2 we have "unicode_literals"
-        turned on, and in Python 3 this is always the case.
-
-    ftfy decodes text that looks like it was decoded incorrectly. It leaves
-    alone text that doesn't.
-
-        >>> print(fix_text_encoding('Ãºnico'))
-        único
-
-        >>> print(fix_text_encoding('This text is fine already :þ'))
-        This text is fine already :þ
-
-    Because these characters often come from Microsoft products, we allow
-    for the possibility that we get not just Unicode characters 128-255, but
-    also Windows's conflicting idea of what characters 128-160 are.
-
-        >>> print(fix_text_encoding('This â€” should be an em dash'))
-        This — should be an em dash
-
-    We might have to deal with both Windows characters and raw control
-    characters at the same time, especially when dealing with characters like
-    0x81 that have no mapping in Windows. This is a string that Python's
-    standard `.encode` and `.decode` methods cannot correct.
-
-        >>> print(fix_text_encoding('This text is sad .â\x81”.'))
-        This text is sad .⁔.
-
-    However, it has safeguards against fixing sequences of letters and
-    punctuation that can occur in valid text:
-
-        >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
-        not such a fan of Charlotte Brontë…”
-
-    Cases of genuine ambiguity can sometimes be addressed by finding other
-    characters that are not double-encoded, and expecting the encoding to
-    be consistent:
-
-        >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
-        AHÅ™, the new sofa from IKEA®
-
-    Finally, we handle the case where the text is in a single-byte encoding
-    that was intended as Windows-1252 all along but read as Latin-1:
-
-        >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
-        This text was never UTF-8 at all…
-
-    The best version of the text is found using
-    :func:`ftfy.badness.text_cost`.
-    """
-    text, _plan = fix_encoding_and_explain(text)
-    return text
-
-
-def fix_encoding_and_explain(text):
-    """
-    Re-decodes text that has been decoded incorrectly, and also return a
-    "plan" indicating all the steps required to fix it.
-
-    To fix similar text in the same way, without having to detect anything,
-    you can use the ``apply_plan`` function.
-    """
-    best_version = text
-    best_cost = text_cost(text)
-    best_plan = []
-    plan_so_far = []
-    while True:
-        prevtext = text
-        text, plan = fix_one_step_and_explain(text)
-        plan_so_far.extend(plan)
-        cost = text_cost(text)
-
-        # Add a penalty if we used a particularly obsolete encoding. The result
-        # is that we won't use these encodings unless they can successfully
-        # replace multiple characters.
-        if ('encode', 'macroman') in plan_so_far or\
-           ('encode', 'cp437') in plan_so_far:
-            cost += 2
-
-        # We need pretty solid evidence to decode from Windows-1251 (Cyrillic).
-        if ('encode', 'sloppy-windows-1251') in plan_so_far:
-            cost += 5
-
-        if cost < best_cost:
-            best_cost = cost
-            best_version = text
-            best_plan = list(plan_so_far)
-        if text == prevtext:
-            return best_version, best_plan
-
-
-def fix_one_step_and_explain(text):
-    """
-    Performs a single step of re-decoding text that's been decoded incorrectly.
-
-    Returns the decoded text, plus a "plan" for how to reproduce what it
-    did.
-    """
-    if isinstance(text, bytes):
-        raise UnicodeError(BYTES_ERROR_TEXT)
-    if len(text) == 0:
-        return text, []
-
-    # The first plan is to return ASCII text unchanged.
-    if possible_encoding(text, 'ascii'):
-        return text, []
-
-    # As we go through the next step, remember the possible encodings
-    # that we encounter but don't successfully fix yet. We may need them
-    # later.
-    possible_1byte_encodings = []
-
-    # Suppose the text was supposed to be UTF-8, but it was decoded using
-    # a single-byte encoding instead. When these cases can be fixed, they
-    # are usually the correct thing to do, so try them next.
-    for encoding in CHARMAP_ENCODINGS:
-        if possible_encoding(text, encoding):
-            encoded_bytes = text.encode(encoding)
-
-            # Now, find out if it's UTF-8 (or close enough). Otherwise,
-            # remember the encoding for later.
-            try:
-                decoding = 'utf-8'
-                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
-                    decoding = 'utf-8-variants'
-                fixed = encoded_bytes.decode(decoding)
-                steps = [('encode', encoding), ('decode', decoding)]
-                return fixed, steps
-            except UnicodeDecodeError:
-                possible_1byte_encodings.append(encoding)
-
-    # The next most likely case is that this is Latin-1 that was intended to
-    # be read as Windows-1252, because those two encodings in particular are
-    # easily confused.
-    if 'latin-1' in possible_1byte_encodings:
-        if 'windows-1252' in possible_1byte_encodings:
-            # This text is in the intersection of Latin-1 and
-            # Windows-1252, so it's probably legit.
-            return text, []
-        else:
-            # Otherwise, it means we have characters that are in Latin-1 but
-            # not in Windows-1252. Those are C1 control characters. Nobody
-            # wants those. Assume they were meant to be Windows-1252. Don't
-            # use the sloppy codec, because bad Windows-1252 characters are
-            # a bad sign.
-            encoded = text.encode('latin-1')
-            try:
-                fixed = encoded.decode('windows-1252')
-                steps = []
-                if fixed != text:
-                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
-                return fixed, steps
-            except UnicodeDecodeError:
-                # This text contained characters that don't even make sense
-                # if you assume they were supposed to be Windows-1252. In
-                # that case, let's not assume anything.
-                pass
-
-    # The cases that remain are mixups between two different single-byte
-    # encodings, and not the common case of Latin-1 vs. Windows-1252.
-    #
-    # Those cases are somewhat rare, and impossible to solve without false
-    # positives. If you're in one of these situations, you should try using
-    # the `ftfy.guess_bytes` function.
-
-    # Return the text unchanged; the plan is empty.
-    return text, []
-
-
-def apply_plan(text, plan):
-    """
-    Apply a plan for fixing the encoding of text.
-
-    The plan is a list of tuples of the form (operation, encoding), where
-    `operation` is either 'encode' or 'decode', and `encoding` is an encoding
-    name such as 'utf-8' or 'latin-1'.
-
-    Because only text can be encoded, and only bytes can be decoded, the plan
-    should alternate 'encode' and 'decode' steps, or else this function will
-    encounter an error.
-    """
-    obj = text
-    for operation, encoding in plan:
-        if operation == 'encode':
-            obj = obj.encode(encoding)
-        elif operation == 'decode':
-            obj = obj.decode(encoding)
-        else:
-            raise ValueError("Unknown plan step: %s" % operation)
-
-    return obj
-
-
-HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")
-
-
-def unescape_html(text):
-    """
-    Decode all three types of HTML entities/character references.
-
-    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
-    to it for efficiency: it won't match entities longer than 8 characters,
-    because there are no valid entities like that.
-
-        >>> print(unescape_html('&lt;tag&gt;'))
-        <tag>
-    """
-    def fixup(match):
-        """
-        Replace one matched HTML entity with the character it represents,
-        if possible.
-        """
-        text = match.group(0)
-        if text[:2] == "&#":
-            # character reference
-            try:
-                if text[:3] == "&#x":
-                    return unichr(int(text[3:-1], 16))
-                else:
-                    return unichr(int(text[2:-1]))
-            except ValueError:
-                pass
-        else:
-            # named entity
-            try:
-                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
-            except KeyError:
-                pass
-        return text  # leave as is
-    return HTML_ENTITY_RE.sub(fixup, text)
-
-
-ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')
-
-def remove_terminal_escapes(text):
-    r"""
-    Strip out "ANSI" terminal escape sequences, such as those that produce
-    colored text on Unix.
-
-        >>> print(remove_terminal_escapes(
-        ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
-        ... ))
-        I'm blue, da ba dee da ba doo...
-    """
-    return ANSI_RE.sub('', text)
-
-
-SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
-DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')
-
-def uncurl_quotes(text):
-    r"""
-    Replace curly quotation marks with straight equivalents.
-
-        >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
-        "here's a test"
-    """
-    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))
-
-
-def fix_line_breaks(text):
-    r"""
-    Convert all line breaks to Unix style.
-
-    This will convert the following sequences into the standard \\n
-    line break:
-
-        - CRLF (\\r\\n), used on Windows and in some communication
-          protocols
-        - CR (\\r), once used on Mac OS Classic, and now kept alive
-          by misguided software such as Microsoft Office for Mac
-        - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
-          defined by Unicode and used to sow confusion and discord
-        - NEXT LINE (\\x85), a C1 control character that is certainly
-          not what you meant
-
-    The NEXT LINE character is a bit of an odd case, because it
-    usually won't show up if `fix_encoding` is also being run.
-    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.
-
-        >>> print(fix_line_breaks(
-        ...     "This string is made of two things:\u2029"
-        ...     "1. Unicode\u2028"
-        ...     "2. Spite"
-        ... ))
-        This string is made of two things:
-        1. Unicode
-        2. Spite
-
-    For further testing and examples, let's define a function to make sure
-    we can see the control characters in their escaped form:
-
-        >>> def eprint(text):
-        ...     print(text.encode('unicode-escape').decode('ascii'))
-
-        >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
-        Content-type: text/plain\n\nHi.
-
-        >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
-        This is how Microsoft \n trolls Mac users
-
-        >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
-        What is this \n I don't even
-    """
-    return text.replace('\r\n', '\n').replace('\r', '\n')\
-               .replace('\u2028', '\n').replace('\u2029', '\n')\
-               .replace('\u0085', '\n')
-
-
-def remove_control_chars(text):
-    """
-    Remove all control characters except for the important ones.
-
-    This removes characters in these ranges:
-
-    - U+0000 to U+0008
-    - U+000B
-    - U+000E to U+001F
-    - U+007F
-
-    It leaves alone these characters that are commonly used for formatting:
-
-    - TAB (U+0009)
-    - LF (U+000A)
-    - FF (U+000C)
-    - CR (U+000D)
-    """
-    return text.translate(CONTROL_CHARS)
-
-
-def remove_bom(text):
-    r"""
-    Remove a left-over byte-order mark.
-
-    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
-    Where do you want to go today?
-    """
-    return text.lstrip(unichr(0xfeff))
-
-
-def remove_unsafe_private_use(text):
-    r"""
-    Python 3.3's Unicode support isn't perfect, and in fact there are certain
-    string operations that will crash some versions of it with a SystemError:
-    http://bugs.python.org/issue18183
-
-    The best solution is to remove all characters from Supplementary Private
-    Use Area B, using a regex that is known not to crash given those
-    characters.
-
-    These are the characters from U+100000 to U+10FFFF. It's sad to lose an
-    entire plane of Unicode, but on the other hand, these characters are not
-    assigned and never will be. If you get one of these characters and don't
-    know what its purpose is, its purpose is probably to crash your code.
-
-    If you were using these for actual private use, this might be inconvenient.
-    You can turn off this fixer, of course, but I kind of encourage using
-    Supplementary Private Use Area A instead.
-
-        >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
-        💩
-
-    This fixer is off by default in Python 3.4 or later. (The bug is actually
-    fixed in 3.3.3 and 2.7.6, but I don't want the default behavior to change
-    based on a micro version upgrade of Python.)
-    """
-    return UNSAFE_PRIVATE_USE_RE.sub('', text)
-
-
-# Define a regex to match valid escape sequences in Python string literals.
-ESCAPE_SEQUENCE_RE = re.compile(r'''
-    ( \\U........      # 8-digit hex escapes
-    | \\u....          # 4-digit hex escapes
-    | \\x..            # 2-digit hex escapes
-    | \\[0-7]{1,3}     # Octal escapes
-    | \\N\{[^}]+\}     # Unicode characters by name
-    | \\[\\'"abfnrtv]  # Single-character escapes
-    )''', re.UNICODE | re.VERBOSE)
-
-
-def decode_escapes(text):
-    r"""
-    Decode backslashed escape sequences, including \\x, \\u, and \\U character
-    references, even in the presence of other Unicode.
-
-    This is what Python's "string-escape" and "unicode-escape" codecs were
-    meant to do, but in contrast, this actually works. It will decode the
-    string exactly the same way that the Python interpreter decodes its string
-    literals.
-
-        >>> factoid = '\\u20a1 is the currency symbol for the colón.'
-        >>> print(factoid[1:])
-        u20a1 is the currency symbol for the colón.
-        >>> print(decode_escapes(factoid))
-        ₡ is the currency symbol for the colón.
-
-    Even though Python itself can read string literals with a combination of
-    escapes and literal Unicode -- you're looking at one right now -- the
-    "unicode-escape" codec doesn't work on literal Unicode. (See
-    http://stackoverflow.com/a/24519338/773754 for more details.)
-    
-    Instead, this function searches for just the parts of a string that
-    represent escape sequences, and decodes them, leaving the rest alone. All
-    valid escape sequences are made of ASCII characters, and this allows
-    "unicode-escape" to work correctly.
-
-    This fix cannot be automatically applied by the `ftfy.fix_text` function,
-    because escaped text is not necessarily a mistake, and there is no way
-    to distinguish text that's supposed to be escaped from text that isn't.
-    """
-    def decode_match(match):
-        "Given a regex match, decode the escape sequence it contains."
-        return codecs.decode(match.group(0), 'unicode-escape')
-
-    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
--- a/lib/ftfy/streamtester/init.py
+++ b/lib/ftfy/streamtester/init.py
@ -1,39 +0,0 @@
-"""
-This file defines a general method for evaluating ftfy using data that arrives
-in a stream. A concrete implementation of it is found in `twitter_tester.py`.
-"""
-from __future__ import print_function, unicode_literals
-from ftfy.fixes import fix_text_encoding
-from ftfy.chardata import possible_encoding
-
-
-class StreamTester:
-    """
-    Take in a sequence of texts, and show the ones that will be changed by
-    ftfy. This will also periodically show updates, such as the proportion of
-    texts that changed.
-    """
-    def __init__(self):
-        self.num_fixed = 0
-        self.count = 0
-
-    def check_ftfy(self, text):
-        """
-        Given a single text input, check whether `ftfy.fix_text_encoding`
-        would change it. If so, display the change.
-        """
-        self.count += 1
-        if not possible_encoding(text, 'ascii'):
-            fixed = fix_text_encoding(text)
-            if text != fixed:
-                # possibly filter common bots before printing
-                print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
-                    text=text, fixed=fixed
-                ))
-                self.num_fixed += 1
-
-        # Print status updates once in a while
-        if self.count % 100 == 0:
-            print('.', end='', flush=True)
-        if self.count % 10000 == 0:
-            print('\n%d/%d fixed' % (self.num_fixed, self.count))
--- a/lib/ftfy/streamtester/oauth.py
+++ b/lib/ftfy/streamtester/oauth.py
@ -1,73 +0,0 @@
-# coding: utf-8
-"""
-Do what is necessary to authenticate this tester as a Twitter "app", using
-somebody's Twitter account.
-"""
-from __future__ import unicode_literals
-import os
-
-
-AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')
-
-def get_auth():
-    """
-    Twitter has some bizarre requirements about how to authorize an "app" to
-    use its API.
-
-    The user of the app has to log in to get a secret token. That's fine. But
-    the app itself has its own "consumer secret" token. The app has to know it,
-    and the user of the app has to not know it.
-
-    This is, of course, impossible. It's equivalent to DRM. Your computer can't
-    *really* make use of secret information while hiding the same information
-    from you.
-
-    The threat appears to be that, if you have this super-sekrit token, you can
-    impersonate the app while doing something different. Well, of course you
-    can do that, because you *have the source code* and you can change it to do
-    what you want. You still have to log in as a particular user who has a
-    token that's actually secret, you know.
-
-    Even developers of closed-source applications that use the Twitter API are
-    unsure what to do, for good reason. These "secrets" are not secret in any
-    cryptographic sense. A bit of Googling shows that the secret tokens for
-    every popular Twitter app are already posted on the Web.
-
-    Twitter wants us to pretend this string can be kept secret, and hide this
-    secret behind a fig leaf like everybody else does. So that's what we've
-    done.
-    """
-
-    from twitter.oauth import OAuth
-    from twitter import oauth_dance, read_token_file
-
-    def unhide(secret):
-        """
-        Do something mysterious and exactly as secure as every other Twitter
-        app.
-        """
-        return ''.join([chr(ord(c) - 0x2800) for c in secret])
-
-    fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
-    consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'
-
-    if os.path.exists(AUTH_TOKEN_PATH):
-        token, token_secret = read_token_file(AUTH_TOKEN_PATH)
-    else:
-        authdir = os.path.dirname(AUTH_TOKEN_PATH)
-        if not os.path.exists(authdir):
-            os.makedirs(authdir)
-        token, token_secret = oauth_dance(
-            app_name='ftfy-tester',
-            consumer_key=consumer_key,
-            consumer_secret=unhide(fig_leaf),
-            token_filename=AUTH_TOKEN_PATH
-        )
-
-    return OAuth(
-        token=token,
-        token_secret=token_secret,
-        consumer_key=consumer_key,
-        consumer_secret=unhide(fig_leaf)
-    )
-
--- a/lib/ftfy/streamtester/twitter_tester.py
+++ b/lib/ftfy/streamtester/twitter_tester.py
@ -1,89 +0,0 @@
-"""
-Implements a StreamTester that runs over Twitter data. See the class
-docstring.
-
-This module is written for Python 3 only. The __future__ imports you see here
-are just to let Python 2 scan the file without crashing with a SyntaxError.
-"""
-from __future__ import print_function, unicode_literals
-import os
-from collections import defaultdict
-from ftfy.streamtester import StreamTester
-
-
-class TwitterTester(StreamTester):
-    """
-    This class uses the StreamTester code (defined in `__init__.py`) to
-    evaluate ftfy's real-world performance, by feeding it live data from
-    Twitter.
-
-    This is a semi-manual evaluation. It requires a human to look at the
-    results and determine if they are good. The three possible cases we
-    can see here are:
-
-        - Success: the process takes in mojibake and outputs correct text.
-        - False positive: the process takes in correct text, and outputs
-          mojibake. Every false positive should be considered a bug, and
-          reported on GitHub if it isn't already.
-        - Confusion: the process takes in mojibake and outputs different
-          mojibake. Not a great outcome, but not as dire as a false
-          positive.
-
-    This tester cannot reveal false negatives. So far, that can only be
-    done by the unit tests.
-    """
-    OUTPUT_DIR = './twitterlogs'
-
-    def __init__(self):
-        self.lines_by_lang = defaultdict(list)
-        super().__init__()
-
-    def save_files(self):
-        """
-        When processing data from live Twitter, save it to log files so that
-        it can be replayed later.
-        """
-        if not os.path.exists(self.OUTPUT_DIR):
-            os.makedirs(self.OUTPUT_DIR)
-        for lang, lines in self.lines_by_lang.items():
-            filename = 'tweets.{}.txt'.format(lang)
-            fullname = os.path.join(self.OUTPUT_DIR, filename)
-            langfile = open(fullname, 'a')
-            for line in lines:
-                print(line.replace('\n', ' '), file=langfile)
-            langfile.close()
-        self.lines_by_lang = defaultdict(list)
-
-    def run_sample(self):
-        """
-        Listen to live data from Twitter, and pass on the fully-formed tweets
-        to `check_ftfy`. This requires the `twitter` Python package as a
-        dependency.
-        """
-        from twitter import TwitterStream
-        from ftfy.streamtester.oauth import get_auth
-        twitter_stream = TwitterStream(auth=get_auth())
-        iterator = twitter_stream.statuses.sample()
-        for tweet in iterator:
-            if 'text' in tweet:
-                self.check_ftfy(tweet['text'])
-                if 'user' in tweet:
-                    lang = tweet['user'].get('lang', 'NONE')
-                    self.lines_by_lang[lang].append(tweet['text'])
-                if self.count % 10000 == 100:
-                    self.save_files()
-
-
-def main():
-    """
-    When run from the command line, this script connects to the Twitter stream
-    and runs the TwitterTester on it forever. Or at least until the stream
-    drops.
-    """
-    tester = TwitterTester()
-    tester.run_sample()
-
-
-if __name__ == '__main__':
-    main()
-
--- a/sickbeard/encodingKludge.py
+++ b/sickbeard/encodingKludge.py
@ -17,53 +17,103 @@
 # along with SickRage.  If not, see <http://www.gnu.org/licenses/>.

 import os
+import traceback

 import sickbeard
 from sickbeard import logger

-import ftfy
-import ftfy.bad_codecs
+import six
+import chardet
+

 # This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8
 # encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions
 # which return something should always return unicode.

-def fixStupidEncodings(x, silent=False):
-    if type(x) == str:
-        try:
-            return str(ftfy.fix_text(u'' + x)).decode(sickbeard.SYS_ENCODING)
-        except UnicodeDecodeError:
-            logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
+def toUnicode(x, silent=False):
+    """
+    Best-effort conversion of x to a unicode string.
+
+    unicode input is returned unchanged. Anything else goes through
+    six.text_type() first; if that raises, byte strings that chardet detects
+    as UTF-8 are decoded as UTF-8 and other str values are decoded with
+    sickbeard.SYS_ENCODING. If every attempt fails, the failure is logged and
+    the string-escaped form of str(x) is converted instead.
+
+    silent: log a conversion failure at DEBUG instead of WARNING. The flag is
+            optional so that callers which pass it, as they did to the removed
+            fixStupidEncodings(), keep working.
+    """
+    try:
+        if isinstance(x, unicode):
            return x
-        except UnicodeEncodeError:
-            logger.log(u"Unable to encode value: " + repr(x), logger.ERROR)
-            return x
-    elif type(x) == unicode:
-        return x
-    else:
-        logger.log(
-            u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
-            logger.DEBUG if silent else logger.ERROR)
+
+        try:
+            return six.text_type(x)
+        except Exception:
+            # plain conversion failed, so fall back to guessing the encoding
+            if chardet.detect(x).get('encoding') == 'utf-8':
+                return x.decode('utf-8')
+            if isinstance(x, str):
+                return x.decode(sickbeard.SYS_ENCODING)
+            return x
+    except Exception:
+        logger.log('Unable to decode value "%s..." : %s ' % (repr(x)[:20], traceback.format_exc()),
+                   logger.DEBUG if silent else logger.WARNING)
+        # string_escape output is plain ASCII, so the retry is not hit by the same decode error
+        ascii_text = str(x).encode('string_escape')
+        return toUnicode(ascii_text, silent)

+def ss(x):
+    """
+    Return x as a byte string encoded with sickbeard.SYS_ENCODING.
+
+    x is converted with toUnicode() first. If strict encoding fails, the
+    failure is logged and encoding is retried with unencodable characters
+    replaced; UTF-8 with replacement is the last resort.
+    """
+    u_x = toUnicode(x)
+
+    try:
+        return u_x.encode(sickbeard.SYS_ENCODING)
+    except Exception as e:
+        logger.log('Failed ss encoding char, force UTF8: %s' % e, logger.WARNING)
+        try:
+            return u_x.encode(sickbeard.SYS_ENCODING, 'replace')
+        except Exception:
+            return u_x.encode('utf-8', 'replace')

 def fixListEncodings(x):
-    if type(x) != list and type(x) != tuple:
+    """
+    Convert every item of a list or tuple with toUnicode() and return the
+    results as a list, leaving out any that come back as None.
+
+    A value that is not a list or tuple is returned unchanged.
+    """
+    if not isinstance(x, (list, tuple)):
        return x
    else:
-        return filter(lambda x: x != None, map(fixStupidEncodings, x))
+        return [item for item in map(toUnicode, x) if item is not None]


 def ek(func, *args, **kwargs):
+    """
+    Call func(*args, **kwargs) and return its result with strings as unicode.
+
+    On Windows (os.name == 'nt') the arguments are passed through as given.
+    Elsewhere every str or unicode positional argument is first encoded with
+    ss(); keyword arguments are passed through unconverted. A list or tuple
+    result goes through fixListEncodings(), a str result through toUnicode(),
+    and any other result is returned as is.
+    """
    if os.name == 'nt':
        result = func(*args, **kwargs)
    else:
-        result = func(
-            *[fixStupidEncodings(x).encode(sickbeard.SYS_ENCODING) if type(x) in (str, unicode) else x for x in args],
-            **kwargs)
+        result = func(*[ss(x) if isinstance(x, (str, unicode)) else x for x in args], **kwargs)

-    if type(result) in (list, tuple):
+    if isinstance(result, (list, tuple)):
        return fixListEncodings(result)
-    elif type(result) == str:
-        return fixStupidEncodings(result)
+    elif isinstance(result, str):
+        return toUnicode(result)
    else:
        return result
--- a/sickbeard/exceptions.py
+++ b/sickbeard/exceptions.py
@ -16,7 +16,7 @@
 # You should have received a copy of the GNU General Public License
 # along with SickRage.  If not, see <http://www.gnu.org/licenses/>.

-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 def ex(e):
    """
@ -32,11 +32,11 @@ def ex(e):

        if arg is not None:
            if isinstance(arg, (str, unicode)):
-                fixed_arg = fixStupidEncodings(arg, True)
+                fixed_arg = toUnicode(arg, True)

            else:
                try:
-                    fixed_arg = u"error " + fixStupidEncodings(str(arg), True)
+                    fixed_arg = u"error " + toUnicode(str(arg), True)

                except:
                    fixed_arg = None
--- a/sickbeard/failed_history.py
+++ b/sickbeard/failed_history.py
@ -26,7 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
 from sickbeard.history import dateFormat
 from sickbeard.common import Quality
 from sickbeard.common import WANTED, FAILED
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 def prepareFailedName(release):
    """Standardizes release name for failed DB"""
@ -36,7 +36,7 @@ def prepareFailedName(release):
        fixed = fixed.rpartition(".")[0]

    fixed = re.sub("[\.\-\+\ ]", "_", fixed)
-    fixed = fixStupidEncodings(fixed)
+    fixed = toUnicode(fixed)

    return fixed

--- a/sickbeard/history.py
+++ b/sickbeard/history.py
@ -20,7 +20,7 @@ import db
 import datetime

 from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 dateFormat = "%Y%m%d%H%M%S"
@ -28,7 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"

 def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
    logDate = datetime.datetime.today().strftime(dateFormat)
-    resource = fixStupidEncodings(resource)
+    resource = toUnicode(resource)

    myDB = db.DBConnection()
    myDB.action(
--- a/sickbeard/notifiers/emailnotify.py
+++ b/sickbeard/notifiers/emailnotify.py
@ -29,7 +29,7 @@ import sickbeard

 from sickbeard import logger, common
 from sickbeard import db
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
 from sickbeard.exceptions import ex


@ -51,7 +51,7 @@ class EmailNotifier:
        ep_name: The name of the episode that was snatched
        title: The title of the notification (optional)
        """
-        ep_name = fixStupidEncodings(ep_name)
+        ep_name = toUnicode(ep_name)

        if sickbeard.EMAIL_NOTIFY_ONSNATCH:
            show = self._parseEp(ep_name)
@ -86,7 +86,7 @@ class EmailNotifier:
        ep_name: The name of the episode that was downloaded
        title: The title of the notification (optional)
        """
-        ep_name = fixStupidEncodings(ep_name)
+        ep_name = toUnicode(ep_name)

        if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
            show = self._parseEp(ep_name)
@ -121,7 +121,7 @@ class EmailNotifier:
        ep_name: The name of the episode that was downloaded
        lang: Subtitle language wanted
        """
-        ep_name = fixStupidEncodings(ep_name)
+        ep_name = toUnicode(ep_name)

        if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
            show = self._parseEp(ep_name)
@ -198,7 +198,7 @@ class EmailNotifier:
            return False

    def _parseEp(self, ep_name):
-        ep_name = fixStupidEncodings(ep_name)
+        ep_name = toUnicode(ep_name)

        sep = " - "
        titles = ep_name.split(sep)
--- a/sickbeard/notifiers/plex.py
+++ b/sickbeard/notifiers/plex.py
@ -25,7 +25,7 @@ import sickbeard
 from sickbeard import logger
 from sickbeard import common
 from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 from sickbeard.notifiers.xbmc import XBMCNotifier

--- a/sickbeard/notifiers/xbmc.py
+++ b/sickbeard/notifiers/xbmc.py
@ -26,7 +26,7 @@ import sickbeard
 from sickbeard import logger
 from sickbeard import common
 from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 try:
@ -236,9 +236,9 @@ class XBMCNotifier:
                base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
                authheader = "Basic %s" % base64string
                req.add_header("Authorization", authheader)
-                logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+                logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
            else:
-                logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+                logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)

            response = urllib2.urlopen(req)
            result = response.read().decode(sickbeard.SYS_ENCODING)
@ -248,7 +248,7 @@ class XBMCNotifier:
            return result

        except (urllib2.URLError, IOError), e:
-            logger.log(u"Warning: Couldn't contact XBMC HTTP at " + fixStupidEncodings(url) + " " + ex(e),
+            logger.log(u"Warning: Couldn't contact XBMC HTTP at " + toUnicode(url) + " " + ex(e),
                       logger.WARNING)
            return False

@ -379,9 +379,9 @@ class XBMCNotifier:
                base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
                authheader = "Basic %s" % base64string
                req.add_header("Authorization", authheader)
-                logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+                logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
            else:
-                logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+                logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)

            try:
                response = urllib2.urlopen(req)
@ -401,7 +401,7 @@ class XBMCNotifier:
                return False

        except IOError, e:
-            logger.log(u"Warning: Couldn't contact XBMC JSON API at " + fixStupidEncodings(url) + " " + ex(e),
+            logger.log(u"Warning: Couldn't contact XBMC JSON API at " + toUnicode(url) + " " + ex(e),
                       logger.WARNING)
            return False

--- a/sickbeard/nzbSplitter.py
+++ b/sickbeard/nzbSplitter.py
@ -29,7 +29,7 @@ from sickbeard import encodingKludge as ek
 from sickbeard.exceptions import ex

 from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 def getSeasonNZBs(name, urlData, season):
@ -85,7 +85,7 @@ def createNZBString(fileElements, xmlns):
    for curFile in fileElements:
        rootElement.append(stripNS(curFile, xmlns))

-    return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))
+    return xml.etree.ElementTree.tostring(toUnicode(rootElement))


 def saveNZB(nzbName, nzbString):
--- a/sickbeard/scene_exceptions.py
+++ b/sickbeard/scene_exceptions.py
@ -27,7 +27,7 @@ from sickbeard import helpers
 from sickbeard import name_cache
 from sickbeard import logger
 from sickbeard import db
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 exception_dict = {}
 anidb_exception_dict = {}
@ -234,7 +234,7 @@ def retrieve_exceptions():
            # if this exception isn't already in the DB then add it
            if cur_exception not in existing_exceptions:

-                cur_exception = fixStupidEncodings(cur_exception)
+                cur_exception = toUnicode(cur_exception)

                myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                            [cur_indexer_id, cur_exception, curSeason])
@ -267,7 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
        exceptionsCache[indexer_id][season] = scene_exceptions

    for cur_exception in scene_exceptions:
-        cur_exception = fixStupidEncodings(cur_exception)
+        cur_exception = toUnicode(cur_exception)

        myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                    [indexer_id, cur_exception, season])
--- a/sickbeard/show_name_helpers.py
+++ b/sickbeard/show_name_helpers.py
@ -234,7 +234,7 @@ def isGoodResult(name, show, log=True, season=-1):

    all_show_names = allPossibleShowNames(show, season=season)
    showNames = map(sanitizeSceneName, all_show_names) + all_show_names
-    showNames += map(unidecode, all_show_names)
+    showNames += map(ek.toUnicode, all_show_names)

    for curName in set(showNames):
        if not show.is_anime:
--- a/sickbeard/tvcache.py
+++ b/sickbeard/tvcache.py
@ -33,7 +33,7 @@ from sickbeard.exceptions import AuthException
 from sickbeard.rssfeeds import RSSFeeds
 from sickbeard import clients
 from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 class CacheDBConnection(db.DBConnection):
    def __init__(self, providerName):
@ -263,7 +263,7 @@ class TVCache():
            # get quality of release
            quality = parse_result.quality

-            name = fixStupidEncodings(name)
+            name = toUnicode(name)

            # get release group
            release_group = parse_result.release_group
--- a/sickbeard/webserve.py
+++ b/sickbeard/webserve.py
@ -3288,7 +3288,7 @@ class ErrorLogs(MainHandler):

        for x in reversed(data):

-            x = ek.fixStupidEncodings(x)
+            x = ek.toUnicode(x)
            match = re.match(regex, x)

            if match:
--- a/tests/all_tests.py
+++ b/tests/all_tests.py
@ -18,23 +18,32 @@
 # You should have received a copy of the GNU General Public License
 # along with SickRage.  If not, see <http://www.gnu.org/licenses/>.

+import glob
+import unittest
+import sys
+
+class AllTests(unittest.TestCase):
+    """Run every *_tests.py file in the current directory as one combined suite."""
+
+    def setUp(self):
+        # every matching file in the working directory, skipping names contained in this file's own path
+        self.test_file_strings = [x for x in glob.glob('*_tests.py') if x not in __file__]
+        # strip the ".py" suffix to get the names handed to the test loader
+        self.module_strings = [file_string[:-3] for file_string in self.test_file_strings]
+        self.suites = [unittest.defaultTestLoader.loadTestsFromName(module_string) for module_string in self.module_strings]
+        self.testSuite = unittest.TestSuite(self.suites)
+
+    def testAll(self):
+        """List the included test files, run the combined suite and fail unless it passed."""
+        print "=================="
+        print "STARTING - ALL TESTS"
+        print "=================="
+        for includedfiles in self.test_file_strings:
+            print "- " + includedfiles
+
+        result = unittest.TextTestRunner().run(self.testSuite)
+        # report an unsuccessful run as a normal test failure instead of calling sys.exit() from inside a test
+        self.assertTrue(result.wasSuccessful())
+
 if __name__ == "__main__":
-    import glob
-    import unittest
-    import sys
-
-    test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
-    module_strings = [file_string[0:len(file_string) - 3] for file_string in test_file_strings]
-    suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in module_strings]
-    testSuite = unittest.TestSuite(suites)
-
-    print "=================="
-    print "STARTING - ALL TESTS"
-    print "=================="
-    print "this will include"
-    for includedfiles in test_file_strings:
-        print "- " + includedfiles
-
-    text_runner = unittest.TextTestRunner().run(testSuite)
-    if not text_runner.wasSuccessful():
-        sys.exit(-1)
+    unittest.main()
--- a/tests/common_tests.py
+++ b/tests/common_tests.py
@ -8,7 +8,6 @@ sys.path.append(os.path.abspath('../lib'))

 from sickbeard import common

-
 class QualityTests(unittest.TestCase):

    # TODO: repack / proper ? air-by-date ? season rip? multi-ep?
--- a/tests/test_lib.py
+++ b/tests/test_lib.py
@ -51,7 +51,6 @@ EPISODE = 2
 FILENAME = u"show name - s0" + str(SEASON) + "e0" + str(EPISODE) + ".mkv"
 FILEDIR = os.path.join(TESTDIR, SHOWNAME)
 FILEPATH = os.path.join(FILEDIR, FILENAME)
-
 SHOWDIR = os.path.join(TESTDIR, SHOWNAME + " final")

 #sickbeard.logger.sb_log_instance = sickbeard.logger.SBRotatingLogHandler(os.path.join(TESTDIR, 'sickbeard.log'), sickbeard.logger.NUM_LOGS, sickbeard.logger.LOG_SIZE)