Added FTFY module to help with any encoding/decoding issues

2025-03-03 01:52:02 -05:00 · 2014-11-24 13:42:30 -08:00 · 2014-11-24 13:42:30 -08:00 · 6a140aa907
commit 6a140aa907
parent f73aee78cc
22 changed files with 2039 additions and 36 deletions
--- a/lib/ftfy/init.py
+++ b/lib/ftfy/init.py
@ -0,0 +1,351 @@
+# -*- coding: utf-8 -*-
+"""
+ftfy: fixes text for you
+
+This is a module for making text less broken. See the `fix_text` function
+for more information.
+"""
+
+from __future__ import unicode_literals
+
+# See the docstring for ftfy.bad_codecs to see what we're doing here.
+import ftfy.bad_codecs
+ftfy.bad_codecs.ok()
+
+from ftfy import fixes
+from ftfy.fixes import fix_text_encoding
+from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
+import unicodedata
+import warnings
+
+
+def fix_text(text,
+             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
+             fix_entities='auto',
+             remove_terminal_escapes=True,
+             fix_encoding=True,
+             normalization='NFKC',
+             uncurl_quotes=True,
+             fix_line_breaks=True,
+             remove_control_chars=True,
+             remove_bom=True,
+             max_decode_length=2**16):
+    r"""
+    Given Unicode text as input, make its representation consistent and
+    possibly less broken.
+
+    Let's start with some examples:
+
+        >>> print(fix_text('uÌˆnicode'))
+        ünicode
+
+        >>> print(fix_text('Broken text&hellip; it&#x2019;s ﬂubberiﬁc!'))
+        Broken text... it's flubberific!
+
+        >>> print(fix_text('HTML entities &lt;3'))
+        HTML entities <3
+
+        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
+        <em>HTML entities &lt;3</em>
+
+        >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
+        ...               'doo&#133;\033[0m'))
+        I'm blue, da ba dee da ba doo...
+
+        >>> # This example string starts with a byte-order mark, even if
+        >>> # you can't see it on the Web.
+        >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
+        Party like
+        it's 1999!
+
+        >>> len(fix_text('ﬁ' * 100000))
+        200000
+
+        >>> len(fix_text(''))
+        0
+
+    Based on the options you provide, ftfy applies these steps in order:
+
+    - If `remove_unsafe_private_use` is True, it removes a range of private-use
+      characters that could trigger a Python bug. The bug is fixed in
+      the most recent versions of Python, so this will default to False
+      starting on Python 3.4.
+    - If `fix_entities` is True, replace HTML entities with their equivalent
+      characters. If it's "auto" (the default), then consider replacing HTML
+      entities, but don't do so in text where you have seen a pair of actual
+      angle brackets (that's probably actually HTML and you shouldn't mess
+      with the entities).
+    - If `remove_terminal_escapes` is True, remove sequences of bytes that are
+      instructions for Unix terminals, such as the codes that make text appear
+      in different colors.
+    - If `fix_encoding` is True, look for common mistakes that come from
+      encoding or decoding Unicode text incorrectly, and fix them if they are
+      reasonably fixable. See `fix_text_encoding` for details.
+    - If `normalization` is not None, apply the specified form of Unicode
+      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+      The default, 'NFKC', applies the following relevant transformations:
+
+      - C: Combine characters and diacritics that are written using separate
+        code points, such as converting "e" plus an acute accent modifier
+        into "é", or converting "ka" (か) plus a dakuten into the
+        single character "ga" (が).
+      - K: Replace characters that are functionally equivalent with the most
+        common form. For example, half-width katakana will be replaced with
+        full-width versions, full-width Roman characters will be replaced with
+        ASCII characters, ellipsis characters will be replaced with three
+        periods, and the ligature 'ﬂ' will be replaced with 'fl'.
+
+    - If `uncurl_quotes` is True, replace various curly quotation marks with
+      plain-ASCII straight quotes.
+    - If `fix_line_breaks` is true, convert all line breaks to Unix style
+      (CRLF and CR line breaks become LF line breaks).
+    - If `remove_control_chars` is true, remove all C0 control characters
+      except the common useful ones: TAB, CR, LF, and FF. (CR characters
+      may have already been removed by the `fix_line_breaks` step.)
+    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
+    - If anything was changed, repeat all the steps, so that the function is
+      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".
+
+    `fix_text` will work one line at a time, with the possibility that some
+    lines are in different encodings. When it encounters lines longer than
+    `max_decode_length`, it will not run the `fix_encoding` step, to avoid
+    unbounded slowdowns.
+
+    If you are certain your entire text is in the same encoding (though that
+    encoding is possibly flawed), and do not mind performing operations on
+    the whole text at once, use `fix_text_segment`.
+
+    Returns the fixed text as a single string. Raises `UnicodeError` if
+    `text` is a byte string instead of Unicode text.
+    """
+    if isinstance(text, bytes):
+        raise UnicodeError(fixes.BYTES_ERROR_TEXT)
+
+    # Walk through the text one line at a time. Each segment keeps its
+    # trailing '\n', so joining the fixed segments reassembles the text.
+    out = []
+    pos = 0
+    while pos < len(text):
+        # `find` returns -1 when there is no further newline, so after the
+        # `+ 1` a value of 0 means "the rest of the text is the last line".
+        textbreak = text.find('\n', pos) + 1
+        fix_encoding_this_time = fix_encoding
+        if textbreak == 0:
+            textbreak = len(text)
+        if (textbreak - pos) > max_decode_length:
+            # Overly long line: skip the encoding fix for this segment only.
+            fix_encoding_this_time = False
+
+        substring = text[pos:textbreak]
+
+        if fix_entities == 'auto' and '<' in substring and '>' in substring:
+            # we see angle brackets together; this could be HTML
+            # The parameter itself is overwritten, so entity unescaping stays
+            # off for every later line of this call as well.
+            fix_entities = False
+
+        out.append(
+            fix_text_segment(
+                substring,
+                remove_unsafe_private_use=remove_unsafe_private_use,
+                fix_entities=fix_entities,
+                remove_terminal_escapes=remove_terminal_escapes,
+                fix_encoding=fix_encoding_this_time,
+                normalization=normalization,
+                uncurl_quotes=uncurl_quotes,
+                fix_line_breaks=fix_line_breaks,
+                remove_control_chars=remove_control_chars,
+                remove_bom=remove_bom
+            )
+        )
+        pos = textbreak
+
+    return ''.join(out)
+
+# Short alias for `fix_text`, so the function is also reachable as `ftfy`.
+ftfy = fix_text
+
+
+def fix_file(input_file,
+             remove_unsafe_private_use=True,
+             fix_entities='auto',
+             remove_terminal_escapes=True,
+             fix_encoding=True,
+             normalization='NFKC',
+             uncurl_quotes=True,
+             fix_line_breaks=True,
+             remove_control_chars=True,
+             remove_bom=True):
+    """
+    Fix text that is found in a file.
+
+    If the file is being read as Unicode text, use that. If it's being read as
+    bytes, then unfortunately, we have to guess what encoding it is. We'll try
+    a few common encodings, but we make no promises. See the `guess_bytes`
+    function for how this is done.
+
+    The output is a stream of fixed lines of text.
+    """
+    entities = fix_entities
+    for line in input_file:
+        if isinstance(line, bytes):
+            line, encoding = guess_bytes(line)
+        if fix_entities == 'auto' and '<' in line and '>' in line:
+            entities = False
+        yield fix_text_segment(
+            line,
+            remove_unsafe_private_use=remove_unsafe_private_use,
+            fix_entities=entities,
+            remove_terminal_escapes=remove_terminal_escapes,
+            fix_encoding=fix_encoding,
+            normalization=normalization,
+            uncurl_quotes=uncurl_quotes,
+            fix_line_breaks=fix_line_breaks,
+            remove_control_chars=remove_control_chars,
+            remove_bom=remove_bom
+        )
+
+
+def fix_text_segment(text,
+                     remove_unsafe_private_use=True,
+                     fix_entities='auto',
+                     remove_terminal_escapes=True,
+                     fix_encoding=True,
+                     normalization='NFKC',
+                     uncurl_quotes=True,
+                     fix_line_breaks=True,
+                     remove_control_chars=True,
+                     remove_bom=True):
+    """
+    Apply fixes to text in a single chunk. This could be a line of text
+    within a larger run of `fix_text`, or it could be a larger amount
+    of text that you are certain is all in the same encoding.
+
+    See `fix_text` for a description of the parameters.
+    """
+    if isinstance(text, bytes):
+        raise UnicodeError(fixes.BYTES_ERROR_TEXT)
+
+    if fix_entities == 'auto' and '<' in text and '>' in text:
+        fix_entities = False
+    while True:
+        origtext = text
+        if remove_unsafe_private_use:
+            text = fixes.remove_unsafe_private_use(text)
+        if fix_entities:
+            text = fixes.unescape_html(text)
+        if remove_terminal_escapes:
+            text = fixes.remove_terminal_escapes(text)
+        if fix_encoding:
+            text = fixes.fix_text_encoding(text)
+        if normalization is not None:
+            text = unicodedata.normalize(normalization, text)
+        if uncurl_quotes:
+            text = fixes.uncurl_quotes(text)
+        if fix_line_breaks:
+            text = fixes.fix_line_breaks(text)
+        if remove_control_chars:
+            text = fixes.remove_control_chars(text)
+        if remove_bom:
+            text = fixes.remove_bom(text)
+        if text == origtext:
+            return text
+
+
+def guess_bytes(bstring):
+    """
+    If you have some bytes in an unknown encoding, here's a reasonable
+    strategy for decoding them, by trying a few common encodings that
+    can be distinguished from each other.
+
+    This is not a magic bullet. If the bytes are coming from some MySQL
+    database with the "character set" set to ISO Elbonian, this won't figure
+    it out. Perhaps more relevantly, this currently doesn't try East Asian
+    encodings.
+
+    The encodings we try are:
+
+    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
+      like nothing else
+    - UTF-8, because it's the global de facto standard
+    - "utf-8-variants", because it's what people actually implement when they
+      think they're doing UTF-8
+    - MacRoman, because Microsoft Office thinks it's still a thing, and it
+      can be distinguished by its line breaks. (If there are no line breaks in
+      the string, though, you're out of luck.)
+    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
+      single-byte encoding
+    """
+    if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
+        return bstring.decode('utf-16'), 'utf-16'
+
+    byteset = set(bytes(bstring))
+    byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'
+
+    try:
+        if byte_ed in byteset or byte_c0 in byteset:
+            # Byte 0xed can be used to encode a range of codepoints that
+            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
+            # so when we see 0xed, it's very likely we're being asked to
+            # decode CESU-8, the variant that encodes UTF-16 surrogates
+            # instead of the original characters themselves.
+            #
+            # This will occasionally trigger on standard UTF-8, as there
+            # are some Korean characters that also use byte 0xed, but that's
+            # not harmful.
+            #
+            # Byte 0xc0 is impossible because, numerically, it would only
+            # encode characters lower than U+0040. Those already have
+            # single-byte representations, and UTF-8 requires using the
+            # shortest possible representation. However, Java hides the null
+            # codepoint, U+0000, in a non-standard longer representation -- it
+            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
+            # will never appear in the encoded bytes.
+            #
+            # The 'utf-8-variants' decoder can handle both of these cases, as
+            # well as standard UTF-8, at the cost of a bit of speed.
+            return bstring.decode('utf-8-variants'), 'utf-8-variants'
+        else:
+            return bstring.decode('utf-8'), 'utf-8'
+    except UnicodeDecodeError:
+        pass
+
+    if byte_CR in bstring and byte_LF not in bstring:
+        return bstring.decode('macroman'), 'macroman'
+    else:
+        return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
+
+
+def explain_unicode(text):
+    """
+    A utility method that's useful for debugging mysterious Unicode.
+
+    It breaks down a string, showing you for each codepoint its number in
+    hexadecimal, its glyph, its category in the Unicode standard, and its name
+    in the Unicode standard.
+
+        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
+        U+0028  (       [Ps] LEFT PARENTHESIS
+        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
+        U+00B0  °       [So] DEGREE SIGN
+        U+25A1  □       [So] WHITE SQUARE
+        U+00B0  °       [So] DEGREE SIGN
+        U+0029  )       [Pe] RIGHT PARENTHESIS
+        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
+        U+FE35  ︵       [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
+        U+0020          [Zs] SPACE
+        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
+        U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
+        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
+    """
+    for char in text:
+        if is_printable(char):
+            display = char
+        else:
+            display = char.encode('unicode-escape').decode('ascii')
+        print('U+{code:04X}  {display:<7} [{category}] {name}'.format(
+            display=display,
+            code=ord(char),
+            category=unicodedata.category(char),
+            name=unicodedata.name(char, '<unknown>')
+        ))
+
+
+def fix_bad_encoding(text):
+    """
+    Kept for compatibility with previous versions of ftfy.
+    """
+    warnings.warn(
+        'fix_bad_encoding is now known as fix_text_encoding',
+        DeprecationWarning
+    )
+    return fix_text_encoding(text)
--- a/lib/ftfy/bad_codecs/init.py
+++ b/lib/ftfy/bad_codecs/init.py
@ -0,0 +1,94 @@
+# coding: utf-8
+r"""
+Give Python the ability to decode some common, flawed encodings.
+
+Python does not want you to be sloppy with your text. Its encoders and decoders
+("codecs") follow the relevant standards whenever possible, which means that
+when you get text that *doesn't* follow those standards, you'll probably fail
+to decode it. Or you might succeed at decoding it for implementation-specific
+reasons, which is perhaps worse.
+
+There are some encodings out there that Python wishes didn't exist, which are
+widely used outside of Python:
+
+- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
+  ever-popular CESU-8 and "Java modified UTF-8".
+- "Sloppy" versions of character map encodings, where bytes that don't map to
+  anything will instead map to the Unicode character with the same number.
+
+Simply importing this module, or in fact any part of the `ftfy` package, will
+make these new "bad codecs" available to Python through the standard Codecs
+API. You never have to actually call any functions inside `ftfy.bad_codecs`.
+
+However, if you want to call something because your code checker insists on it,
+you can call ``ftfy.bad_codecs.ok()``.
+
+A quick example of decoding text that's encoded in CESU-8:
+
+    >>> import ftfy.bad_codecs
+    >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
+    😍
+"""
+from __future__ import unicode_literals
+from encodings import normalize_encoding
+import codecs
+
+# Codecs that `search_function` has already resolved, keyed by the encoding
+# name exactly as it was passed in. Only successful lookups are stored.
+_CACHE = {}
+
+# Define some aliases for 'utf-8-variants'. All hyphens get turned into
+# underscores, because of `normalize_encoding`.
+# `search_function` compares the normalized encoding name against this tuple.
+UTF8_VAR_NAMES = (
+    'utf_8_variants', 'utf8_variants',
+    'utf_8_variant', 'utf8_variant',
+    'utf_8_var', 'utf8_var',
+    'cesu_8', 'cesu8',
+    'java_utf_8', 'java_utf8'
+)
+
+
+def search_function(encoding):
+    """
+    Register our "bad codecs" with Python's codecs API. This involves adding
+    a search function that takes in an encoding name, and returns a codec
+    for that encoding if it knows one, or None if it doesn't.
+
+    The encodings this will match are:
+
+    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
+      where the non-sloppy version is an encoding that leaves some bytes
+      unmapped to characters.
+    - The 'utf-8-variants' encoding, which has the several aliases seen
+      above.
+    """
+    if encoding in _CACHE:
+        return _CACHE[encoding]
+
+    norm_encoding = normalize_encoding(encoding)
+    codec = None
+    if norm_encoding in UTF8_VAR_NAMES:
+        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
+        codec = CODEC_INFO
+    elif norm_encoding.startswith('sloppy_'):
+        from ftfy.bad_codecs.sloppy import CODECS
+        codec = CODECS.get(norm_encoding)
+
+    if codec is not None:
+        _CACHE[encoding] = codec
+
+    return codec
+
+
+def ok():
+    """
+    A feel-good function that gives you something to call after importing
+    this package.
+
+    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
+    and appear not to use it. It doesn't know that you're using it when
+    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
+    encodings.
+    """
+    pass
+
+
+codecs.register(search_function)
--- a/lib/ftfy/bad_codecs/sloppy.py
+++ b/lib/ftfy/bad_codecs/sloppy.py
@ -0,0 +1,156 @@
+# coding: utf-8
+r"""
+Decodes single-byte encodings, filling their "holes" in the same messy way that
+everyone else does.
+
+A single-byte encoding maps each byte to a Unicode character, except that some
+bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
+example, bytes 0x81 and 0x8D, among others, have no meaning.
+
+Python, wanting to preserve some sense of decorum, will handle these bytes
+as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
+different from each other. It just hasn't defined what they are in terms of
+Unicode.
+
+Software that has to interoperate with Windows-1252 and Unicode -- such as all
+the common Web browsers -- will pick some Unicode characters for them to map
+to, and the characters they pick are the Unicode characters with the same
+numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
+resulting characters tend to fall into a range of Unicode that's set aside for
+obsolete Latin-1 control characters anyway.
+
+These sloppy codecs let Python do the same thing, thus interoperating with
+other software that works this way. It defines a sloppy version of many
+single-byte encodings with holes. (There is no need for a sloppy version of
+an encoding without holes: for example, there is no such thing as
+sloppy-iso-8859-2 or sloppy-macroman.)
+
+The following encodings will become defined:
+
+- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
+- sloppy-windows-1251 (Cyrillic)
+- sloppy-windows-1252 (Western European, based on Latin-1)
+- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
+- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
+- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
+- sloppy-windows-1256 (Arabic)
+- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
+- sloppy-windows-1258 (Vietnamese)
+- sloppy-cp874 (Thai, based on ISO-8859-11)
+- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
+- sloppy-iso-8859-6 (different Arabic)
+- sloppy-iso-8859-7 (Greek)
+- sloppy-iso-8859-8 (Hebrew)
+- sloppy-iso-8859-11 (Thai)
+
+Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
+defined.
+
+Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
+the rest are rather uncommon.
+
+Here are some examples, using `ftfy.explain_unicode` to illustrate how
+sloppy-windows-1252 merges Windows-1252 with Latin-1:
+
+    >>> from ftfy import explain_unicode
+    >>> some_bytes = b'\x80\x81\x82'
+    >>> explain_unicode(some_bytes.decode('latin-1'))
+    U+0080  \x80    [Cc] <unknown>
+    U+0081  \x81    [Cc] <unknown>
+    U+0082  \x82    [Cc] <unknown>
+
+    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
+    U+20AC  €       [Sc] EURO SIGN
+    U+FFFD  �       [So] REPLACEMENT CHARACTER
+    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
+
+    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
+    U+20AC  €       [Sc] EURO SIGN
+    U+0081  \x81    [Cc] <unknown>
+    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
+"""
+from __future__ import unicode_literals
+import codecs
+from encodings import normalize_encoding
+
+REPLACEMENT_CHAR = '\ufffd'
+
+
+def make_sloppy_codec(encoding):
+    """
+    Take a codec name, and return a 'sloppy' version of that codec that can
+    encode and decode the unassigned bytes in that encoding.
+
+    Single-byte encodings in the standard library are defined using some
+    boilerplate classes surrounding the functions that do the actual work,
+    `codecs.charmap_decode` and `charmap_encode`. This function, given an
+    encoding name, *defines* those boilerplate classes.
+    """
+    # Make an array of all 256 possible bytes.
+    all_bytes = bytearray(range(256))
+
+    # Get a list of what they would decode to in Latin-1.
+    sloppy_chars = list(all_bytes.decode('latin-1'))
+
+    # Get a list of what they decode to in the given encoding. Use the
+    # replacement character for unassigned bytes.
+    decoded_chars = all_bytes.decode(encoding, errors='replace')
+
+    # Update the sloppy_chars list. Each byte that was successfully decoded
+    # gets its decoded value in the list. The unassigned bytes are left as
+    # they are, which gives their decoding in Latin-1.
+    for i, char in enumerate(decoded_chars):
+        if char != REPLACEMENT_CHAR:
+            sloppy_chars[i] = char
+
+    # Create the data structures that tell the charmap methods how to encode
+    # and decode in this sloppy encoding.
+    decoding_table = ''.join(sloppy_chars)
+    encoding_table = codecs.charmap_build(decoding_table)
+
+    # Now produce all the class boilerplate. Look at the Python source for
+    # `encodings.cp1252` for comparison; this is almost exactly the same,
+    # except I made it follow pep8.
+    class Codec(codecs.Codec):
+        def encode(self, input, errors='strict'):
+            return codecs.charmap_encode(input, errors, encoding_table)
+
+        def decode(self, input, errors='strict'):
+            return codecs.charmap_decode(input, errors, decoding_table)
+
+    class IncrementalEncoder(codecs.IncrementalEncoder):
+        def encode(self, input, final=False):
+            return codecs.charmap_encode(input, self.errors, encoding_table)[0]
+
+    class IncrementalDecoder(codecs.IncrementalDecoder):
+        def decode(self, input, final=False):
+            return codecs.charmap_decode(input, self.errors, decoding_table)[0]
+
+    class StreamWriter(Codec, codecs.StreamWriter):
+        pass
+
+    class StreamReader(Codec, codecs.StreamReader):
+        pass
+
+    return codecs.CodecInfo(
+        name='sloppy-' + encoding,
+        encode=Codec().encode,
+        decode=Codec().decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
+
+# Define a codec for each incomplete encoding. The resulting CODECS dictionary
+# can be used by the main module of ftfy.bad_codecs.
+#
+# Keys are the sloppy names after `normalize_encoding`, and values are the
+# `codecs.CodecInfo` objects built by `make_sloppy_codec`. All of them are
+# built eagerly, when this module is imported.
+CODECS = {}
+INCOMPLETE_ENCODINGS = (
+    ['windows-%s' % num for num in range(1250, 1259)] +
+    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
+    # The 'cpNNNN' spellings get their own entries, which is how aliases such
+    # as 'sloppy-cp1252' come to exist.
+    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
+)
+
+for _encoding in INCOMPLETE_ENCODINGS:
+    _new_name = normalize_encoding('sloppy-' + _encoding)
+    CODECS[_new_name] = make_sloppy_codec(_encoding)
--- a/lib/ftfy/bad_codecs/utf8_variants.py
+++ b/lib/ftfy/bad_codecs/utf8_variants.py
@ -0,0 +1,281 @@
+r"""
+This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
+decode text that's been encoded with a popular non-standard version of UTF-8.
+This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
+UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
+codepoint 0.
+
+This is particularly relevant in Python 3, which provides no other way of
+decoding CESU-8 or Java's encoding. [1]
+
+The easiest way to use the codec is to simply import `ftfy.bad_codecs`:
+
+    >>> import ftfy.bad_codecs
+    >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
+    >>> print(repr(result).lstrip('u'))
+    'here comes a null! \x00'
+
+The codec does not at all enforce "correct" CESU-8. For example, the Unicode
+Consortium's not-quite-standard describing CESU-8 requires that there is only
+one possible encoding of any character, so it does not allow mixing of valid
+UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
+decoder does.
+
+Characters in the Basic Multilingual Plane still have only one encoding. This
+codec still enforces the rule, within the BMP, that characters must appear in
+their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
+instead of just `0x00`, may be used to encode the null character `U+0000`, like
+in Java.
+
+If you encode with this codec, you get legitimate UTF-8. Decoding with this
+codec and then re-encoding is not idempotent, although encoding and then
+decoding is. So this module won't produce CESU-8 for you. Look for that
+functionality in the sister module, "Breaks Text For You", coming approximately
+never.
+
+[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
+decode the bytes (incorrectly), then encode them, then decode them again, using
+UTF-8 as the codec every time.
+"""
+
+from __future__ import unicode_literals
+from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
+from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
+                             IncrementalEncoder as UTF8IncrementalEncoder)
+import re
+import codecs
+
+# Name of the codec defined in this module.
+# NOTE(review): presumably the name given to the CodecInfo further down in
+# this file -- confirm against the rest of the module.
+NAME = 'utf-8-variants'
+# This regular expression matches all possible six-byte CESU-8 sequences.
+# That is two three-byte groups that each start with 0xed: the second byte is
+# in 0xa0-0xaf for the first group and in 0xb0-0xbf for the second.
+CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')
+
+
+class IncrementalDecoder(UTF8IncrementalDecoder):
+    """
+    An incremental decoder that extends Python's built-in UTF-8 decoder.
+
+    This decoder needs to take in bytes, possibly arriving in a stream, and
+    output the correctly decoded text. The general strategy for doing this
+    is to fall back on the real UTF-8 decoder whenever possible, because
+    the real UTF-8 decoder is way optimized, but to call specialized methods
+    we define here for the cases the real decoder isn't expecting.
+    """
+    def _buffer_decode(self, input, errors, final):
+        """
+        Decode bytes that may be arriving in a stream, following the Codecs
+        API.
+
+        `input` is the incoming sequence of bytes. `errors` tells us how to
+        handle errors, though we delegate all error-handling cases to the real
+        UTF-8 decoder to ensure correct behavior. `final` indicates whether
+        this is the end of the sequence, in which case we should raise an
+        error given incomplete input.
+
+        Returns as much decoded text as possible, and the number of bytes
+        consumed.
+        """
+        # decoded_segments are the pieces of text we have decoded so far,
+        # and position is our current position in the byte string. (Bytes
+        # before this position have been consumed, and bytes after it have
+        # yet to be decoded.)
+        decoded_segments = []
+        position = 0
+        while True:
+            # Use _buffer_decode_step to decode a segment of text.
+            decoded, consumed = self._buffer_decode_step(
+                input[position:],
+                errors,
+                final
+            )
+            if consumed == 0:
+                # Either there's nothing left to decode, or we need to wait
+                # for more input. Either way, we're done for now.
+                break
+
+            # Append the decoded text to the list, and update our position.
+            decoded_segments.append(decoded)
+            position += consumed
+
+        if final:
+            # _buffer_decode_step must consume all the bytes when `final` is
+            # true.
+            assert position == len(input)
+
+        return ''.join(decoded_segments), position
+
+    def _buffer_decode_step(self, input, errors, final):
+        """
+        There are three possibilities for each decoding step:
+
+        - Decode as much real UTF-8 as possible.
+        - Decode a six-byte CESU-8 sequence at the current position.
+        - Decode a Java-style null at the current position.
+
+        This method figures out which step is appropriate, and does it.
+        """
+        # Get a reference to the superclass method that we'll be using for
+        # most of the real work.
+        sup = UTF8IncrementalDecoder._buffer_decode
+
+        # Find the next byte position that indicates a variant of UTF-8.
+        # CESU-8 sequences always start with 0xed, and Java nulls always
+        # start with 0xc0, both of which are conveniently impossible in
+        # real UTF-8.
+        cutoff1 = input.find(b'\xed')
+        cutoff2 = input.find(b'\xc0')
+
+        # Set `cutoff` to whichever cutoff comes first.
+        if cutoff1 != -1 and cutoff2 != -1:
+            cutoff = min(cutoff1, cutoff2)
+        elif cutoff1 != -1:
+            cutoff = cutoff1
+        elif cutoff2 != -1:
+            cutoff = cutoff2
+        else:
+            # The entire input can be decoded as UTF-8, so just do so.
+            return sup(input, errors, final)
+
+        if cutoff1 == 0:
+            # Decode a possible six-byte sequence starting with 0xed.
+            return self._buffer_decode_surrogates(sup, input, errors, final)
+        elif cutoff2 == 0:
+            # Decode a possible two-byte sequence, 0xc0 0x80.
+            return self._buffer_decode_null(sup, input, errors, final)
+        else:
+            # Decode the bytes up until the next weird thing as UTF-8.
+            # Set final=True because 0xc0 and 0xed don't make sense in the
+            # middle of a sequence, in any variant.
+            return sup(input[:cutoff], errors, True)
+
+    @staticmethod
+    def _buffer_decode_null(sup, input, errors, final):
+        """
+        Decode the bytes 0xc0 0x80 as U+0000, like Java does.
+        """
+        nextbyte = input[1:2]
+        if nextbyte == b'':
+            if final:
+                # We found 0xc0 at the end of the stream, which is an error.
+                # Delegate to the superclass method to handle that error.
+                return sup(input, errors, final)
+            else:
+                # We found 0xc0 and we don't know what comes next, so consume
+                # no bytes and wait.
+                return '', 0
+        elif nextbyte == b'\x80':
+            # We found the usual 0xc0 0x80 sequence, so decode it and consume
+            # two bytes.
+            return '\u0000', 2
+        else:
+            # We found 0xc0 followed by something else, which is an error.
+            # Whatever should happen is equivalent to what happens when the
+            # superclass is given just the byte 0xc0, with final=True.
+            return sup(b'\xc0', errors, True)
+
+    @staticmethod
+    def _buffer_decode_surrogates(sup, input, errors, final):
+        """
+        When we have improperly encoded surrogates, we can still see the
+        bits that they were meant to represent.
+
+        The surrogates were meant to encode a 20-bit number, to which we
+        add 0x10000 to get a codepoint. That 20-bit number now appears in
+        this form:
+
+          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
+
+        The CESU8_RE above matches byte sequences of this form. Then we need
+        to extract the bits and assemble a codepoint number from them.
+        """
+        if len(input) < 6:
+            if final:
+                # We found 0xed near the end of the stream, and there aren't
+                # six bytes to decode. Delegate to the superclass method to
+                # handle it as normal UTF-8. It might be a Hangul character
+                # or an error.
+                if PYTHON2 and len(input) >= 3:
+                    # We can't trust Python 2 to raise an error when it's
+                    # asked to decode a surrogate, so let's force the issue.
+                    input = mangle_surrogates(input)
+                return sup(input, errors, final)
+            else:
+                # We found 0xed, the stream isn't over yet, and we don't know
+                # enough of the following bytes to decode anything, so consume
+                # zero bytes and wait.
+                return '', 0
+        else:
+            if CESU8_RE.match(input):
+                # If this is a CESU-8 sequence, do some math to pull out
+                # the intended 20-bit value, and consume six bytes.
+                bytenums = bytes_to_ints(input[:6])
+                codepoint = (
+                    ((bytenums[1] & 0x0f) << 16) +
+                    ((bytenums[2] & 0x3f) << 10) +
+                    ((bytenums[4] & 0x0f) << 6) +
+                    (bytenums[5] & 0x3f) +
+                    0x10000
+                )
+                return unichr(codepoint), 6
+            else:
+                # This looked like a CESU-8 sequence, but it wasn't one.
+                # 0xed indicates the start of a three-byte sequence, so give
+                # three bytes to the superclass to decode as usual -- except
+                # for working around the Python 2 discrepancy as before.
+                if PYTHON2:
+                    input = mangle_surrogates(input)
+                return sup(input[:3], errors, False)
+
+
+def mangle_surrogates(bytestring):
+    """
+    When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
+    it as an error (which it is). In 'replace' mode, it will decode as three
+    replacement characters. But Python 2 will just output the surrogate
+    codepoint.
+
+    To ensure consistency between Python 2 and Python 3, and protect downstream
+    applications from malformed strings, we turn surrogate sequences at the
+    start of the string into the bytes `ff ff ff`, which we're *sure* won't
+    decode, and which turn into three replacement characters in 'replace' mode.
+    """
+    if PYTHON2:
+        if bytestring.startswith(b'\xed') and len(bytestring) >= 3:
+            decoded = bytestring[:3].decode('utf-8', 'replace')
+            if '\ud800' <= decoded <= '\udfff':
+                return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:])
+        return bytestring
+    else:
+        # On Python 3, nothing needs to be done.
+        return bytestring
+
+# The encoder is identical to UTF-8, so UTF8IncrementalEncoder is reused
+# as is; the variant sequences are only dealt with when decoding.
+IncrementalEncoder = UTF8IncrementalEncoder
+
+
+# Everything below here is boilerplate that matches the modules in the
+# built-in `encodings` package.
+def encode(input, errors='strict'):
+    return IncrementalEncoder(errors).encode(input, final=True), len(input)
+
+
+def decode(input, errors='strict'):
+    return IncrementalDecoder(errors).decode(input, final=True), len(input)
+
+
+class StreamWriter(codecs.StreamWriter):
+    encode = encode
+
+
+class StreamReader(codecs.StreamReader):
+    decode = decode
+
+
+# The codec description that bundles the functions and classes defined in
+# this module under the name NAME.
+# NOTE(review): presumably this is what ftfy.bad_codecs hands to the codec
+# registry when this encoding is looked up -- confirm in that package.
+CODEC_INFO = codecs.CodecInfo(
+    name=NAME,
+    encode=encode,
+    decode=decode,
+    incrementalencoder=IncrementalEncoder,
+    incrementaldecoder=IncrementalDecoder,
+    streamreader=StreamReader,
+    streamwriter=StreamWriter,
+)
--- a/lib/ftfy/badness.py
+++ b/lib/ftfy/badness.py
@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+"""
+Heuristics to determine whether re-encoding text is actually making it
+more reasonable.
+"""
+
+from __future__ import unicode_literals
+from ftfy.chardata import chars_to_classes
+import re
+import unicodedata
+
+# The following regex uses the mapping of character classes to ASCII
+# characters defined in chardata.py and build_data.py:
+#
+# L = Latin capital letter
+# l = Latin lowercase letter
+# A = Non-latin capital or title-case letter
+# a = Non-latin lowercase letter
+# C = Non-cased letter (Lo)
+# X = Control character (Cc)
+# m = Letter modifier (Lm)
+# M = Mark (Mc, Me, Mn)
+# N = Miscellaneous numbers (No)
+# P = Private use (Co)
+# 0 = Math symbol (Sm)
+# 1 = Currency symbol (Sc)
+# 2 = Symbol modifier (Sk)
+# 3 = Other symbol (So)
+# S = UTF-16 surrogate
+# _ = Unassigned character
+#   = Whitespace
+# o = Other
+
+
+def _make_weirdness_regex():
+    """
+    Creates a list of regexes that match 'weird' character sequences.
+    The more matches there are, the weirder the text is.
+    """
+    groups = []
+
+    # Match lowercase letters that are followed by non-ASCII uppercase letters
+    groups.append('lA')
+
+    # Match diacritical marks, except when they modify a non-cased letter or
+    # another mark.
+    #
+    # You wouldn't put a diacritical mark on a digit or a space, for example.
+    # You might put it on a Latin letter, but in that case there will almost
+    # always be a pre-composed version, and we normalize to pre-composed
+    # versions first. The cases that can't be pre-composed tend to be in
+    # large scripts without case, which are in class C.
+    groups.append('[^CM]M')
+
+    # Match non-Latin characters adjacent to Latin characters.
+    #
+    # This is a simplification from ftfy version 2, which compared all
+    # adjacent scripts. However, the ambiguities we need to resolve come from
+    # encodings designed to represent Latin characters.
+    groups.append('[Ll][AaC]')
+    groups.append('[AaC][Ll]')
+
+    # Match C1 control characters, which are almost always the result of
+    # decoding Latin-1 that was meant to be Windows-1252.
+    groups.append('X')
+
+    # Match private use and unassigned characters.
+    groups.append('P')
+    groups.append('_')
+
+    # Match adjacent characters from any different pair of these categories:
+    # - Modifier marks (M)
+    # - Letter modifiers (m)
+    # - Miscellaneous numbers (N)
+    # - Symbols (0123)
+
+    exclusive_categories = 'MmN0123'
+    for cat1 in exclusive_categories:
+        others_range = ''.join(c for c in exclusive_categories if c != cat1)
+        groups.append('{cat1}[{others_range}]'.format(
+            cat1=cat1, others_range=others_range
+        ))
+    regex = '|'.join('({0})'.format(group) for group in groups)
+    return re.compile(regex)
+
+WEIRDNESS_RE = _make_weirdness_regex()
+
+# A few characters are common ending punctuation that can show up at the end
+# of a mojibake sequence. It's plausible that such a character could appear
+# after an accented capital letter, for example, so we'll want to add a
+# slight preference to leave these characters alone.
+#
+# The match ends with a + so that we only give the bonus once for a
+# consecutive sequence of these characters. The number of matches is what
+# sequence_weirdness subtracts from its score.
+ENDING_PUNCT_RE = re.compile(
+    '['
+    '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
+    '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
+    '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
+    '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
+    ']+'
+)
+
+def sequence_weirdness(text):
+    """
+    Determine how often a text has unexpected characters or sequences of
+    characters. This metric is used to disambiguate when text should be
+    re-decoded or left as is.
+
+    We start by normalizing text in NFC form, so that penalties for
+    diacritical marks don't apply to characters that know what to do with
+    them.
+
+    The following things are deemed weird:
+
+    - Lowercase letters followed by non-ASCII uppercase letters
+    - Non-Latin characters next to Latin characters
+    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
+      characters (in languages that do that kind of thing a lot) or other
+      marks
+    - C1 control characters
+    - Adjacent symbols from any different pair of these categories:
+
+        - Modifier marks
+        - Letter modifiers
+        - Non-digit numbers
+        - Symbols (including math and currency)
+
+    The return value is the number of instances of weirdness.
+    """
+    text2 = unicodedata.normalize('NFC', text)
+    weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
+    punct_discount = len(ENDING_PUNCT_RE.findall(text2))
+    return weirdness * 2 - punct_discount
+
+
+def text_cost(text):
+    """
+    An overall cost function for text. Weirder is worse, but all else being
+    equal, shorter strings are better.
+
+    The overall cost is measured as the "weirdness" (see
+    :func:`sequence_weirdness`) plus the length.
+    """
+    return sequence_weirdness(text) + len(text)
--- a/lib/ftfy/build_data.py
+++ b/lib/ftfy/build_data.py
@ -0,0 +1,111 @@
+"""
+A script to make the char_classes.dat file.
+
+This never needs to run in normal usage. It needs to be run if the character
+classes we care about change, or if a new version of Python supports a new
+Unicode standard and we want it to affect our string decoding.
+
+The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
+You can certainly use it in earlier versions. This simply makes sure that we
+get consistent results from running ftfy on different versions of Python.
+
+The file will be written to the current directory.
+"""
+from __future__ import unicode_literals
+import unicodedata
+import sys
+import zlib
+if sys.hexversion >= 0x03000000:
+    unichr = chr
+
+# L = Latin capital letter
+# l = Latin lowercase letter
+# A = Non-latin capital or title-case letter
+# a = Non-latin lowercase letter
+# C = Non-cased letter (Lo)
+# X = Control character (Cc)
+# m = Letter modifier (Lm)
+# M = Mark (Mc, Me, Mn)
+# N = Miscellaneous numbers (No)
+# P = Private use (Co)
+# 0 = Math symbol (Sm)
+# 1 = Currency symbol (Sc)
+# 2 = Symbol modifier (Sk)
+# 3 = Other symbol (So)
+# S = UTF-16 surrogate
+# _ = Unassigned character
+#   = Whitespace
+# o = Other
+
+
+def make_char_data_file(do_it_anyway=False):
+    """
+    Build the compressed data file 'char_classes.dat' and write it to the
+    current directory.
+
+    If you run this, run it in Python 3.3 or later. It will run in earlier
+    versions, but you won't get the current Unicode standard, leading to
+    inconsistent behavior. To protect against this, running this in the
+    wrong version of Python will raise an error unless you pass
+    `do_it_anyway=True`.
+    """
+    if sys.hexversion < 0x03030000 and not do_it_anyway:
+        raise RuntimeError(
+            "This function should be run in Python 3.3 or later."
+        )
+
+    cclasses = [None] * 0x110000
+    for codepoint in range(0x0, 0x110000):
+        char = unichr(codepoint)
+        category = unicodedata.category(char)
+
+        if category.startswith('L'):  # letters
+            is_latin = unicodedata.name(char).startswith('LATIN')
+            if is_latin and codepoint < 0x200:
+                if category == 'Lu':
+                    cclasses[codepoint] = 'L'
+                else:
+                    cclasses[codepoint] = 'l'
+            else:  # non-Latin letter, or close enough
+                if category == 'Lu' or category == 'Lt':
+                    cclasses[codepoint] = 'A'
+                elif category == 'Ll':
+                    cclasses[codepoint] = 'a'
+                elif category == 'Lo':
+                    cclasses[codepoint] = 'C'
+                elif category == 'Lm':
+                    cclasses[codepoint] = 'm'
+                else:
+                    raise ValueError('got some weird kind of letter')
+        elif category.startswith('M'):  # marks
+            cclasses[codepoint] = 'M'
+        elif category == 'No':
+            cclasses[codepoint] = 'N'
+        elif category == 'Sm':
+            cclasses[codepoint] = '0'
+        elif category == 'Sc':
+            cclasses[codepoint] = '1'
+        elif category == 'Sk':
+            cclasses[codepoint] = '2'
+        elif category == 'So':
+            cclasses[codepoint] = '3'
+        elif category == 'Cn':
+            cclasses[codepoint] = '_'
+        elif category == 'Cc':
+            cclasses[codepoint] = 'X'
+        elif category == 'Cs':
+            cclasses[codepoint] = 'S'
+        elif category == 'Co':
+            cclasses[codepoint] = 'P'
+        elif category.startswith('Z'):
+            cclasses[codepoint] = ' '
+        else:
+            cclasses[codepoint] = 'o'
+
+    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
+    out = open('char_classes.dat', 'wb')
+    out.write(zlib.compress(''.join(cclasses).encode('ascii')))
+    out.close()
+
+if __name__ == '__main__':
+    make_char_data_file()
--- a/lib/ftfy/char_classes.dat
+++ b/lib/ftfy/char_classes.dat
--- a/lib/ftfy/chardata.py
+++ b/lib/ftfy/chardata.py
@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+This gives other modules access to the gritty details about characters and the
+encodings that use them.
+"""
+
+from __future__ import unicode_literals
+import re
+import zlib
+from pkg_resources import resource_string
+from ftfy.compatibility import unichr
+
+# These are the five encodings we will try to fix in ftfy, in the
+# order that they should be tried.
+#
+# NOTE(review): the 'sloppy-' names are not standard-library codecs;
+# presumably they are made available by ftfy.bad_codecs -- confirm there.
+CHARMAP_ENCODINGS = [
+    'latin-1',
+    'sloppy-windows-1252',
+    'macroman',
+    'cp437',
+    'sloppy-windows-1251',
+]
+
+
+def _build_regexes():
+    """
+    ENCODING_REGEXES contain reasonably fast ways to detect if we
+    could represent a given string in a given encoding. The simplest one is
+    the 'ascii' detector, which of course just determines if all characters
+    are between U+0000 and U+007F.
+    """
+    # Define a regex that matches ASCII text.
+    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
+
+    for encoding in CHARMAP_ENCODINGS:
+        latin1table = ''.join(unichr(i) for i in range(128, 256))
+        charlist = latin1table.encode('latin-1').decode(encoding)
+
+        # Build a regex from the ASCII range, followed by the decodings of
+        # bytes 0x80-0xff in this character set. (This uses the fact that all
+        # regex special characters are ASCII, and therefore won't appear in the
+        # string.)
+        regex = '^[\x00-\x7f{}]*$'.format(charlist)
+        encoding_regexes[encoding] = re.compile(regex)
+    return encoding_regexes
+ENCODING_REGEXES = _build_regexes()
+
+
+def possible_encoding(text, encoding):
+    """
+    Given text and a single-byte encoding, check whether that text could have
+    been decoded from that single-byte encoding.
+
+    In other words, check whether it can be encoded in that encoding, possibly
+    sloppily.
+    """
+    return bool(ENCODING_REGEXES[encoding].match(text))
+
+
+# The character-class table, loaded from the packaged 'char_classes.dat'
+# file, decompressed and decoded as ASCII. chars_to_classes uses it as a
+# translation table.
+# NOTE(review): presumably the file written by build_data.py, holding one
+# class letter per codepoint -- confirm the shipped file matches.
+CHAR_CLASS_STRING = zlib.decompress(
+    resource_string(__name__, 'char_classes.dat')
+).decode('ascii')
+
+def chars_to_classes(string):
+    """
+    Convert each Unicode character to a letter indicating which of many
+    classes it's in.
+
+    See build_data.py for where this data comes from and what it means.
+    """
+    return string.translate(CHAR_CLASS_STRING)
+
+
+# A translate mapping that will strip all C0 control characters except
+# those that represent whitespace. Mapping a codepoint to None makes
+# `translate` delete that character.
+CONTROL_CHARS = {}
+for i in range(32):
+    CONTROL_CHARS[i] = None
+
+# Take the whitespace control characters (tab, line feed, form feed and
+# carriage return) back out of the mapping, so that they are left alone.
+for char in '\t\n\f\r':
+    del CONTROL_CHARS[ord(char)]
--- a/lib/ftfy/cli.py
+++ b/lib/ftfy/cli.py
@ -0,0 +1,34 @@
+"""
+A simple command-line utility for fixing text found in a file.
+
+Because files do not come with their encoding marked, it first runs the file
+through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
+"""
+from ftfy import fix_file
+
+import sys
+# True on Python 2, where main() encodes each line as UTF-8 before writing
+# it to sys.stdout; on Python 3 the text is written as is.
+ENCODE_STDOUT = (sys.hexversion < 0x03000000)
+
+
+def main():
+    """
+    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
+    the 'argparse' module.)
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='file to transcode')
+
+    args = parser.parse_args()
+
+    file = open(args.filename)
+    for line in fix_file(file):
+        if ENCODE_STDOUT:
+            sys.stdout.write(line.encode('utf-8'))
+        else:
+            sys.stdout.write(line)
+
+
+if __name__ == '__main__':
+    main()
--- a/lib/ftfy/compatibility.py
+++ b/lib/ftfy/compatibility.py
@ -0,0 +1,79 @@
+"""
+Makes some function names and behavior consistent between Python 2 and
+Python 3, and also between narrow and wide builds.
+"""
+from __future__ import unicode_literals
+import sys
+import re
+import unicodedata
+
+# Give the names `unichr`, `xrange` and `htmlentitydefs` the same meaning on
+# Python 2 and Python 3, and record which of the two we are running on.
+if sys.hexversion >= 0x03000000:
+    from html import entities
+    unichr = chr
+    xrange = range
+    PYTHON2 = False
+else:
+    import htmlentitydefs as entities
+    # Rebind the builtins as module-level names so that they can be imported
+    # from this module on either version.
+    unichr = unichr
+    xrange = xrange
+    PYTHON2 = True
+htmlentitydefs = entities
+
+# True on Python 3.4 and later.
+PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)
+
+
+def _narrow_unichr_workaround(codepoint):
+    """
+    A replacement for unichr() on narrow builds of Python. This will get
+    us the narrow representation of an astral character, which will be
+    a string of length two, containing two UTF-16 surrogates.
+    """
+    escaped = b'\\U%08x' % codepoint
+    return escaped.decode('unicode-escape')
+
+
+# Pick the unichr implementation and the UNSAFE_PRIVATE_USE_RE definition
+# that suit this build: sys.maxunicode is below 0x10000 on a narrow build.
+if sys.maxunicode < 0x10000:
+    unichr = _narrow_unichr_workaround
+    # In a narrow build of Python, we can't write a regex involving astral
+    # characters. If we want to write the regex:
+    #
+    #   [\U00100000-\U0010ffff]
+    #
+    # The actual string that defines it quietly turns into:
+    #
+    #   [\udbc0\udc00-\udbff\udfff]
+    #
+    # And now the range operator only applies to the middle two characters.
+    # It looks like a range that's going backwards from \udc00 to \udbff,
+    # which is an error.
+    #
+    # What we can do instead is rewrite the expression to be _about_ the two
+    # surrogates that make up the astral characters, instead of the characters
+    # themselves. This would be wrong on a wide build, but it works on a
+    # narrow build.
+    UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
+else:
+    # On a wide build the range of astral characters can be written directly.
+    UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')
+
+
+def bytes_to_ints(bytestring):
+    """
+    No matter what version of Python this is, make a sequence of integers from
+    a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a
+    sequence of integers.
+    """
+    if PYTHON2:
+        return [ord(b) for b in bytestring]
+    else:
+        return bytestring
+
+
+def is_printable(char):
+    """
+    str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so
+    let's make a crude approximation in Python 2.
+    """
+    if PYTHON2:
+        return not unicodedata.category(char).startswith('C')
+    else:
+        return char.isprintable()
--- a/lib/ftfy/fixes.py
+++ b/lib/ftfy/fixes.py
@ -0,0 +1,473 @@
+# -*- coding: utf-8 -*-
+"""
+This module contains the individual fixes that the main fix_text function
+can perform.
+"""
+
+from __future__ import unicode_literals
+from ftfy.chardata import (possible_encoding,
+                           CHARMAP_ENCODINGS, CONTROL_CHARS)
+from ftfy.badness import text_cost
+from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
+import re
+import sys
+import codecs
+
+
+# The message of the UnicodeError that fix_one_step_and_explain raises when
+# it is given bytes instead of text. The %d is filled in with the major
+# version of the running Python, so that the link points at the matching
+# edition of the Unicode HOWTO.
+BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
+
+ftfy is designed to fix problems that were introduced by handling Unicode
+incorrectly. It might be able to fix the bytes you just handed it, but the
+fact that you just gave a pile of bytes to a function that fixes text means
+that your code is *also* handling Unicode incorrectly.
+
+ftfy takes Unicode text as input. You should take these bytes and decode
+them from the encoding you think they are in. If you're not sure what encoding
+they're in:
+
+- First, try to find out. 'utf-8' is a good assumption.
+- If the encoding is simply unknowable, try running your bytes through
+  ftfy.guess_bytes. As the name implies, this may not always be accurate.
+
+If you're confused by this, please read the Python Unicode HOWTO:
+
+    http://docs.python.org/%d/howto/unicode.html
+""" % sys.version_info[0]
+
+
+def fix_text_encoding(text):
+    r"""
+    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.
+
+    Something you will find all over the place, in real-world text, is text
+    that's mistakenly encoded as utf-8, decoded in some ugly format like
+    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
+
+    This causes your perfectly good Unicode-aware code to end up with garbage
+    text because someone else (or maybe "someone else") made a mistake.
+
+    This function looks for the evidence of that having happened and fixes it.
+    It determines whether it should replace nonsense sequences of single-byte
+    characters that were really meant to be UTF-8 characters, and if so, turns
+    them into the correctly-encoded Unicode character that they were meant to
+    represent.
+
+    The input to the function must be Unicode. If you don't have Unicode text,
+    you're not using the right tool to solve your problem.
+
+    .. note::
+        The following examples are written using unmarked literal strings,
+        but they are Unicode text. In Python 2 we have "unicode_literals"
+        turned on, and in Python 3 this is always the case.
+
+    ftfy decodes text that looks like it was decoded incorrectly. It leaves
+    alone text that doesn't.
+
+        >>> print(fix_text_encoding('Ãºnico'))
+        único
+
+        >>> print(fix_text_encoding('This text is fine already :þ'))
+        This text is fine already :þ
+
+    Because these characters often come from Microsoft products, we allow
+    for the possibility that we get not just Unicode characters 128-255, but
+    also Windows's conflicting idea of what characters 128-160 are.
+
+        >>> print(fix_text_encoding('This â€” should be an em dash'))
+        This — should be an em dash
+
+    We might have to deal with both Windows characters and raw control
+    characters at the same time, especially when dealing with characters like
+    0x81 that have no mapping in Windows. This is a string that Python's
+    standard `.encode` and `.decode` methods cannot correct.
+
+        >>> print(fix_text_encoding('This text is sad .â\x81”.'))
+        This text is sad .⁔.
+
+    However, it has safeguards against fixing sequences of letters and
+    punctuation that can occur in valid text:
+
+        >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
+        not such a fan of Charlotte Brontë…”
+
+    Cases of genuine ambiguity can sometimes be addressed by finding other
+    characters that are not double-encoded, and expecting the encoding to
+    be consistent:
+
+        >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
+        AHÅ™, the new sofa from IKEA®
+
+    Finally, we handle the case where the text is in a single-byte encoding
+    that was intended as Windows-1252 all along but read as Latin-1:
+
+        >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
+        This text was never UTF-8 at all…
+
+    The best version of the text is found using
+    :func:`ftfy.badness.text_cost`.
+    """
+    text, _plan = fix_encoding_and_explain(text)
+    return text
+
+
+def fix_encoding_and_explain(text):
+    """
+    Re-decodes text that has been decoded incorrectly, and also return a
+    "plan" indicating all the steps required to fix it.
+
+    To fix similar text in the same way, without having to detect anything,
+    you can use the ``apply_plan`` function.
+    """
+    best_version = text
+    best_cost = text_cost(text)
+    best_plan = []
+    plan_so_far = []
+    while True:
+        prevtext = text
+        text, plan = fix_one_step_and_explain(text)
+        plan_so_far.extend(plan)
+        cost = text_cost(text)
+
+        # Add a penalty if we used a particularly obsolete encoding. The result
+        # is that we won't use these encodings unless they can successfully
+        # replace multiple characters.
+        if ('encode', 'macroman') in plan_so_far or\
+           ('encode', 'cp437') in plan_so_far:
+            cost += 2
+
+        # We need pretty solid evidence to decode from Windows-1251 (Cyrillic).
+        if ('encode', 'sloppy-windows-1251') in plan_so_far:
+            cost += 5
+
+        if cost < best_cost:
+            best_cost = cost
+            best_version = text
+            best_plan = list(plan_so_far)
+        if text == prevtext:
+            return best_version, best_plan
+
+
+def fix_one_step_and_explain(text):
+    """
+    Performs a single step of re-decoding text that's been decoded incorrectly.
+
+    Returns the decoded text, plus a "plan" for how to reproduce what it
+    did.
+    """
+    if isinstance(text, bytes):
+        raise UnicodeError(BYTES_ERROR_TEXT)
+    if len(text) == 0:
+        return text, []
+
+    # The first plan is to return ASCII text unchanged.
+    if possible_encoding(text, 'ascii'):
+        return text, []
+
+    # As we go through the next step, remember the possible encodings
+    # that we encounter but don't successfully fix yet. We may need them
+    # later.
+    possible_1byte_encodings = []
+
+    # Suppose the text was supposed to be UTF-8, but it was decoded using
+    # a single-byte encoding instead. When these cases can be fixed, they
+    # are usually the correct thing to do, so try them next.
+    for encoding in CHARMAP_ENCODINGS:
+        if possible_encoding(text, encoding):
+            encoded_bytes = text.encode(encoding)
+
+            # Now, find out if it's UTF-8 (or close enough). Otherwise,
+            # remember the encoding for later.
+            try:
+                decoding = 'utf-8'
+                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
+                    decoding = 'utf-8-variants'
+                fixed = encoded_bytes.decode(decoding)
+                steps = [('encode', encoding), ('decode', decoding)]
+                return fixed, steps
+            except UnicodeDecodeError:
+                possible_1byte_encodings.append(encoding)
+
+    # The next most likely case is that this is Latin-1 that was intended to
+    # be read as Windows-1252, because those two encodings in particular are
+    # easily confused.
+    if 'latin-1' in possible_1byte_encodings:
+        if 'windows-1252' in possible_1byte_encodings:
+            # This text is in the intersection of Latin-1 and
+            # Windows-1252, so it's probably legit.
+            return text, []
+        else:
+            # Otherwise, it means we have characters that are in Latin-1 but
+            # not in Windows-1252. Those are C1 control characters. Nobody
+            # wants those. Assume they were meant to be Windows-1252. Don't
+            # use the sloppy codec, because bad Windows-1252 characters are
+            # a bad sign.
+            encoded = text.encode('latin-1')
+            try:
+                fixed = encoded.decode('windows-1252')
+                steps = []
+                if fixed != text:
+                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
+                return fixed, steps
+            except UnicodeDecodeError:
+                # This text contained characters that don't even make sense
+                # if you assume they were supposed to be Windows-1252. In
+                # that case, let's not assume anything.
+                pass
+
+    # The cases that remain are mixups between two different single-byte
+    # encodings, and not the common case of Latin-1 vs. Windows-1252.
+    #
+    # Those cases are somewhat rare, and impossible to solve without false
+    # positives. If you're in one of these situations, you should try using
+    # the `ftfy.guess_bytes` function.
+
+    # Return the text unchanged; the plan is empty.
+    return text, []
+
+
+def apply_plan(text, plan):
+    """
+    Apply a plan for fixing the encoding of text.
+
+    The plan is a list of tuples of the form (operation, encoding), where
+    `operation` is either 'encode' or 'decode', and `encoding` is an encoding
+    name such as 'utf-8' or 'latin-1'.
+
+    Because only text can be encoded, and only bytes can be decoded, the plan
+    should alternate 'encode' and 'decode' steps, or else this function will
+    encounter an error.
+    """
+    obj = text
+    for operation, encoding in plan:
+        if operation == 'encode':
+            obj = obj.encode(encoding)
+        elif operation == 'decode':
+            obj = obj.decode(encoding)
+        else:
+            raise ValueError("Unknown plan step: %s" % operation)
+
+    return obj
+
+
+HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")
+
+
+def unescape_html(text):
+    """
+    Decode all three types of HTML entities/character references.
+
+    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
+    to it for efficiency: it won't match entities longer than 8 characters,
+    because there are no valid entities like that.
+
+        >>> print(unescape_html('&lt;tag&gt;'))
+        <tag>
+    """
+    def fixup(match):
+        """
+        Replace one matched HTML entity with the character it represents,
+        if possible.
+        """
+        text = match.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text  # leave as is
+    return HTML_ENTITY_RE.sub(fixup, text)
+
+
+ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')
+
+def remove_terminal_escapes(text):
+    r"""
+    Strip out "ANSI" terminal escape sequences, such as those that produce
+    colored text on Unix.
+
+        >>> print(remove_terminal_escapes(
+        ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
+        ... ))
+        I'm blue, da ba dee da ba doo...
+    """
+    return ANSI_RE.sub('', text)
+
+
+SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
+DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')
+
+def uncurl_quotes(text):
+    r"""
+    Replace curly quotation marks with straight equivalents.
+
+        >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
+        "here's a test"
+    """
+    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))
+
+
+def fix_line_breaks(text):
+    r"""
+    Normalize every kind of line break to the Unix one.
+
+    Each of the following sequences is replaced by the standard \\n
+    line break:
+
+        - CRLF (\\r\\n), used on Windows and in some communication
+          protocols
+        - CR (\\r), the Mac OS Classic convention, which some software
+          (such as Microsoft Office for Mac) still produces
+        - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
+          the line breaks defined by Unicode
+        - NEXT LINE (\\x85), a C1 control character
+
+    NEXT LINE is an odd case: it usually won't be seen here when
+    `fix_encoding` is also being run, because \\x85 is very common
+    mojibake for \\u2026, HORIZONTAL ELLIPSIS.
+
+        >>> print(fix_line_breaks(
+        ...     "This string is made of two things:\u2029"
+        ...     "1. Unicode\u2028"
+        ...     "2. Spite"
+        ... ))
+        This string is made of two things:
+        1. Unicode
+        2. Spite
+
+    For further testing and examples, let's define a function to make sure
+    we can see the control characters in their escaped form:
+
+        >>> def eprint(text):
+        ...     print(text.encode('unicode-escape').decode('ascii'))
+
+        >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
+        Content-type: text/plain\n\nHi.
+
+        >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
+        This is how Microsoft \n trolls Mac users
+
+        >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
+        What is this \n I don't even
+    """
+    # CRLF has to be handled before the lone CR, so that it becomes one
+    # newline instead of two.
+    for line_break in ('\r\n', '\r', '\u2028', '\u2029', '\u0085'):
+        text = text.replace(line_break, '\n')
+    return text
+
+
+def remove_control_chars(text):
+    """
+    Strip control characters from the text, keeping the few that matter.
+
+    The characters to drop are defined by the CONTROL_CHARS translation
+    table. They are meant to be the ones in these ranges:
+
+    - U+0000 to U+0008
+    - U+000B
+    - U+000E to U+001F
+    - U+007F
+
+    These characters, which are commonly used for formatting, are kept:
+
+    - TAB (U+0009)
+    - LF (U+000A)
+    - FF (U+000C)
+    - CR (U+000D)
+    """
+    cleaned = text.translate(CONTROL_CHARS)
+    return cleaned
+
+
+def remove_bom(text):
+    r"""
+    Strip any byte-order marks (U+FEFF) left over at the start of the text.
+
+    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
+    Where do you want to go today?
+    """
+    byte_order_mark = unichr(0xfeff)
+    return text.lstrip(byte_order_mark)
+
+
+def remove_unsafe_private_use(text):
+    r"""
+    Remove the characters of Supplementary Private Use Area B, which are the
+    code points from U+100000 to U+10FFFF.
+
+    The reason is that Python 3.3's Unicode support isn't perfect: certain
+    string operations crash some versions of it with a SystemError. See
+    http://bugs.python.org/issue18183
+
+    The best available workaround is to strip those characters out with a
+    regex that is known not to crash when it encounters them.
+
+    It's sad to lose an entire plane of Unicode, but these characters are not
+    assigned and never will be. If you get one of them and don't know what
+    its purpose is, its purpose is probably to crash your code.
+
+    If you were using these for actual private use, this might be inconvenient.
+    You can turn off this fixer, of course, but Supplementary Private Use
+    Area A is the recommended alternative.
+
+        >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
+        💩
+
+    This fixer is off by default in Python 3.4 or later. (The bug is actually
+    fixed in 3.3.3 and 2.7.6, but the default behavior should not change
+    based on a micro version upgrade of Python.)
+    """
+    stripped = UNSAFE_PRIVATE_USE_RE.sub('', text)
+    return stripped
+
+
+# Define a regex to match valid escape sequences in Python string literals.
+# The hex escapes must be followed by real hexadecimal digits, so that text
+# which only resembles an escape (such as a backslash followed by "users" in
+# a Windows path) is not mistaken for one.
+ESCAPE_SEQUENCE_RE = re.compile(r'''
+    ( \\U[0-9A-Fa-f]{8}   # 8-digit hex escapes
+    | \\u[0-9A-Fa-f]{4}   # 4-digit hex escapes
+    | \\x[0-9A-Fa-f]{2}   # 2-digit hex escapes
+    | \\[0-7]{1,3}        # Octal escapes
+    | \\N\{[^}]+\}        # Unicode characters by name
+    | \\[\\'"abfnrtv]     # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+
+def decode_escapes(text):
+    r"""
+    Decode backslashed escape sequences, including \\x, \\u, and \\U character
+    references, even in the presence of other Unicode.
+
+    This is what Python's "string-escape" and "unicode-escape" codecs were
+    meant to do, but in contrast, this actually works. It will decode valid
+    escape sequences the same way that the Python interpreter decodes them
+    in its string literals.
+
+        >>> factoid = '\\u20a1 is the currency symbol for the colón.'
+        >>> print(factoid[1:])
+        u20a1 is the currency symbol for the colón.
+        >>> print(decode_escapes(factoid))
+        ₡ is the currency symbol for the colón.
+
+    Even though Python itself can read string literals with a combination of
+    escapes and literal Unicode -- you're looking at one right now -- the
+    "unicode-escape" codec doesn't work on literal Unicode. (See
+    http://stackoverflow.com/a/24519338/773754 for more details.)
+
+    Instead, this function searches for just the parts of a string that
+    represent escape sequences, and decodes them, leaving the rest alone. All
+    valid escape sequences are made of ASCII characters, and this allows
+    "unicode-escape" to work correctly.
+
+    Anything that is not a decodable escape sequence is returned unchanged
+    instead of raising an error. That covers a backslash followed by
+    characters that are not the expected hex digits, a character name that
+    does not exist, and a code point beyond the end of Unicode.
+
+    This fix cannot be automatically applied by the `ftfy.fix_text` function,
+    because escaped text is not necessarily a mistake, and there is no way
+    to distinguish text that's supposed to be escaped from text that isn't.
+    """
+    def decode_match(match):
+        "Given a regex match, decode the escape sequence it contains."
+        escape = match.group(0)
+        try:
+            return codecs.decode(escape, 'unicode-escape')
+        except UnicodeDecodeError:
+            # Well-formed but meaningless, e.g. an unknown character name or
+            # a code point above U+10FFFF: leave the text as it was.
+            return escape
+
+    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
--- a/lib/ftfy/streamtester/init.py
+++ b/lib/ftfy/streamtester/init.py
@ -0,0 +1,39 @@
+"""
+This file defines a general method for evaluating ftfy using data that arrives
+in a stream. A concrete implementation of it is found in `twitter_tester.py`.
+"""
+from __future__ import print_function, unicode_literals
+from ftfy.fixes import fix_text_encoding
+from ftfy.chardata import possible_encoding
+
+
+class StreamTester:
+    """
+    Consume a sequence of texts one at a time, display each one that ftfy
+    would change, and print periodic progress updates such as how many of
+    the texts seen so far were changed.
+    """
+    def __init__(self):
+        # Total number of texts checked, and how many of them ftfy changed.
+        self.count = 0
+        self.num_fixed = 0
+
+    def check_ftfy(self, text):
+        """
+        Check whether `ftfy.fix_text_encoding` would change the given text,
+        and display the before and after versions if it would.
+        """
+        self.count += 1
+
+        # Text that could be plain ASCII is not run through the fixer.
+        could_be_ascii = possible_encoding(text, 'ascii')
+        if not could_be_ascii:
+            fixed = fix_text_encoding(text)
+            if fixed != text:
+                # possibly filter common bots before printing
+                report = u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
+                    text=text, fixed=fixed
+                )
+                print(report)
+                self.num_fixed += 1
+
+        # Print status updates once in a while: a dot for every 100 texts,
+        # and a summary line for every 10000.
+        seen = self.count
+        if seen % 100 == 0:
+            print('.', end='', flush=True)
+        if seen % 10000 == 0:
+            print('\n%d/%d fixed' % (self.num_fixed, seen))
--- a/lib/ftfy/streamtester/oauth.py
+++ b/lib/ftfy/streamtester/oauth.py
@ -0,0 +1,73 @@
+# coding: utf-8
+"""
+Do what is necessary to authenticate this tester as a Twitter "app", using
+somebody's Twitter account.
+"""
+from __future__ import unicode_literals
+import os
+
+
+AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')
+
+def get_auth():
+    """
+    Build the OAuth credentials that let this tester use the Twitter API as
+    the "ftfy-tester" app, on behalf of somebody's Twitter account.
+
+    The user's token and token secret are read from AUTH_TOKEN_PATH when that
+    file exists. Otherwise they are obtained by running `oauth_dance` from
+    the `twitter` package, which is given AUTH_TOKEN_PATH as its token
+    filename; the directory for that file is created first if it is missing.
+
+    A note on the app's own "consumer secret": Twitter requires the app to
+    know it while the user of the app does not. That is impossible for a
+    program whose source code the user holds -- it is equivalent to DRM -- so
+    the value is not secret in any cryptographic sense, and the user still
+    has to log in with a token that actually is secret. Twitter wants apps to
+    pretend the string can be kept secret, so, like everybody else does, this
+    code hides it behind a fig leaf: it is stored as Braille characters that
+    are shifted back into ASCII when needed.
+
+    Returns a `twitter.oauth.OAuth` object.
+    """
+
+    from twitter.oauth import OAuth
+    from twitter import oauth_dance, read_token_file
+
+    def unhide(secret):
+        """
+        Shift every character of `secret` down by 0x2800 code points. This is
+        exactly as secure as every other Twitter app.
+        """
+        return ''.join(chr(ord(char) - 0x2800) for char in secret)
+
+    consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'
+    fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
+    consumer_secret = unhide(fig_leaf)
+
+    if not os.path.exists(AUTH_TOKEN_PATH):
+        # No saved token yet: make room for it, then go and get one.
+        authdir = os.path.dirname(AUTH_TOKEN_PATH)
+        if not os.path.exists(authdir):
+            os.makedirs(authdir)
+        token, token_secret = oauth_dance(
+            app_name='ftfy-tester',
+            consumer_key=consumer_key,
+            consumer_secret=consumer_secret,
+            token_filename=AUTH_TOKEN_PATH
+        )
+    else:
+        token, token_secret = read_token_file(AUTH_TOKEN_PATH)
+
+    return OAuth(
+        token=token,
+        token_secret=token_secret,
+        consumer_key=consumer_key,
+        consumer_secret=consumer_secret
+    )
+
--- a/lib/ftfy/streamtester/twitter_tester.py
+++ b/lib/ftfy/streamtester/twitter_tester.py
@ -0,0 +1,89 @@
+"""
+Implements a StreamTester that runs over Twitter data. See the class
+docstring.
+
+This module is written for Python 3 only. The __future__ imports you see here
+are just to let Python 2 scan the file without crashing with a SyntaxError.
+"""
+from __future__ import print_function, unicode_literals
+import os
+from collections import defaultdict
+from ftfy.streamtester import StreamTester
+
+
+class TwitterTester(StreamTester):
+    """
+    This class uses the StreamTester code (defined in `__init__.py`) to
+    evaluate ftfy's real-world performance, by feeding it live data from
+    Twitter.
+
+    This is a semi-manual evaluation. It requires a human to look at the
+    results and determine if they are good. The three possible cases we
+    can see here are:
+
+        - Success: the process takes in mojibake and outputs correct text.
+        - False positive: the process takes in correct text, and outputs
+          mojibake. Every false positive should be considered a bug, and
+          reported on GitHub if it isn't already.
+        - Confusion: the process takes in mojibake and outputs different
+          mojibake. Not a great outcome, but not as dire as a false
+          positive.
+
+    This tester cannot reveal false negatives. So far, that can only be
+    done by the unit tests.
+    """
+    # Directory that receives one log file of tweets per language code.
+    OUTPUT_DIR = './twitterlogs'
+
+    def __init__(self):
+        # Tweets collected since the last save, grouped by the language
+        # code of the user who posted them.
+        self.lines_by_lang = defaultdict(list)
+        super().__init__()
+
+    def save_files(self):
+        """
+        When processing data from live Twitter, save it to log files so that
+        it can be replayed later.
+
+        Each language's tweets are appended, one per line, to
+        `tweets.<lang>.txt` inside OUTPUT_DIR, which is created if it does
+        not exist. The in-memory buffer is emptied afterwards.
+        """
+        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
+        for lang, lines in self.lines_by_lang.items():
+            filename = 'tweets.{}.txt'.format(lang)
+            fullname = os.path.join(self.OUTPUT_DIR, filename)
+            # Tweets can contain any Unicode text, so the encoding is fixed
+            # to UTF-8 instead of depending on the locale. The `with` block
+            # closes the file even if a write fails.
+            with open(fullname, 'a', encoding='utf-8') as langfile:
+                for line in lines:
+                    # Keep each tweet on a single line of the log.
+                    print(line.replace('\n', ' '), file=langfile)
+        self.lines_by_lang = defaultdict(list)
+
+    def run_sample(self):
+        """
+        Listen to live data from Twitter, and pass on the fully-formed tweets
+        to `check_ftfy`. This requires the `twitter` Python package as a
+        dependency.
+        """
+        from twitter import TwitterStream
+        from ftfy.streamtester.oauth import get_auth
+        twitter_stream = TwitterStream(auth=get_auth())
+        iterator = twitter_stream.statuses.sample()
+        for tweet in iterator:
+            # Items without a 'text' key are not fully-formed tweets.
+            if 'text' in tweet:
+                self.check_ftfy(tweet['text'])
+                if 'user' in tweet:
+                    lang = tweet['user'].get('lang', 'NONE')
+                    self.lines_by_lang[lang].append(tweet['text'])
+                # Write the buffered tweets to disk after the first 100
+                # tweets, and then once every 10000 tweets after that.
+                if self.count % 10000 == 100:
+                    self.save_files()
+
+
+def main():
+    """
+    Command-line entry point: connect a TwitterTester to the Twitter stream
+    and let it run forever, or at least until the stream drops.
+    """
+    TwitterTester().run_sample()
+
+
+if __name__ == '__main__':
+    main()
+
--- a/sickbeard/encodingKludge.py
+++ b/sickbeard/encodingKludge.py
@ -18,22 +18,23 @@

 import os

-from sickbeard import logger
 import sickbeard
+from sickbeard import logger
+
+import ftfy
+import ftfy.bad_codecs

 # This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8
 # encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions
 # which return something should always return unicode.

 def fixStupidEncodings(x, silent=False):
-    if type(x) == str:
+    if type(x) in [str, unicode]:
        try:
-            return x.decode(sickbeard.SYS_ENCODING)
+            return ftfy.fix_text(u'' + x).decode(sickbeard.SYS_ENCODING)
        except UnicodeDecodeError:
            logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
            return None
-    elif type(x) == unicode:
-        return x
    else:
        logger.log(
            u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
@ -49,12 +50,12 @@ def fixListEncodings(x):

 def callPeopleStupid(x):
    try:
-        return x.encode(sickbeard.SYS_ENCODING)
+        return ftfy.fix_text(x).encode(sickbeard.SYS_ENCODING)
    except (UnicodeEncodeError, UnicodeDecodeError):
        logger.log(
            u"YOUR COMPUTER SUCKS! Your data is being corrupted by a bad locale/encoding setting. Report this error on the forums or IRC please: " + repr(
                x) + ", " + sickbeard.SYS_ENCODING, logger.ERROR)
-        return x.encode(sickbeard.SYS_ENCODING, 'ignore')
+        return ftfy.fix_text(x).encode(sickbeard.SYS_ENCODING, 'ignore')

 def ek(func, *args, **kwargs):
    if os.name == 'nt':
--- a/sickbeard/failed_history.py
+++ b/sickbeard/failed_history.py
@ -26,6 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
 from sickbeard.history import dateFormat
 from sickbeard.common import Quality
 from sickbeard.common import WANTED, FAILED
+from encodingKludge import fixStupidEncodings


 def prepareFailedName(release):
@ -36,9 +37,7 @@ def prepareFailedName(release):
        fixed = fixed.rpartition(".")[0]

    fixed = re.sub("[\.\-\+\ ]", "_", fixed)
-
-    if not isinstance(fixed, unicode):
-        fixed = unicode(fixed, 'utf-8', 'replace')
+    fixed = fixStupidEncodings(fixed)

    return fixed

--- a/sickbeard/history.py
+++ b/sickbeard/history.py
@ -20,6 +20,7 @@ import db
 import datetime

 from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
+from encodingKludge import fixStupidEncodings


 dateFormat = "%Y%m%d%H%M%S"
@ -27,9 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"

 def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
    logDate = datetime.datetime.today().strftime(dateFormat)
-
-    if not isinstance(resource, unicode):
-        resource = unicode(resource, 'utf-8', 'replace')
+    resource = fixStupidEncodings(resource)

    myDB = db.DBConnection()
    myDB.action(
--- a/sickbeard/notifiers/emailnotify.py
+++ b/sickbeard/notifiers/emailnotify.py
@ -29,6 +29,7 @@ import sickbeard

 from sickbeard import logger, common
 from sickbeard import db
+from encodingKludge import fixStupidEncodings
 from sickbeard.exceptions import ex


@ -50,7 +51,7 @@ class EmailNotifier:
        ep_name: The name of the episode that was snatched
        title: The title of the notification (optional)
        """
-        ep_name = ep_name.encode('utf-8', 'replace')
+        ep_name = fixStupidEncodings(ep_name)

        if sickbeard.EMAIL_NOTIFY_ONSNATCH:
            show = self._parseEp(ep_name)
@ -85,7 +86,7 @@ class EmailNotifier:
        ep_name: The name of the episode that was downloaded
        title: The title of the notification (optional)
        """
-        ep_name = ep_name.encode('utf-8', 'replace')
+        ep_name = fixStupidEncodings(ep_name)

        if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
            show = self._parseEp(ep_name)
@ -120,7 +121,7 @@ class EmailNotifier:
        ep_name: The name of the episode that was downloaded
        lang: Subtitle language wanted
        """
-        ep_name = ep_name.encode('utf-8', 'replace')
+        ep_name = fixStupidEncodings(ep_name)

        if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
            show = self._parseEp(ep_name)
@ -197,7 +198,7 @@ class EmailNotifier:
            return False

    def _parseEp(self, ep_name):
-        ep_name = ep_name.encode('utf-8', 'replace')
+        ep_name = fixStupidEncodings(ep_name)

        sep = " - "
        titles = ep_name.split(sep)
--- a/sickbeard/nzbSplitter.py
+++ b/sickbeard/nzbSplitter.py
@ -23,13 +23,14 @@ import xml.etree.cElementTree as etree
 import xml.etree
 import re

-from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-
 from sickbeard import logger, classes, helpers
 from sickbeard.common import Quality
 from sickbeard import encodingKludge as ek
 from sickbeard.exceptions import ex

+from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
+from encodingKludge import fixStupidEncodings
+

 def getSeasonNZBs(name, urlData, season):
    try:
@ -84,7 +85,7 @@ def createNZBString(fileElements, xmlns):
    for curFile in fileElements:
        rootElement.append(stripNS(curFile, xmlns))

-    return xml.etree.ElementTree.tostring(rootElement, 'utf-8', 'replace')
+    return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))


 def saveNZB(nzbName, nzbString):
--- a/sickbeard/scene_exceptions.py
+++ b/sickbeard/scene_exceptions.py
@ -20,13 +20,14 @@ import re
 import time
 import threading
 import datetime
-import sickbeard

-from lib import adba
+import sickbeard
+import adba
 from sickbeard import helpers
 from sickbeard import name_cache
 from sickbeard import logger
 from sickbeard import db
+from encodingKludge import fixStupidEncodings

 exception_dict = {}
 anidb_exception_dict = {}
@ -233,8 +234,7 @@ def retrieve_exceptions():
            # if this exception isn't already in the DB then add it
            if cur_exception not in existing_exceptions:

-                if not isinstance(cur_exception, unicode):
-                    cur_exception = unicode(cur_exception, 'utf-8', 'replace')
+                cur_exception = fixStupidEncodings(cur_exception)

                myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                            [cur_indexer_id, cur_exception, curSeason])
@ -267,9 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
        exceptionsCache[indexer_id][season] = scene_exceptions

    for cur_exception in scene_exceptions:
-
-        if not isinstance(cur_exception, unicode):
-            cur_exception = unicode(cur_exception, 'utf-8', 'replace')
+        cur_exception = fixStupidEncodings(cur_exception)

        myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                    [indexer_id, cur_exception, season])
--- a/sickbeard/tvcache.py
+++ b/sickbeard/tvcache.py
@ -20,19 +20,20 @@ from __future__ import with_statement

 import time
 import datetime
+import itertools
+
 import sickbeard

 from sickbeard import db
 from sickbeard import logger
 from sickbeard.common import Quality
-
 from sickbeard import helpers, show_name_helpers
 from sickbeard.exceptions import MultipleShowObjectsException
 from sickbeard.exceptions import AuthException
-from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
 from sickbeard.rssfeeds import RSSFeeds
 from sickbeard import clients
-import itertools
+from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
+from encodingKludge import fixStupidEncodings

 class CacheDBConnection(db.DBConnection):
    def __init__(self, providerName):
@ -262,8 +263,7 @@ class TVCache():
            # get quality of release
            quality = parse_result.quality

-            if not isinstance(name, unicode):
-                name = unicode(name, 'utf-8', 'replace')
+            name = fixStupidEncodings(name)

            # get release group
            release_group = parse_result.release_group
--- a/sickbeard/webserve.py
+++ b/sickbeard/webserve.py
@ -64,8 +64,8 @@ from browser import WebFileBrowser
 from lib.dateutil import tz
 from lib.unrar2 import RarFile

-from lib import subliminal
-from trakt import TraktCall
+from lib import adba, subliminal
+from lib.trakt import TraktCall

 try:
    import json
@ -77,7 +77,6 @@ try:
 except ImportError:
    import xml.etree.ElementTree as etree

-from lib import adba

 from Cheetah.Template import Template
 from tornado.web import RequestHandler, HTTPError, asynchronous
@ -3289,7 +3288,7 @@ class ErrorLogs(MainHandler):

        for x in reversed(data):

-            x = x.decode('utf-8', 'replace')
+            x = ek.fixStupidEncodings(x)
            match = re.match(regex, x)

            if match: