diff --git a/lib/ftfy/__init__.py b/lib/ftfy/__init__.py
deleted file mode 100644
index 2887c5b9..00000000
--- a/lib/ftfy/__init__.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-ftfy: fixes text for you
-
-This is a module for making text less broken. See the `fix_text` function
-for more information.
-"""
-
-from __future__ import unicode_literals
-
-# See the docstring for ftfy.bad_codecs to see what we're doing here.
-import ftfy.bad_codecs
-ftfy.bad_codecs.ok()
-
-from ftfy import fixes
-from ftfy.fixes import fix_text_encoding
-from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
-import unicodedata
-import warnings
-
-
-def fix_text(text,
- remove_unsafe_private_use=(not PYTHON34_OR_LATER),
- fix_entities='auto',
- remove_terminal_escapes=True,
- fix_encoding=True,
- normalization='NFKC',
- uncurl_quotes=True,
- fix_line_breaks=True,
- remove_control_chars=True,
- remove_bom=True,
- max_decode_length=2**16):
- r"""
- Given Unicode text as input, make its representation consistent and
- possibly less broken.
-
- Let's start with some examples:
-
- >>> print(fix_text('uÌˆnicode'))
- ünicode
-
- >>> print(fix_text('Broken text&hellip; it&rsquo;s flubberific!'))
- Broken text... it's flubberific!
-
- >>> print(fix_text('HTML entities &lt;3'))
- HTML entities <3
-
- >>> print(fix_text('<em>HTML entities &lt;3</em>'))
- <em>HTML entities &lt;3</em>
-
- >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
- ... 'doo&#133;\033[0m'))
- I'm blue, da ba dee da ba doo...
-
- >>> # This example string starts with a byte-order mark, even if
- >>> # you can't see it on the Web.
- >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
- Party like
- it's 1999!
-
- >>> len(fix_text('ﬁ' * 100000))
- 200000
-
- >>> len(fix_text(''))
- 0
-
- Based on the options you provide, ftfy applies these steps in order:
-
- - If `remove_unsafe_private_use` is True, it removes a range of private-use
- characters that could trigger a Python bug. The bug is fixed in
- the most recent versions of Python, so this will default to False
- starting on Python 3.4.
- - If `fix_entities` is True, replace HTML entities with their equivalent
- characters. If it's "auto" (the default), then consider replacing HTML
- entities, but don't do so in text where you have seen a pair of actual
- angle brackets (that's probably actually HTML and you shouldn't mess
- with the entities).
- - If `remove_terminal_escapes` is True, remove sequences of bytes that are
- instructions for Unix terminals, such as the codes that make text appear
- in different colors.
- - If `fix_encoding` is True, look for common mistakes that come from
- encoding or decoding Unicode text incorrectly, and fix them if they are
- reasonably fixable. See `fix_text_encoding` for details.
- - If `normalization` is not None, apply the specified form of Unicode
- normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
- The default, 'NFKC', applies the following relevant transformations:
-
- - C: Combine characters and diacritics that are written using separate
- code points, such as converting "e" plus an acute accent modifier
- into "é", or converting "ka" (か) plus a dakuten into the
- single character "ga" (が).
- - K: Replace characters that are functionally equivalent with the most
- common form. For example, half-width katakana will be replaced with
- full-width versions, full-width Roman characters will be replaced with
- ASCII characters, ellipsis characters will be replaced with three
- periods, and the ligature 'ﬂ' will be replaced with 'fl'.
-
- - If `uncurl_quotes` is True, replace various curly quotation marks with
- plain-ASCII straight quotes.
- - If `fix_line_breaks` is true, convert all line breaks to Unix style
- (CRLF and CR line breaks become LF line breaks).
- - If `remove_control_chars` is true, remove all C0 control characters
- except the common useful ones: TAB, CR, LF, and FF. (CR characters
- may have already been removed by the `fix_line_breaks` step.)
- - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
- - If anything was changed, repeat all the steps, so that the function is
- idempotent. "&amp;amp;" will become "&", for example, rather than
- stopping at "&amp;".
-
- `fix_text` will work one line at a time, with the possibility that some
- lines are in different encodings. When it encounters lines longer than
- `max_decode_length`, it will not run the `fix_encoding` step, to avoid
- unbounded slowdowns.
-
- If you are certain your entire text is in the same encoding (though that
- encoding is possibly flawed), and do not mind performing operations on
- the whole text at once, use `fix_text_segment`.
- """
- if isinstance(text, bytes):
- raise UnicodeError(fixes.BYTES_ERROR_TEXT)
-
- out = []
- pos = 0
- while pos < len(text):
- textbreak = text.find('\n', pos) + 1
- fix_encoding_this_time = fix_encoding
- if textbreak == 0:
- textbreak = len(text)
- if (textbreak - pos) > max_decode_length:
- fix_encoding_this_time = False
-
- substring = text[pos:textbreak]
-
- if fix_entities == 'auto' and '<' in substring and '>' in substring:
- # we see angle brackets together; this could be HTML
- fix_entities = False
-
- out.append(
- fix_text_segment(
- substring,
- remove_unsafe_private_use=remove_unsafe_private_use,
- fix_entities=fix_entities,
- remove_terminal_escapes=remove_terminal_escapes,
- fix_encoding=fix_encoding_this_time,
- normalization=normalization,
- uncurl_quotes=uncurl_quotes,
- fix_line_breaks=fix_line_breaks,
- remove_control_chars=remove_control_chars,
- remove_bom=remove_bom
- )
- )
- pos = textbreak
-
- return ''.join(out)
-
-ftfy = fix_text
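-
-
-def _demo_fix_text():
-    # An illustrative sketch with made-up sample strings. Because the steps
-    # repeat until the text stops changing, a double-escaped entity
-    # resolves completely in one call.
-    assert fix_text('&amp;eacute;') == '\u00e9'
-    # Curly quotes are straightened and the text is NFKC-normalized.
-    assert fix_text('\u201cunicode\u201d') == '"unicode"'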
-
-
-def fix_file(input_file,
- remove_unsafe_private_use=True,
- fix_entities='auto',
- remove_terminal_escapes=True,
- fix_encoding=True,
- normalization='NFKC',
- uncurl_quotes=True,
- fix_line_breaks=True,
- remove_control_chars=True,
- remove_bom=True):
- """
- Fix text that is found in a file.
-
- If the file is being read as Unicode text, use that. If it's being read as
- bytes, then unfortunately, we have to guess what encoding it is. We'll try
- a few common encodings, but we make no promises. See the `guess_bytes`
- function for how this is done.
-
- The output is a stream of fixed lines of text.
- """
- entities = fix_entities
- for line in input_file:
- if isinstance(line, bytes):
- line, encoding = guess_bytes(line)
- if fix_entities == 'auto' and '<' in line and '>' in line:
- entities = False
- yield fix_text_segment(
- line,
- remove_unsafe_private_use=remove_unsafe_private_use,
- fix_entities=entities,
- remove_terminal_escapes=remove_terminal_escapes,
- fix_encoding=fix_encoding,
- normalization=normalization,
- uncurl_quotes=uncurl_quotes,
- fix_line_breaks=fix_line_breaks,
- remove_control_chars=remove_control_chars,
- remove_bom=remove_bom
- )
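-
-
-def _demo_fix_file():
-    # An illustrative sketch with a made-up in-memory stream; real callers
-    # would usually pass an open file object, text or binary.
-    import io
-    stream = io.StringIO('sch\u00c3\u00b6n\n')
-    assert ''.join(fix_file(stream)) == 'sch\u00f6n\n'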
-
-
-def fix_text_segment(text,
- remove_unsafe_private_use=True,
- fix_entities='auto',
- remove_terminal_escapes=True,
- fix_encoding=True,
- normalization='NFKC',
- uncurl_quotes=True,
- fix_line_breaks=True,
- remove_control_chars=True,
- remove_bom=True):
- """
- Apply fixes to text in a single chunk. This could be a line of text
- within a larger run of `fix_text`, or it could be a larger amount
- of text that you are certain is all in the same encoding.
-
- See `fix_text` for a description of the parameters.
- """
- if isinstance(text, bytes):
- raise UnicodeError(fixes.BYTES_ERROR_TEXT)
-
- if fix_entities == 'auto' and '<' in text and '>' in text:
- fix_entities = False
- while True:
- origtext = text
- if remove_unsafe_private_use:
- text = fixes.remove_unsafe_private_use(text)
- if fix_entities:
- text = fixes.unescape_html(text)
- if remove_terminal_escapes:
- text = fixes.remove_terminal_escapes(text)
- if fix_encoding:
- text = fixes.fix_text_encoding(text)
- if normalization is not None:
- text = unicodedata.normalize(normalization, text)
- if uncurl_quotes:
- text = fixes.uncurl_quotes(text)
- if fix_line_breaks:
- text = fixes.fix_line_breaks(text)
- if remove_control_chars:
- text = fixes.remove_control_chars(text)
- if remove_bom:
- text = fixes.remove_bom(text)
- if text == origtext:
- return text
-
-
-def guess_bytes(bstring):
- """
- If you have some bytes in an unknown encoding, here's a reasonable
- strategy for decoding them, by trying a few common encodings that
- can be distinguished from each other.
-
- This is not a magic bullet. If the bytes are coming from some MySQL
- database with the "character set" set to ISO Elbonian, this won't figure
- it out. Perhaps more relevantly, this currently doesn't try East Asian
- encodings.
-
- The encodings we try are:
-
- - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
- like nothing else
- - UTF-8, because it's the global de facto standard
- - "utf-8-variants", because it's what people actually implement when they
- think they're doing UTF-8
- - MacRoman, because Microsoft Office thinks it's still a thing, and it
- can be distinguished by its line breaks. (If there are no line breaks in
- the string, though, you're out of luck.)
- - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
- single-byte encoding
- """
- if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
- return bstring.decode('utf-16'), 'utf-16'
-
- byteset = set(bytes(bstring))
- byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'
-
- try:
- if byte_ed in byteset or byte_c0 in byteset:
- # Byte 0xed can be used to encode a range of codepoints that
- # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
- # so when we see 0xed, it's very likely we're being asked to
- # decode CESU-8, the variant that encodes UTF-16 surrogates
- # instead of the original characters themselves.
- #
- # This will occasionally trigger on standard UTF-8, as there
- # are some Korean characters that also use byte 0xed, but that's
- # not harmful.
- #
- # Byte 0xc0 is impossible because, numerically, it would only
- # encode characters lower than U+0040. Those already have
- # single-byte representations, and UTF-8 requires using the
- # shortest possible representation. However, Java hides the null
- # codepoint, U+0000, in a non-standard longer representation -- it
- # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
- # will never appear in the encoded bytes.
- #
- # The 'utf-8-variants' decoder can handle both of these cases, as
- # well as standard UTF-8, at the cost of a bit of speed.
- return bstring.decode('utf-8-variants'), 'utf-8-variants'
- else:
- return bstring.decode('utf-8'), 'utf-8'
- except UnicodeDecodeError:
- pass
-
- if byte_CR in bstring and byte_LF not in bstring:
- return bstring.decode('macroman'), 'macroman'
- else:
- return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
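-
-
-def _demo_guess_bytes():
-    # An illustrative sketch of the strategy above, with made-up inputs.
-    assert guess_bytes(b'caf\xc3\xa9') == ('caf\u00e9', 'utf-8')
-    # A CR with no LF suggests MacRoman, where byte 0x8e means '\u00e9'.
-    assert guess_bytes(b'caf\x8e\r') == ('caf\u00e9\r', 'macroman')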
-
-
-def explain_unicode(text):
- """
- A utility method that's useful for debugging mysterious Unicode.
-
- It breaks down a string, showing you for each codepoint its number in
- hexadecimal, its glyph, its category in the Unicode standard, and its name
- in the Unicode standard.
-
- >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
- U+0028 ( [Ps] LEFT PARENTHESIS
- U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
- U+00B0 ° [So] DEGREE SIGN
- U+25A1 □ [So] WHITE SQUARE
- U+00B0 ° [So] DEGREE SIGN
- U+0029 ) [Pe] RIGHT PARENTHESIS
- U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
- U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
- U+0020 [Zs] SPACE
- U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
- U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL
- U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
- """
- for char in text:
- if is_printable(char):
- display = char
- else:
- display = char.encode('unicode-escape').decode('ascii')
- print('U+{code:04X} {display:<7} [{category}] {name}'.format(
- display=display,
- code=ord(char),
- category=unicodedata.category(char),
- name=unicodedata.name(char, '')
- ))
-
-
-def fix_bad_encoding(text):
- """
- Kept for compatibility with previous versions of ftfy.
- """
- warnings.warn(
- 'fix_bad_encoding is now known as fix_text_encoding',
- DeprecationWarning
- )
- return fix_text_encoding(text)
diff --git a/lib/ftfy/bad_codecs/__init__.py b/lib/ftfy/bad_codecs/__init__.py
deleted file mode 100644
index 0984bd52..00000000
--- a/lib/ftfy/bad_codecs/__init__.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# coding: utf-8
-r"""
-Give Python the ability to decode some common, flawed encodings.
-
-Python does not want you to be sloppy with your text. Its encoders and decoders
-("codecs") follow the relevant standards whenever possible, which means that
-when you get text that *doesn't* follow those standards, you'll probably fail
-to decode it. Or you might succeed at decoding it for implementation-specific
-reasons, which is perhaps worse.
-
-There are some encodings out there that Python wishes didn't exist, which are
-widely used outside of Python:
-
-- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
- ever-popular CESU-8 and "Java modified UTF-8".
-- "Sloppy" versions of character map encodings, where bytes that don't map to
- anything will instead map to the Unicode character with the same number.
-
-Simply importing this module, or in fact any part of the `ftfy` package, will
-make these new "bad codecs" available to Python through the standard Codecs
-API. You never have to actually call any functions inside `ftfy.bad_codecs`.
-
-However, if you want to call something because your code checker insists on it,
-you can call ``ftfy.bad_codecs.ok()``.
-
-A quick example of decoding text that's encoded in CESU-8:
-
- >>> import ftfy.bad_codecs
- >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
- 😍
-"""
-from __future__ import unicode_literals
-from encodings import normalize_encoding
-import codecs
-
-_CACHE = {}
-
-# Define some aliases for 'utf-8-variants'. All hyphens get turned into
-# underscores, because of `normalize_encoding`.
-UTF8_VAR_NAMES = (
- 'utf_8_variants', 'utf8_variants',
- 'utf_8_variant', 'utf8_variant',
- 'utf_8_var', 'utf8_var',
- 'cesu_8', 'cesu8',
- 'java_utf_8', 'java_utf8'
-)
-
-
-def search_function(encoding):
- """
- Register our "bad codecs" with Python's codecs API. This involves adding
- a search function that takes in an encoding name, and returns a codec
- for that encoding if it knows one, or None if it doesn't.
-
- The encodings this will match are:
-
- - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
- where the non-sloppy version is an encoding that leaves some bytes
- unmapped to characters.
- - The 'utf-8-variants' encoding, which has the several aliases seen
- above.
- """
- if encoding in _CACHE:
- return _CACHE[encoding]
-
- norm_encoding = normalize_encoding(encoding)
- codec = None
- if norm_encoding in UTF8_VAR_NAMES:
- from ftfy.bad_codecs.utf8_variants import CODEC_INFO
- codec = CODEC_INFO
- elif norm_encoding.startswith('sloppy_'):
- from ftfy.bad_codecs.sloppy import CODECS
- codec = CODECS.get(norm_encoding)
-
- if codec is not None:
- _CACHE[encoding] = codec
-
- return codec
-
-
-def ok():
- """
- A feel-good function that gives you something to call after importing
- this package.
-
- Why is this here? Pyflakes. Pyflakes gets upset when you import a module
- and appear not to use it. It doesn't know that you're using it when
- you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
- encodings.
- """
- pass
-
-
-codecs.register(search_function)
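-
-
-def _demo_codec_lookup():
-    # A small sketch of the effect of registration: after importing this
-    # package, the aliases above resolve through the standard codecs API.
-    assert codecs.lookup('utf-8-variants').name == 'utf-8-variants'
-    assert b'\xc0\x80'.decode('utf-8-var') == '\u0000'
-    assert '\u20ac'.encode('sloppy-windows-1252') == b'\x80'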
diff --git a/lib/ftfy/bad_codecs/sloppy.py b/lib/ftfy/bad_codecs/sloppy.py
deleted file mode 100644
index adca2213..00000000
--- a/lib/ftfy/bad_codecs/sloppy.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# coding: utf-8
-r"""
-Decodes single-byte encodings, filling their "holes" in the same messy way that
-everyone else does.
-
-A single-byte encoding maps each byte to a Unicode character, except that some
-bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
-example, bytes 0x81 and 0x8D, among others, have no meaning.
-
-Python, wanting to preserve some sense of decorum, will handle these bytes
-as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
-different from each other. It just hasn't defined what they are in terms of
-Unicode.
-
-Software that has to interoperate with Windows-1252 and Unicode -- such as all
-the common Web browsers -- will pick some Unicode characters for them to map
-to, and the characters they pick are the Unicode characters with the same
-numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
-resulting characters tend to fall into a range of Unicode that's set aside for
-obsolete Latin-1 control characters anyway.
-
-These sloppy codecs let Python do the same thing, thus interoperating with
-other software that works this way. It defines a sloppy version of many
-single-byte encodings with holes. (There is no need for a sloppy version of
-an encoding without holes: for example, there is no such thing as
-sloppy-iso-8859-2 or sloppy-macroman.)
-
-The following encodings will become defined:
-
-- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
-- sloppy-windows-1251 (Cyrillic)
-- sloppy-windows-1252 (Western European, based on Latin-1)
-- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
-- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
-- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
-- sloppy-windows-1256 (Arabic)
-- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
-- sloppy-windows-1258 (Vietnamese)
-- sloppy-cp874 (Thai, based on ISO-8859-11)
-- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
-- sloppy-iso-8859-6 (different Arabic)
-- sloppy-iso-8859-7 (Greek)
-- sloppy-iso-8859-8 (Hebrew)
-- sloppy-iso-8859-11 (Thai)
-
-Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
-defined.
-
-Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
-the rest are rather uncommon.
-
-Here are some examples, using `ftfy.explain_unicode` to illustrate how
-sloppy-windows-1252 merges Windows-1252 with Latin-1:
-
- >>> from ftfy import explain_unicode
- >>> some_bytes = b'\x80\x81\x82'
- >>> explain_unicode(some_bytes.decode('latin-1'))
- U+0080 \x80 [Cc]
- U+0081 \x81 [Cc]
- U+0082 \x82 [Cc]
-
- >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
- U+20AC € [Sc] EURO SIGN
- U+FFFD � [So] REPLACEMENT CHARACTER
- U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
-
- >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
- U+20AC € [Sc] EURO SIGN
- U+0081 \x81 [Cc]
- U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
-"""
-from __future__ import unicode_literals
-import codecs
-from encodings import normalize_encoding
-
-REPLACEMENT_CHAR = '\ufffd'
-
-
-def make_sloppy_codec(encoding):
- """
- Take a codec name, and return a 'sloppy' version of that codec that can
- encode and decode the unassigned bytes in that encoding.
-
- Single-byte encodings in the standard library are defined using some
- boilerplate classes surrounding the functions that do the actual work,
- `codecs.charmap_decode` and `charmap_encode`. This function, given an
- encoding name, *defines* those boilerplate classes.
- """
- # Make an array of all 256 possible bytes.
- all_bytes = bytearray(range(256))
-
- # Get a list of what they would decode to in Latin-1.
- sloppy_chars = list(all_bytes.decode('latin-1'))
-
- # Get a list of what they decode to in the given encoding. Use the
- # replacement character for unassigned bytes.
- decoded_chars = all_bytes.decode(encoding, 'replace')
-
- # Update the sloppy_chars list. Each byte that was successfully decoded
- # gets its decoded value in the list. The unassigned bytes are left as
- # they are, which gives their decoding in Latin-1.
- for i, char in enumerate(decoded_chars):
- if char != REPLACEMENT_CHAR:
- sloppy_chars[i] = char
-
- # Create the data structures that tell the charmap methods how to encode
- # and decode in this sloppy encoding.
- decoding_table = ''.join(sloppy_chars)
- encoding_table = codecs.charmap_build(decoding_table)
-
- # Now produce all the class boilerplate. Look at the Python source for
- # `encodings.cp1252` for comparison; this is almost exactly the same,
- # except I made it follow pep8.
- class Codec(codecs.Codec):
- def encode(self, input, errors='strict'):
- return codecs.charmap_encode(input, errors, encoding_table)
-
- def decode(self, input, errors='strict'):
- return codecs.charmap_decode(input, errors, decoding_table)
-
- class IncrementalEncoder(codecs.IncrementalEncoder):
- def encode(self, input, final=False):
- return codecs.charmap_encode(input, self.errors, encoding_table)[0]
-
- class IncrementalDecoder(codecs.IncrementalDecoder):
- def decode(self, input, final=False):
- return codecs.charmap_decode(input, self.errors, decoding_table)[0]
-
- class StreamWriter(Codec, codecs.StreamWriter):
- pass
-
- class StreamReader(Codec, codecs.StreamReader):
- pass
-
- return codecs.CodecInfo(
- name='sloppy-' + encoding,
- encode=Codec().encode,
- decode=Codec().decode,
- incrementalencoder=IncrementalEncoder,
- incrementaldecoder=IncrementalDecoder,
- streamreader=StreamReader,
- streamwriter=StreamWriter,
- )
-
-# Define a codec for each incomplete encoding. The resulting CODECS dictionary
-# can be used by the main module of ftfy.bad_codecs.
-CODECS = {}
-INCOMPLETE_ENCODINGS = (
- ['windows-%s' % num for num in range(1250, 1259)] +
- ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
- ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
-)
-
-for _encoding in INCOMPLETE_ENCODINGS:
- _new_name = normalize_encoding('sloppy-' + _encoding)
- CODECS[_new_name] = make_sloppy_codec(_encoding)
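-
-
-def _demo_sloppy_roundtrip():
-    # An illustrative check of the construction above: all 256 bytes
-    # decode, and re-encoding restores them exactly, because the holes
-    # were filled in with their Latin-1 values.
-    info = CODECS[normalize_encoding('sloppy-windows-1252')]
-    everything = bytes(bytearray(range(256)))
-    decoded, _ = info.decode(everything)
-    assert len(decoded) == 256
-    assert info.encode(decoded)[0] == everything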
diff --git a/lib/ftfy/bad_codecs/utf8_variants.py b/lib/ftfy/bad_codecs/utf8_variants.py
deleted file mode 100644
index 565cb2b4..00000000
--- a/lib/ftfy/bad_codecs/utf8_variants.py
+++ /dev/null
@@ -1,281 +0,0 @@
-r"""
-This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
-decode text that's been encoded with a popular non-standard version of UTF-8.
-This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
-UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
-codepoint 0.
-
-This is particularly relevant in Python 3, which provides no other way of
-decoding CESU-8 or Java's encoding. [1]
-
-The easiest way to use the codec is to simply import `ftfy.bad_codecs`:
-
- >>> import ftfy.bad_codecs
- >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
- >>> print(repr(result).lstrip('u'))
- 'here comes a null! \x00'
-
-The codec does not at all enforce "correct" CESU-8. For example, the Unicode
-Consortium's not-quite-standard describing CESU-8 requires that there is only
-one possible encoding of any character, so it does not allow mixing of valid
-UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
-decoder does.
-
-Characters in the Basic Multilingual Plane still have only one encoding. This
-codec still enforces the rule, within the BMP, that characters must appear in
-their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
-instead of just `0x00`, may be used to encode the null character `U+0000`, like
-in Java.
-
-If you encode with this codec, you get legitimate UTF-8. Decoding with this
-codec and then re-encoding is not idempotent, although encoding and then
-decoding is. So this module won't produce CESU-8 for you. Look for that
-functionality in the sister module, "Breaks Text For You", coming approximately
-never.
-
-[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
-decode the bytes (incorrectly), then encode them, then decode them again, using
-UTF-8 as the codec every time.
-"""
-
-from __future__ import unicode_literals
-from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
-from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
- IncrementalEncoder as UTF8IncrementalEncoder)
-import re
-import codecs
-
-NAME = 'utf-8-variants'
-# This regular expression matches all possible six-byte CESU-8 sequences.
-CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')
-
-
-class IncrementalDecoder(UTF8IncrementalDecoder):
- """
- An incremental decoder that extends Python's built-in UTF-8 decoder.
-
- This decoder needs to take in bytes, possibly arriving in a stream, and
- output the correctly decoded text. The general strategy for doing this
- is to fall back on the real UTF-8 decoder whenever possible, because
- the real UTF-8 decoder is way optimized, but to call specialized methods
- we define here for the cases the real decoder isn't expecting.
- """
- def _buffer_decode(self, input, errors, final):
- """
- Decode bytes that may be arriving in a stream, following the Codecs
- API.
-
- `input` is the incoming sequence of bytes. `errors` tells us how to
- handle errors, though we delegate all error-handling cases to the real
- UTF-8 decoder to ensure correct behavior. `final` indicates whether
- this is the end of the sequence, in which case we should raise an
- error given incomplete input.
-
- Returns as much decoded text as possible, and the number of bytes
- consumed.
- """
- # decoded_segments are the pieces of text we have decoded so far,
- # and position is our current position in the byte string. (Bytes
- # before this position have been consumed, and bytes after it have
- # yet to be decoded.)
- decoded_segments = []
- position = 0
- while True:
- # Use _buffer_decode_step to decode a segment of text.
- decoded, consumed = self._buffer_decode_step(
- input[position:],
- errors,
- final
- )
- if consumed == 0:
- # Either there's nothing left to decode, or we need to wait
- # for more input. Either way, we're done for now.
- break
-
- # Append the decoded text to the list, and update our position.
- decoded_segments.append(decoded)
- position += consumed
-
- if final:
- # _buffer_decode_step must consume all the bytes when `final` is
- # true.
- assert position == len(input)
-
- return ''.join(decoded_segments), position
-
- def _buffer_decode_step(self, input, errors, final):
- """
- There are three possibilities for each decoding step:
-
- - Decode as much real UTF-8 as possible.
- - Decode a six-byte CESU-8 sequence at the current position.
- - Decode a Java-style null at the current position.
-
- This method figures out which step is appropriate, and does it.
- """
- # Get a reference to the superclass method that we'll be using for
- # most of the real work.
- sup = UTF8IncrementalDecoder._buffer_decode
-
- # Find the next byte position that indicates a variant of UTF-8.
- # CESU-8 sequences always start with 0xed, and Java nulls always
- # start with 0xc0, both of which are conveniently impossible in
- # real UTF-8.
- cutoff1 = input.find(b'\xed')
- cutoff2 = input.find(b'\xc0')
-
- # Set `cutoff` to whichever cutoff comes first.
- if cutoff1 != -1 and cutoff2 != -1:
- cutoff = min(cutoff1, cutoff2)
- elif cutoff1 != -1:
- cutoff = cutoff1
- elif cutoff2 != -1:
- cutoff = cutoff2
- else:
- # The entire input can be decoded as UTF-8, so just do so.
- return sup(input, errors, final)
-
- if cutoff1 == 0:
- # Decode a possible six-byte sequence starting with 0xed.
- return self._buffer_decode_surrogates(sup, input, errors, final)
- elif cutoff2 == 0:
- # Decode a possible two-byte sequence, 0xc0 0x80.
- return self._buffer_decode_null(sup, input, errors, final)
- else:
- # Decode the bytes up until the next weird thing as UTF-8.
- # Set final=True because 0xc0 and 0xed don't make sense in the
- # middle of a sequence, in any variant.
- return sup(input[:cutoff], errors, True)
-
- @staticmethod
- def _buffer_decode_null(sup, input, errors, final):
- """
- Decode the bytes 0xc0 0x80 as U+0000, like Java does.
- """
- nextbyte = input[1:2]
- if nextbyte == b'':
- if final:
- # We found 0xc0 at the end of the stream, which is an error.
- # Delegate to the superclass method to handle that error.
- return sup(input, errors, final)
- else:
- # We found 0xc0 and we don't know what comes next, so consume
- # no bytes and wait.
- return '', 0
- elif nextbyte == b'\x80':
- # We found the usual 0xc0 0x80 sequence, so decode it and consume
- # two bytes.
- return '\u0000', 2
- else:
- # We found 0xc0 followed by something else, which is an error.
- # Whatever should happen is equivalent to what happens when the
- # superclass is given just the byte 0xc0, with final=True.
- return sup(b'\xc0', errors, True)
-
- @staticmethod
- def _buffer_decode_surrogates(sup, input, errors, final):
- """
- When we have improperly encoded surrogates, we can still see the
- bits that they were meant to represent.
-
- The surrogates were meant to encode a 20-bit number, to which we
- add 0x10000 to get a codepoint. That 20-bit number now appears in
- this form:
-
- 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
-
- The CESU8_RE above matches byte sequences of this form. Then we need
- to extract the bits and assemble a codepoint number from them.
- """
- if len(input) < 6:
- if final:
- # We found 0xed near the end of the stream, and there aren't
- # six bytes to decode. Delegate to the superclass method to
- # handle it as normal UTF-8. It might be a Hangul character
- # or an error.
- if PYTHON2 and len(input) >= 3:
- # We can't trust Python 2 to raise an error when it's
- # asked to decode a surrogate, so let's force the issue.
- input = mangle_surrogates(input)
- return sup(input, errors, final)
- else:
- # We found 0xed, the stream isn't over yet, and we don't know
- # enough of the following bytes to decode anything, so consume
- # zero bytes and wait.
- return '', 0
- else:
- if CESU8_RE.match(input):
- # If this is a CESU-8 sequence, do some math to pull out
- # the intended 20-bit value, and consume six bytes.
- bytenums = bytes_to_ints(input[:6])
- codepoint = (
- ((bytenums[1] & 0x0f) << 16) +
- ((bytenums[2] & 0x3f) << 10) +
- ((bytenums[4] & 0x0f) << 6) +
- (bytenums[5] & 0x3f) +
- 0x10000
- )
- return unichr(codepoint), 6
- else:
- # This looked like a CESU-8 sequence, but it wasn't one.
- # 0xed indicates the start of a three-byte sequence, so give
- # three bytes to the superclass to decode as usual -- except
- # for working around the Python 2 discrepancy as before.
- if PYTHON2:
- input = mangle_surrogates(input)
- return sup(input[:3], errors, False)
-
-
-def mangle_surrogates(bytestring):
- """
- When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
- it as an error (which it is). In 'replace' mode, it will decode as three
- replacement characters. But Python 2 will just output the surrogate
- codepoint.
-
- To ensure consistency between Python 2 and Python 3, and protect downstream
- applications from malformed strings, we turn surrogate sequences at the
- start of the string into the bytes `ff ff ff`, which we're *sure* won't
- decode, and which turn into three replacement characters in 'replace' mode.
- """
- if PYTHON2:
- if bytestring.startswith(b'\xed') and len(bytestring) >= 3:
- decoded = bytestring[:3].decode('utf-8', 'replace')
- if '\ud800' <= decoded <= '\udfff':
- return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:])
- return bytestring
- else:
- # On Python 3, nothing needs to be done.
- return bytestring
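-
-
-def _demo_surrogate_math():
-    # A worked example of the bit arithmetic in _buffer_decode_surrogates,
-    # using the sample bytes from the package docstring: the CESU-8
-    # encoding of U+1F60D.
-    bytenums = bytes_to_ints(b'\xed\xa0\xbd\xed\xb8\x8d')
-    codepoint = (
-        ((bytenums[1] & 0x0f) << 16) +
-        ((bytenums[2] & 0x3f) << 10) +
-        ((bytenums[4] & 0x0f) << 6) +
-        (bytenums[5] & 0x3f) +
-        0x10000
-    )
-    assert codepoint == 0x1F60D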
-
-# The encoder is identical to UTF-8.
-IncrementalEncoder = UTF8IncrementalEncoder
-
-
-# Everything below here is boilerplate that matches the modules in the
-# built-in `encodings` package.
-def encode(input, errors='strict'):
- return IncrementalEncoder(errors).encode(input, final=True), len(input)
-
-
-def decode(input, errors='strict'):
- return IncrementalDecoder(errors).decode(input, final=True), len(input)
-
-
-class StreamWriter(codecs.StreamWriter):
- encode = encode
-
-
-class StreamReader(codecs.StreamReader):
- decode = decode
-
-
-CODEC_INFO = codecs.CodecInfo(
- name=NAME,
- encode=encode,
- decode=decode,
- incrementalencoder=IncrementalEncoder,
- incrementaldecoder=IncrementalDecoder,
- streamreader=StreamReader,
- streamwriter=StreamWriter,
-)
diff --git a/lib/ftfy/badness.py b/lib/ftfy/badness.py
deleted file mode 100644
index f94fc552..00000000
--- a/lib/ftfy/badness.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Heuristics to determine whether re-encoding text is actually making it
-more reasonable.
-"""
-
-from __future__ import unicode_literals
-from ftfy.chardata import chars_to_classes
-import re
-import unicodedata
-
-# The following regex uses the mapping of character classes to ASCII
-# characters defined in chardata.py and build_data.py:
-#
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# P = Private use (Co)
-# 0 = Math symbol (Sm)
-# 1 = Currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-# = Whitespace
-# o = Other
-
-
-def _make_weirdness_regex():
- """
- Creates a regex that matches 'weird' character sequences.
- The more matches there are, the weirder the text is.
- """
- groups = []
-
- # Match lowercase letters that are followed by non-ASCII uppercase letters
- groups.append('lA')
-
- # Match diacritical marks, except when they modify a non-cased letter or
- # another mark.
- #
- # You wouldn't put a diacritical mark on a digit or a space, for example.
- # You might put it on a Latin letter, but in that case there will almost
- # always be a pre-composed version, and we normalize to pre-composed
- # versions first. The cases that can't be pre-composed tend to be in
- # large scripts without case, which are in class C.
- groups.append('[^CM]M')
-
- # Match non-Latin characters adjacent to Latin characters.
- #
- # This is a simplification from ftfy version 2, which compared all
- # adjacent scripts. However, the ambiguities we need to resolve come from
- # encodings designed to represent Latin characters.
- groups.append('[Ll][AaC]')
- groups.append('[AaC][Ll]')
-
- # Match C1 control characters, which are almost always the result of
- # decoding Latin-1 that was meant to be Windows-1252.
- groups.append('X')
-
- # Match private use and unassigned characters.
- groups.append('P')
- groups.append('_')
-
- # Match adjacent characters from any different pair of these categories:
- # - Modifier marks (M)
- # - Letter modifiers (m)
- # - Miscellaneous numbers (N)
- # - Symbols (0123)
-
- exclusive_categories = 'MmN0123'
- for cat1 in exclusive_categories:
- others_range = ''.join(c for c in exclusive_categories if c != cat1)
- groups.append('{cat1}[{others_range}]'.format(
- cat1=cat1, others_range=others_range
- ))
- regex = '|'.join('({0})'.format(group) for group in groups)
- return re.compile(regex)
-
-WEIRDNESS_RE = _make_weirdness_regex()
-
-# A few characters are common ending punctuation that can show up at the end
-# of a mojibake sequence. It's plausible that such a character could appear
-# after an accented capital letter, for example, so we'll want to add a
-# slight preference to leave these characters alone.
-#
-# The match ends with a + so that we only give the bonus once for a
-# consecutive sequence of these characters.
-ENDING_PUNCT_RE = re.compile(
- '['
- '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
- '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
- '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
- '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
- ']+'
-)
-
-def sequence_weirdness(text):
- """
- Determine how often a text has unexpected characters or sequences of
- characters. This metric is used to disambiguate when text should be
- re-decoded or left as is.
-
- We start by normalizing text in NFC form, so that penalties for
- diacritical marks don't apply to characters that know what to do with
- them.
-
- The following things are deemed weird:
-
- - Lowercase letters followed by non-ASCII uppercase letters
- - Non-Latin characters next to Latin characters
- - Un-combined diacritical marks, unless they're stacking on non-alphabetic
- characters (in languages that do that kind of thing a lot) or other
- marks
- - C1 control characters
- - Adjacent symbols from any different pair of these categories:
-
- - Modifier marks
- - Letter modifiers
- - Non-digit numbers
- - Symbols (including math and currency)
-
- The return value is the number of instances of weirdness.
- """
- text2 = unicodedata.normalize('NFC', text)
- weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
- punct_discount = len(ENDING_PUNCT_RE.findall(text2))
- return weirdness * 2 - punct_discount
-
-
-def text_cost(text):
- """
- An overall cost function for text. Weirder is worse, but all else being
- equal, shorter strings are better.
-
- The overall cost is measured as the "weirdness" (see
- :func:`sequence_weirdness`) plus the length.
- """
- return sequence_weirdness(text) + len(text)
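-
-
-def _demo_weirdness():
-    # An illustrative comparison with made-up strings: a C1 control
-    # character counts as weird, while the ellipsis it was probably meant
-    # to be earns a small ending-punctuation discount.
-    assert sequence_weirdness('The story ends\x85') > 0
-    assert sequence_weirdness('The story ends\u2026') < 0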
diff --git a/lib/ftfy/build_data.py b/lib/ftfy/build_data.py
deleted file mode 100644
index f556b306..00000000
--- a/lib/ftfy/build_data.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-A script to make the char_classes.dat file.
-
-This never needs to run in normal usage. It needs to be run if the character
-classes we care about change, or if a new version of Python supports a new
-Unicode standard and we want it to affect our string decoding.
-
-The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
-You can certainly use it in earlier versions. This simply makes sure that we
-get consistent results from running ftfy on different versions of Python.
-
-The file will be written to the current directory.
-"""
-from __future__ import unicode_literals
-import unicodedata
-import sys
-import zlib
-if sys.hexversion >= 0x03000000:
- unichr = chr
-
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# P = Private use (Co)
-# 0 = Math symbol (Sm)
-# 1 = Currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-# = Whitespace
-# o = Other
-
-
-def make_char_data_file(do_it_anyway=False):
- """
- Build the compressed data file 'char_classes.dat' and write it to the
- current directory.
-
- If you run this, run it in Python 3.3 or later. It will run in earlier
- versions, but you won't get the current Unicode standard, leading to
- inconsistent behavior. To protect against this, running this in the
- wrong version of Python will raise an error unless you pass
- `do_it_anyway=True`.
- """
- if sys.hexversion < 0x03030000 and not do_it_anyway:
- raise RuntimeError(
- "This function should be run in Python 3.3 or later."
- )
-
- cclasses = [None] * 0x110000
- for codepoint in range(0x0, 0x110000):
- char = unichr(codepoint)
- category = unicodedata.category(char)
-
- if category.startswith('L'): # letters
- is_latin = unicodedata.name(char).startswith('LATIN')
- if is_latin and codepoint < 0x200:
- if category == 'Lu':
- cclasses[codepoint] = 'L'
- else:
- cclasses[codepoint] = 'l'
- else: # non-Latin letter, or close enough
- if category == 'Lu' or category == 'Lt':
- cclasses[codepoint] = 'A'
- elif category == 'Ll':
- cclasses[codepoint] = 'a'
- elif category == 'Lo':
- cclasses[codepoint] = 'C'
- elif category == 'Lm':
- cclasses[codepoint] = 'm'
- else:
- raise ValueError('got some weird kind of letter')
- elif category.startswith('M'): # marks
- cclasses[codepoint] = 'M'
- elif category == 'No':
- cclasses[codepoint] = 'N'
- elif category == 'Sm':
- cclasses[codepoint] = '0'
- elif category == 'Sc':
- cclasses[codepoint] = '1'
- elif category == 'Sk':
- cclasses[codepoint] = '2'
- elif category == 'So':
- cclasses[codepoint] = '3'
- elif category == 'Cn':
- cclasses[codepoint] = '_'
- elif category == 'Cc':
- cclasses[codepoint] = 'X'
- elif category == 'Cs':
- cclasses[codepoint] = 'S'
- elif category == 'Co':
- cclasses[codepoint] = 'P'
- elif category.startswith('Z'):
- cclasses[codepoint] = ' '
- else:
- cclasses[codepoint] = 'o'
-
- cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
- out = open('char_classes.dat', 'wb')
- out.write(zlib.compress(''.join(cclasses).encode('ascii')))
- out.close()
-
-if __name__ == '__main__':
- make_char_data_file()
diff --git a/lib/ftfy/char_classes.dat b/lib/ftfy/char_classes.dat
deleted file mode 100644
index 84155cd5..00000000
Binary files a/lib/ftfy/char_classes.dat and /dev/null differ
diff --git a/lib/ftfy/chardata.py b/lib/ftfy/chardata.py
deleted file mode 100644
index e853ed3e..00000000
--- a/lib/ftfy/chardata.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-This gives other modules access to the gritty details about characters and the
-encodings that use them.
-"""
-
-from __future__ import unicode_literals
-import re
-import zlib
-from pkg_resources import resource_string
-from ftfy.compatibility import unichr
-
-# These are the five encodings we will try to fix in ftfy, in the
-# order that they should be tried.
-CHARMAP_ENCODINGS = [
- 'latin-1',
- 'sloppy-windows-1252',
- 'macroman',
- 'cp437',
- 'sloppy-windows-1251',
-]
-
-
-def _build_regexes():
- """
- ENCODING_REGEXES contain reasonably fast ways to detect if we
- could represent a given string in a given encoding. The simplest one is
- the 'ascii' detector, which of course just determines if all characters
- are between U+0000 and U+007F.
- """
- # Define a regex that matches ASCII text.
- encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
-
- for encoding in CHARMAP_ENCODINGS:
- latin1table = ''.join(unichr(i) for i in range(128, 256))
- charlist = latin1table.encode('latin-1').decode(encoding)
-
- # Build a regex from the ASCII range, followed by the decodings of
- # bytes 0x80-0xff in this character set. (This uses the fact that all
- # regex special characters are ASCII, and therefore won't appear in the
- # string.)
- regex = '^[\x00-\x7f{0}]*$'.format(charlist)
- encoding_regexes[encoding] = re.compile(regex)
- return encoding_regexes
-ENCODING_REGEXES = _build_regexes()
-
-
-def possible_encoding(text, encoding):
- """
- Given text and a single-byte encoding, check whether that text could have
- been decoded from that single-byte encoding.
-
- In other words, check whether it can be encoded in that encoding, possibly
- sloppily.
- """
- return bool(ENCODING_REGEXES[encoding].match(text))
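-
-
-def _demo_possible_encoding():
-    # Illustrative checks with made-up strings: plain ASCII fits every
-    # encoding we test, while a euro sign rules out Latin-1 but not
-    # sloppy-windows-1252.
-    assert possible_encoding('plain text', 'ascii')
-    assert not possible_encoding('\u20ac100', 'latin-1')
-    assert possible_encoding('\u20ac100', 'sloppy-windows-1252')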
-
-
-CHAR_CLASS_STRING = zlib.decompress(
- resource_string(__name__, 'char_classes.dat')
-).decode('ascii')
-
-def chars_to_classes(string):
- """
- Convert each Unicode character to a letter indicating which of many
- classes it's in.
-
- See build_data.py for where this data comes from and what it means.
- """
- return string.translate(CHAR_CLASS_STRING)
-
-
-# A translate mapping that will strip all C0 control characters, plus DEL
-# (U+007F), except those that represent whitespace. DEL is included so the
-# behavior matches the documentation of `remove_control_chars`.
-CONTROL_CHARS = {}
-for i in list(range(32)) + [0x7f]:
- CONTROL_CHARS[i] = None
-
-# Map whitespace control characters to themselves.
-for char in '\t\n\f\r':
- del CONTROL_CHARS[ord(char)]
diff --git a/lib/ftfy/cli.py b/lib/ftfy/cli.py
deleted file mode 100644
index 6ac83706..00000000
--- a/lib/ftfy/cli.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""
-A simple command-line utility for fixing text found in a file.
-
-Because files do not come with their encoding marked, it first runs the file
-through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
-"""
-from ftfy import fix_file
-
-import sys
-ENCODE_STDOUT = (sys.hexversion < 0x03000000)
-
-
-def main():
- """
- Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
- the 'argparse' module.)
- """
- import argparse
-
- parser = argparse.ArgumentParser()
- parser.add_argument('filename', help='file to transcode')
-
- args = parser.parse_args()
-
- file = open(args.filename)
- for line in fix_file(file):
- if ENCODE_STDOUT:
- sys.stdout.write(line.encode('utf-8'))
- else:
- sys.stdout.write(line)
-
-
-if __name__ == '__main__':
- main()
diff --git a/lib/ftfy/compatibility.py b/lib/ftfy/compatibility.py
deleted file mode 100644
index 1246248c..00000000
--- a/lib/ftfy/compatibility.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""
-Makes some function names and behavior consistent between Python 2 and
-Python 3, and also between narrow and wide builds.
-"""
-from __future__ import unicode_literals
-import sys
-import re
-import unicodedata
-
-if sys.hexversion >= 0x03000000:
- from html import entities
- unichr = chr
- xrange = range
- PYTHON2 = False
-else:
- import htmlentitydefs as entities
- unichr = unichr
- xrange = xrange
- PYTHON2 = True
-htmlentitydefs = entities
-
-PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)
-
-
-def _narrow_unichr_workaround(codepoint):
- """
- A replacement for unichr() on narrow builds of Python. This will get
- us the narrow representation of an astral character, which will be
- a string of length two, containing two UTF-16 surrogates.
- """
- escaped = b'\\U%08x' % codepoint
- return escaped.decode('unicode-escape')
-
-
-if sys.maxunicode < 0x10000:
- unichr = _narrow_unichr_workaround
- # In a narrow build of Python, we can't write a regex involving astral
- # characters. If we want to write the regex:
- #
- # [\U00100000-\U0010ffff]
- #
- # The actual string that defines it quietly turns into:
- #
- # [\udbc0\udc00-\udbff\udfff]
- #
- # And now the range operator only applies to the middle two characters.
- # It looks like a range that's going backwards from \dc00 to \dbff,
- # which is an error.
- #
- # What we can do instead is rewrite the expression to be _about_ the two
- # surrogates that make up the astral characters, instead of the characters
- # themselves. This would be wrong on a wide build, but it works on a
- # narrow build.
- UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
-else:
- UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')
-
-
-def bytes_to_ints(bytestring):
- """
- No matter what version of Python this is, make a sequence of integers from
- a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a
- sequence of integers.
- """
- if PYTHON2:
- return [ord(b) for b in bytestring]
- else:
- return bytestring
-
-
-def is_printable(char):
- """
- str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so
- let's make a crude approximation in Python 2.
- """
- if PYTHON2:
- return not unicodedata.category(char).startswith('C')
- else:
- return char.isprintable()
diff --git a/lib/ftfy/fixes.py b/lib/ftfy/fixes.py
deleted file mode 100644
index 8da51aa4..00000000
--- a/lib/ftfy/fixes.py
+++ /dev/null
@@ -1,473 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-This module contains the individual fixes that the main fix_text function
-can perform.
-"""
-
-from __future__ import unicode_literals
-from ftfy.chardata import (possible_encoding,
- CHARMAP_ENCODINGS, CONTROL_CHARS)
-from ftfy.badness import text_cost
-from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
-import re
-import sys
-import codecs
-
-
-BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
-
-ftfy is designed to fix problems that were introduced by handling Unicode
-incorrectly. It might be able to fix the bytes you just handed it, but the
-fact that you just gave a pile of bytes to a function that fixes text means
-that your code is *also* handling Unicode incorrectly.
-
-ftfy takes Unicode text as input. You should take these bytes and decode
-them from the encoding you think they are in. If you're not sure what encoding
-they're in:
-
-- First, try to find out. 'utf-8' is a good assumption.
-- If the encoding is simply unknowable, try running your bytes through
- ftfy.guess_bytes. As the name implies, this may not always be accurate.
-
-If you're confused by this, please read the Python Unicode HOWTO:
-
- http://docs.python.org/%d/howto/unicode.html
-""" % sys.version_info[0]
-
-
-def fix_text_encoding(text):
- r"""
- Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.
-
- Something you will find all over the place, in real-world text, is text
- that's mistakenly encoded as utf-8, decoded in some ugly format like
- latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
-
- This causes your perfectly good Unicode-aware code to end up with garbage
- text because someone else (or maybe "someone else") made a mistake.
-
- This function looks for the evidence of that having happened and fixes it.
- It determines whether it should replace nonsense sequences of single-byte
- characters that were really meant to be UTF-8 characters, and if so, turns
- them into the correctly-encoded Unicode character that they were meant to
- represent.
-
- The input to the function must be Unicode. If you don't have Unicode text,
- you're not using the right tool to solve your problem.
-
- .. note::
- The following examples are written using unmarked literal strings,
- but they are Unicode text. In Python 2 we have "unicode_literals"
- turned on, and in Python 3 this is always the case.
-
- ftfy decodes text that looks like it was decoded incorrectly. It leaves
- alone text that doesn't.
-
- >>> print(fix_text_encoding('Ãºnico'))
- único
-
- >>> print(fix_text_encoding('This text is fine already :þ'))
- This text is fine already :þ
-
- Because these characters often come from Microsoft products, we allow
- for the possibility that we get not just Unicode characters 128-255, but
- also Windows's conflicting idea of what characters 128-160 are.
-
- >>> print(fix_text_encoding('This â€” should be an em dash'))
- This — should be an em dash
-
- We might have to deal with both Windows characters and raw control
- characters at the same time, especially when dealing with characters like
- 0x81 that have no mapping in Windows. This is a string that Python's
- standard `.encode` and `.decode` methods cannot correct.
-
- >>> print(fix_text_encoding('This text is sad .â\x81”.'))
- This text is sad .⁔.
-
- However, it has safeguards against fixing sequences of letters and
- punctuation that can occur in valid text:
-
- >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
- not such a fan of Charlotte Brontë…”
-
- Cases of genuine ambiguity can sometimes be addressed by finding other
- characters that are not double-encoded, and expecting the encoding to
- be consistent:
-
- >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
- AHÅ™, the new sofa from IKEA®
-
- Finally, we handle the case where the text is in a single-byte encoding
- that was intended as Windows-1252 all along but read as Latin-1:
-
- >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
- This text was never UTF-8 at all…
-
- The best version of the text is found using
- :func:`ftfy.badness.text_cost`.
- """
- text, _plan = fix_encoding_and_explain(text)
- return text
-
-
-def fix_encoding_and_explain(text):
- """
- Re-decodes text that has been decoded incorrectly, and also return a
- "plan" indicating all the steps required to fix it.
-
- To fix similar text in the same way, without having to detect anything,
- you can use the ``apply_plan`` function.
- """
- best_version = text
- best_cost = text_cost(text)
- best_plan = []
- plan_so_far = []
- while True:
- prevtext = text
- text, plan = fix_one_step_and_explain(text)
- plan_so_far.extend(plan)
- cost = text_cost(text)
-
- # Add a penalty if we used a particularly obsolete encoding. The result
- # is that we won't use these encodings unless they can successfully
- # replace multiple characters.
- if ('encode', 'macroman') in plan_so_far or\
- ('encode', 'cp437') in plan_so_far:
- cost += 2
-
- # We need pretty solid evidence to decode from Windows-1251 (Cyrillic).
- if ('encode', 'sloppy-windows-1251') in plan_so_far:
- cost += 5
-
- if cost < best_cost:
- best_cost = cost
- best_version = text
- best_plan = list(plan_so_far)
- if text == prevtext:
- return best_version, best_plan
-
-
-def fix_one_step_and_explain(text):
- """
- Performs a single step of re-decoding text that's been decoded incorrectly.
-
- Returns the decoded text, plus a "plan" for how to reproduce what it
- did.
- """
- if isinstance(text, bytes):
- raise UnicodeError(BYTES_ERROR_TEXT)
- if len(text) == 0:
- return text, []
-
- # The first plan is to return ASCII text unchanged.
- if possible_encoding(text, 'ascii'):
- return text, []
-
- # As we go through the next step, remember the possible encodings
- # that we encounter but don't successfully fix yet. We may need them
- # later.
- possible_1byte_encodings = []
-
- # Suppose the text was supposed to be UTF-8, but it was decoded using
- # a single-byte encoding instead. When these cases can be fixed, they
- # are usually the correct thing to do, so try them next.
- for encoding in CHARMAP_ENCODINGS:
- if possible_encoding(text, encoding):
- encoded_bytes = text.encode(encoding)
-
- # Now, find out if it's UTF-8 (or close enough). Otherwise,
- # remember the encoding for later.
- try:
- decoding = 'utf-8'
- if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
- decoding = 'utf-8-variants'
- fixed = encoded_bytes.decode(decoding)
- steps = [('encode', encoding), ('decode', decoding)]
- return fixed, steps
- except UnicodeDecodeError:
- possible_1byte_encodings.append(encoding)
-
- # The next most likely case is that this is Latin-1 that was intended to
- # be read as Windows-1252, because those two encodings in particular are
- # easily confused.
- if 'latin-1' in possible_1byte_encodings:
- if 'windows-1252' in possible_1byte_encodings:
- # This text is in the intersection of Latin-1 and
- # Windows-1252, so it's probably legit.
- return text, []
- else:
- # Otherwise, it means we have characters that are in Latin-1 but
- # not in Windows-1252. Those are C1 control characters. Nobody
- # wants those. Assume they were meant to be Windows-1252. Don't
- # use the sloppy codec, because bad Windows-1252 characters are
- # a bad sign.
- encoded = text.encode('latin-1')
- try:
- fixed = encoded.decode('windows-1252')
- steps = []
- if fixed != text:
- steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
- return fixed, steps
- except UnicodeDecodeError:
- # This text contained characters that don't even make sense
- # if you assume they were supposed to be Windows-1252. In
- # that case, let's not assume anything.
- pass
-
- # The cases that remain are mixups between two different single-byte
- # encodings, and not the common case of Latin-1 vs. Windows-1252.
- #
- # Those cases are somewhat rare, and impossible to solve without false
- # positives. If you're in one of these situations, you should try using
- # the `ftfy.guess_bytes` function.
-
- # Return the text unchanged; the plan is empty.
- return text, []
-
-
-def apply_plan(text, plan):
- """
- Apply a plan for fixing the encoding of text.
-
- The plan is a list of tuples of the form (operation, encoding), where
- `operation` is either 'encode' or 'decode', and `encoding` is an encoding
- name such as 'utf-8' or 'latin-1'.
-
- Because only text can be encoded, and only bytes can be decoded, the plan
- should alternate 'encode' and 'decode' steps, or else this function will
- encounter an error.
- """
- obj = text
- for operation, encoding in plan:
- if operation == 'encode':
- obj = obj.encode(encoding)
- elif operation == 'decode':
- obj = obj.decode(encoding)
- else:
- raise ValueError("Unknown plan step: %s" % operation)
-
- return obj
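-
-
-def _demo_apply_plan():
-    # An illustrative round trip with made-up strings: the plan produced
-    # for one piece of mojibake can be replayed on other text that was
-    # broken in the same way.
-    fixed, plan = fix_encoding_and_explain('desayuno con churros: \u00c3\u00b1am')
-    assert fixed == 'desayuno con churros: \u00f1am'
-    assert apply_plan('ma\u00c3\u00b1ana', plan) == 'ma\u00f1ana'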
-
-
-HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")
-
-
-def unescape_html(text):
- """
- Decode all three types of HTML entities/character references.
-
- Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
- to it for efficiency: it won't match entities longer than 8 characters,
- because there are no valid entities like that.
-
- >>> print(unescape_html('&lt;tag&gt;'))
- <tag>
- """
- def fixup(match):
- """
- Replace one matched HTML entity with the character it represents,
- if possible.
- """
- text = match.group(0)
- if text[:2] == "":
- # character reference
- try:
- if text[:3] == "":
- return unichr(int(text[3:-1], 16))
- else:
- return unichr(int(text[2:-1]))
- except ValueError:
- pass
- else:
- # named entity
- try:
- text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
- except KeyError:
- pass
- return text # leave as is
- return HTML_ENTITY_RE.sub(fixup, text)
-
-
-ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')
-
-def remove_terminal_escapes(text):
- r"""
- Strip out "ANSI" terminal escape sequences, such as those that produce
- colored text on Unix.
-
- >>> print(remove_terminal_escapes(
- ... "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
- ... ))
- I'm blue, da ba dee da ba doo...
- """
- return ANSI_RE.sub('', text)
-
-
-SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
-DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')
-
-def uncurl_quotes(text):
- r"""
- Replace curly quotation marks with straight equivalents.
-
- >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
- "here's a test"
- """
- return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))
-
-
-def fix_line_breaks(text):
- r"""
- Convert all line breaks to Unix style.
-
- This will convert the following sequences into the standard \\n
- line break:
-
- - CRLF (\\r\\n), used on Windows and in some communication
- protocols
- - CR (\\r), once used on Mac OS Classic, and now kept alive
- by misguided software such as Microsoft Office for Mac
- - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
- defined by Unicode and used to sow confusion and discord
- - NEXT LINE (\\x85), a C1 control character that is certainly
- not what you meant
-
- The NEXT LINE character is a bit of an odd case, because it
- usually won't show up if `fix_encoding` is also being run.
- \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.
-
- >>> print(fix_line_breaks(
- ... "This string is made of two things:\u2029"
- ... "1. Unicode\u2028"
- ... "2. Spite"
- ... ))
- This string is made of two things:
- 1. Unicode
- 2. Spite
-
- For further testing and examples, let's define a function to make sure
- we can see the control characters in their escaped form:
-
- >>> def eprint(text):
- ... print(text.encode('unicode-escape').decode('ascii'))
-
- >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
- Content-type: text/plain\n\nHi.
-
- >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
- This is how Microsoft \n trolls Mac users
-
- >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
- What is this \n I don't even
- """
- return text.replace('\r\n', '\n').replace('\r', '\n')\
- .replace('\u2028', '\n').replace('\u2029', '\n')\
- .replace('\u0085', '\n')
-
-
-def remove_control_chars(text):
- """
- Remove all control characters except for the important ones.
-
- This removes characters in these ranges:
-
- - U+0000 to U+0008
- - U+000B
- - U+000E to U+001F
- - U+007F
-
- It leaves alone these characters that are commonly used for formatting:
-
- - TAB (U+0009)
- - LF (U+000A)
- - FF (U+000C)
- - CR (U+000D)
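-
- For example, 'A\x07B\x00C' (with BEL and NUL) would come back as
- 'ABC', while 'tab\tand\nnewline' would be left alone.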
- """
- return text.translate(CONTROL_CHARS)
-
-
-def remove_bom(text):
- r"""
- Remove a left-over byte-order mark.
-
- >>> print(remove_bom("\ufeffWhere do you want to go today?"))
- Where do you want to go today?
- """
- return text.lstrip(unichr(0xfeff))
-
-
-def remove_unsafe_private_use(text):
- r"""
- Python 3.3's Unicode support isn't perfect, and in fact there are certain
- string operations that will crash some versions of it with a SystemError:
- http://bugs.python.org/issue18183
-
- The best solution is to remove all characters from Supplementary Private
- Use Area B, using a regex that is known not to crash given those
- characters.
-
- These are the characters from U+100000 to U+10FFFF. It's sad to lose an
- entire plane of Unicode, but on the other hand, these characters are not
- assigned and never will be. If you get one of these characters and don't
- know what its purpose is, its purpose is probably to crash your code.
-
- If you were using these for actual private use, this might be inconvenient.
- You can turn off this fixer, of course, but I kind of encourage using
- Supplementary Private Use Area A instead.
-
- >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
- 💩
-
- This fixer is off by default in Python 3.4 or later. (The bug is actually
- fixed in 3.3.3 and 2.7.6, but I don't want the default behavior to change
- based on a micro version upgrade of Python.)
- """
- return UNSAFE_PRIVATE_USE_RE.sub('', text)
-
-
-# Define a regex to match valid escape sequences in Python string literals.
-ESCAPE_SEQUENCE_RE = re.compile(r'''
- ( \\U........ # 8-digit hex escapes
- | \\u.... # 4-digit hex escapes
- | \\x.. # 2-digit hex escapes
- | \\[0-7]{1,3} # Octal escapes
- | \\N\{[^}]+\} # Unicode characters by name
- | \\[\\'"abfnrtv] # Single-character escapes
- )''', re.UNICODE | re.VERBOSE)
-
-
-def decode_escapes(text):
- r"""
- Decode backslashed escape sequences, including \\x, \\u, and \\U character
- references, even in the presence of other Unicode.
-
- This is what Python's "string-escape" and "unicode-escape" codecs were
- meant to do, but in contrast, this actually works. It will decode the
- string exactly the same way that the Python interpreter decodes its string
- literals.
-
- >>> factoid = '\\u20a1 is the currency symbol for the colón.'
- >>> print(factoid[1:])
- u20a1 is the currency symbol for the colón.
- >>> print(decode_escapes(factoid))
- ₡ is the currency symbol for the colón.
-
- Even though Python itself can read string literals with a combination of
- escapes and literal Unicode -- you're looking at one right now -- the
- "unicode-escape" codec doesn't work on literal Unicode. (See
- http://stackoverflow.com/a/24519338/773754 for more details.)
-
- Instead, this function searches for just the parts of a string that
- represent escape sequences, and decodes them, leaving the rest alone. All
- valid escape sequences are made of ASCII characters, and this allows
- "unicode-escape" to work correctly.
-
- This fix cannot be automatically applied by the `ftfy.fix_text` function,
- because escaped text is not necessarily a mistake, and there is no way
- to distinguish text that's supposed to be escaped from text that isn't.
- """
- def decode_match(match):
- "Given a regex match, decode the escape sequence it contains."
- return codecs.decode(match.group(0), 'unicode-escape')
-
- return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
diff --git a/lib/ftfy/streamtester/__init__.py b/lib/ftfy/streamtester/__init__.py
deleted file mode 100644
index 4b5c0614..00000000
--- a/lib/ftfy/streamtester/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""
-This file defines a general method for evaluating ftfy using data that arrives
-in a stream. A concrete implementation of it is found in `twitter_tester.py`.
-"""
-from __future__ import print_function, unicode_literals
-from ftfy.fixes import fix_text_encoding
-from ftfy.chardata import possible_encoding
-
-
-class StreamTester:
- """
- Take in a sequence of texts, and show the ones that will be changed by
- ftfy. This will also periodically show updates, such as the proportion of
- texts that changed.
- """
- def __init__(self):
- self.num_fixed = 0
- self.count = 0
-
- def check_ftfy(self, text):
- """
- Given a single text input, check whether `ftfy.fix_text_encoding`
- would change it. If so, display the change.
- """
- self.count += 1
- if not possible_encoding(text, 'ascii'):
- fixed = fix_text_encoding(text)
- if text != fixed:
- # possibly filter common bots before printing
- print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
- text=text, fixed=fixed
- ))
- self.num_fixed += 1
-
- # Print status updates once in a while
- if self.count % 100 == 0:
- print('.', end='', flush=True)
- if self.count % 10000 == 0:
- print('\n%d/%d fixed' % (self.num_fixed, self.count))
diff --git a/lib/ftfy/streamtester/oauth.py b/lib/ftfy/streamtester/oauth.py
deleted file mode 100644
index 8e300ed7..00000000
--- a/lib/ftfy/streamtester/oauth.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding: utf-8
-"""
-Do what is necessary to authenticate this tester as a Twitter "app", using
-somebody's Twitter account.
-"""
-from __future__ import unicode_literals
-import os
-
-
-AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')
-
-def get_auth():
- """
- Twitter has some bizarre requirements about how to authorize an "app" to
- use its API.
-
- The user of the app has to log in to get a secret token. That's fine. But
- the app itself has its own "consumer secret" token. The app has to know it,
- and the user of the app has to not know it.
-
- This is, of course, impossible. It's equivalent to DRM. Your computer can't
- *really* make use of secret information while hiding the same information
- from you.
-
- The threat appears to be that, if you have this super-sekrit token, you can
- impersonate the app while doing something different. Well, of course you
- can do that, because you *have the source code* and you can change it to do
- what you want. You still have to log in as a particular user who has a
- token that's actually secret, you know.
-
- Even developers of closed-source applications that use the Twitter API are
- unsure what to do, for good reason. These "secrets" are not secret in any
- cryptographic sense. A bit of Googling shows that the secret tokens for
- every popular Twitter app are already posted on the Web.
-
- Twitter wants us to pretend this string can be kept secret, and hide this
- secret behind a fig leaf like everybody else does. So that's what we've
- done.
- """
-
- from twitter.oauth import OAuth
- from twitter import oauth_dance, read_token_file
-
- def unhide(secret):
- """
- Do something mysterious and exactly as secure as every other Twitter
- app.
- """
- return ''.join([chr(ord(c) - 0x2800) for c in secret])
-
- fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
- consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'
-
- if os.path.exists(AUTH_TOKEN_PATH):
- token, token_secret = read_token_file(AUTH_TOKEN_PATH)
- else:
- authdir = os.path.dirname(AUTH_TOKEN_PATH)
- if not os.path.exists(authdir):
- os.makedirs(authdir)
- token, token_secret = oauth_dance(
- app_name='ftfy-tester',
- consumer_key=consumer_key,
- consumer_secret=unhide(fig_leaf),
- token_filename=AUTH_TOKEN_PATH
- )
-
- return OAuth(
- token=token,
- token_secret=token_secret,
- consumer_key=consumer_key,
- consumer_secret=unhide(fig_leaf)
- )
-
diff --git a/lib/ftfy/streamtester/twitter_tester.py b/lib/ftfy/streamtester/twitter_tester.py
deleted file mode 100644
index 6ad125ee..00000000
--- a/lib/ftfy/streamtester/twitter_tester.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""
-Implements a StreamTester that runs over Twitter data. See the class
-docstring.
-
-This module is written for Python 3 only. The __future__ imports you see here
-are just to let Python 2 scan the file without crashing with a SyntaxError.
-"""
-from __future__ import print_function, unicode_literals
-import os
-from collections import defaultdict
-from ftfy.streamtester import StreamTester
-
-
-class TwitterTester(StreamTester):
- """
- This class uses the StreamTester code (defined in `__init__.py`) to
- evaluate ftfy's real-world performance, by feeding it live data from
- Twitter.
-
- This is a semi-manual evaluation. It requires a human to look at the
- results and determine if they are good. The three possible cases we
- can see here are:
-
- - Success: the process takes in mojibake and outputs correct text.
- - False positive: the process takes in correct text, and outputs
- mojibake. Every false positive should be considered a bug, and
- reported on GitHub if it isn't already.
- - Confusion: the process takes in mojibake and outputs different
- mojibake. Not a great outcome, but not as dire as a false
- positive.
-
- This tester cannot reveal false negatives. So far, that can only be
- done by the unit tests.
- """
- OUTPUT_DIR = './twitterlogs'
-
- def __init__(self):
- self.lines_by_lang = defaultdict(list)
- super().__init__()
-
- def save_files(self):
- """
- When processing data from live Twitter, save it to log files so that
- it can be replayed later.
- """
- if not os.path.exists(self.OUTPUT_DIR):
- os.makedirs(self.OUTPUT_DIR)
- for lang, lines in self.lines_by_lang.items():
- filename = 'tweets.{}.txt'.format(lang)
- fullname = os.path.join(self.OUTPUT_DIR, filename)
- langfile = open(fullname, 'a')
- for line in lines:
- print(line.replace('\n', ' '), file=langfile)
- langfile.close()
- self.lines_by_lang = defaultdict(list)
-
- def run_sample(self):
- """
- Listen to live data from Twitter, and pass on the fully-formed tweets
- to `check_ftfy`. This requires the `twitter` Python package as a
- dependency.
- """
- from twitter import TwitterStream
- from ftfy.streamtester.oauth import get_auth
- twitter_stream = TwitterStream(auth=get_auth())
- iterator = twitter_stream.statuses.sample()
- for tweet in iterator:
- if 'text' in tweet:
- self.check_ftfy(tweet['text'])
- if 'user' in tweet:
- lang = tweet['user'].get('lang', 'NONE')
- self.lines_by_lang[lang].append(tweet['text'])
- if self.count % 10000 == 100:
- self.save_files()
-
-
-def main():
- """
- When run from the command line, this script connects to the Twitter stream
- and runs the TwitterTester on it forever. Or at least until the stream
- drops.
- """
- tester = TwitterTester()
- tester.run_sample()
-
-
-if __name__ == '__main__':
- main()
-
diff --git a/sickbeard/encodingKludge.py b/sickbeard/encodingKludge.py
index de1fd499..a3d94541 100644
--- a/sickbeard/encodingKludge.py
+++ b/sickbeard/encodingKludge.py
@@ -17,53 +17,71 @@
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.
import os
+import traceback
import sickbeard
from sickbeard import logger
-import ftfy
-import ftfy.bad_codecs
+import six
+import chardet
+
# This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8
# encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions
# which return something should always return unicode.
-def fixStupidEncodings(x, silent=False):
- if type(x) == str:
- try:
- return str(ftfy.fix_text(u'' + x)).decode(sickbeard.SYS_ENCODING)
- except UnicodeDecodeError:
- logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
+def toUnicode(x):
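+ # Best-effort conversion: unicode passes through unchanged; otherwise
+ # try a plain six.text_type() conversion, then chardet detection and
+ # SYS_ENCODING, and as a last resort escape the raw bytes and retry.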
+ try:
+ if isinstance(x, unicode):
return x
- except UnicodeEncodeError:
- logger.log(u"Unable to encode value: " + repr(x), logger.ERROR)
- return x
- elif type(x) == unicode:
- return x
- else:
- logger.log(
- u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
- logger.DEBUG if silent else logger.ERROR)
+ else:
+ try:
+ return six.text_type(x)
+ except Exception:
+ try:
+ if chardet.detect(x).get('encoding') == 'utf-8':
+ return x.decode('utf-8')
+ if isinstance(x, str):
+ try:
+ return x.decode(sickbeard.SYS_ENCODING)
+ except UnicodeDecodeError:
+ raise
+ return x
+ except:
+ raise
+ except Exception:
+ logger.log('Unable to decode value "%s..." : %s ' % (repr(x)[:20], traceback.format_exc()), logger.WARNING)
+ ascii_text = str(x).encode('string_escape')
+ return toUnicode(ascii_text)
+def ss(x):
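+ # "Safe string": convert to unicode first, then encode in the system
+ # encoding, degrading to 'replace' errors and finally to UTF-8.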
+ u_x = toUnicode(x)
+
+ try:
+ return u_x.encode(sickbeard.SYS_ENCODING)
+ except Exception as e:
+ logger.log('Failed ss encoding char, force UTF8: %s' % e, logger.WARNING)
+ try:
+ return u_x.encode(sickbeard.SYS_ENCODING, 'replace')
+ except Exception:
+ return u_x.encode('utf-8', 'replace')
def fixListEncodings(x):
- if type(x) != list and type(x) != tuple:
+ if not isinstance(x, (list, tuple)):
return x
else:
- return filter(lambda x: x != None, map(fixStupidEncodings, x))
+ return filter(lambda x: x is not None, map(toUnicode, x))
def ek(func, *args, **kwargs):
if os.name == 'nt':
result = func(*args, **kwargs)
else:
- result = func(
- *[fixStupidEncodings(x).encode(sickbeard.SYS_ENCODING) if type(x) in (str, unicode) else x for x in args],
- **kwargs)
+ result = func(*[ss(x) if isinstance(x, (str, unicode)) else x for x in args], **kwargs)
- if type(result) in (list, tuple):
+ if isinstance(result, (list, tuple)):
return fixListEncodings(result)
- elif type(result) == str:
- return fixStupidEncodings(result)
+ elif isinstance(result, str):
+ return toUnicode(result)
else:
return result
diff --git a/sickbeard/exceptions.py b/sickbeard/exceptions.py
index 3a81bfa7..c209ee74 100644
--- a/sickbeard/exceptions.py
+++ b/sickbeard/exceptions.py
@@ -16,7 +16,7 @@
# You should have received a copy of the GNU General Public License
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
def ex(e):
"""
@@ -32,11 +32,11 @@ def ex(e):
if arg is not None:
if isinstance(arg, (str, unicode)):
- fixed_arg = fixStupidEncodings(arg, True)
+ fixed_arg = toUnicode(arg)
else:
try:
- fixed_arg = u"error " + fixStupidEncodings(str(arg), True)
+ fixed_arg = u"error " + toUnicode(str(arg), True)
except:
fixed_arg = None
diff --git a/sickbeard/failed_history.py b/sickbeard/failed_history.py
index 32c3a8cd..0fc1484b 100644
--- a/sickbeard/failed_history.py
+++ b/sickbeard/failed_history.py
@@ -26,7 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
from sickbeard.history import dateFormat
from sickbeard.common import Quality
from sickbeard.common import WANTED, FAILED
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
def prepareFailedName(release):
"""Standardizes release name for failed DB"""
@@ -36,7 +36,7 @@ def prepareFailedName(release):
fixed = fixed.rpartition(".")[0]
fixed = re.sub("[\.\-\+\ ]", "_", fixed)
- fixed = fixStupidEncodings(fixed)
+ fixed = toUnicode(fixed)
return fixed
diff --git a/sickbeard/history.py b/sickbeard/history.py
index 45f15f96..cb1a8486 100644
--- a/sickbeard/history.py
+++ b/sickbeard/history.py
@@ -20,7 +20,7 @@ import db
import datetime
from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
dateFormat = "%Y%m%d%H%M%S"
@@ -28,7 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"
def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
logDate = datetime.datetime.today().strftime(dateFormat)
- resource = fixStupidEncodings(resource)
+ resource = toUnicode(resource)
myDB = db.DBConnection()
myDB.action(
diff --git a/sickbeard/notifiers/emailnotify.py b/sickbeard/notifiers/emailnotify.py
index ff412f2c..1dac6758 100644
--- a/sickbeard/notifiers/emailnotify.py
+++ b/sickbeard/notifiers/emailnotify.py
@@ -29,7 +29,7 @@ import sickbeard
from sickbeard import logger, common
from sickbeard import db
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
from sickbeard.exceptions import ex
@@ -51,7 +51,7 @@ class EmailNotifier:
ep_name: The name of the episode that was snatched
title: The title of the notification (optional)
"""
- ep_name = fixStupidEncodings(ep_name)
+ ep_name = toUnicode(ep_name)
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
show = self._parseEp(ep_name)
@@ -86,7 +86,7 @@ class EmailNotifier:
ep_name: The name of the episode that was downloaded
title: The title of the notification (optional)
"""
- ep_name = fixStupidEncodings(ep_name)
+ ep_name = toUnicode(ep_name)
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
show = self._parseEp(ep_name)
@@ -121,7 +121,7 @@ class EmailNotifier:
ep_name: The name of the episode that was downloaded
lang: Subtitle language wanted
"""
- ep_name = fixStupidEncodings(ep_name)
+ ep_name = toUnicode(ep_name)
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
show = self._parseEp(ep_name)
@@ -198,7 +198,7 @@ class EmailNotifier:
return False
def _parseEp(self, ep_name):
- ep_name = fixStupidEncodings(ep_name)
+ ep_name = toUnicode(ep_name)
sep = " - "
titles = ep_name.split(sep)
diff --git a/sickbeard/notifiers/plex.py b/sickbeard/notifiers/plex.py
index f3d51c9f..3d9f8717 100644
--- a/sickbeard/notifiers/plex.py
+++ b/sickbeard/notifiers/plex.py
@@ -25,7 +25,7 @@ import sickbeard
from sickbeard import logger
from sickbeard import common
from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
from sickbeard.notifiers.xbmc import XBMCNotifier
diff --git a/sickbeard/notifiers/xbmc.py b/sickbeard/notifiers/xbmc.py
index 6feca826..49683bba 100644
--- a/sickbeard/notifiers/xbmc.py
+++ b/sickbeard/notifiers/xbmc.py
@@ -26,7 +26,7 @@ import sickbeard
from sickbeard import logger
from sickbeard import common
from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
try:
@@ -236,9 +236,9 @@ class XBMCNotifier:
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)
- logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+ logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
else:
- logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+ logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)
response = urllib2.urlopen(req)
result = response.read().decode(sickbeard.SYS_ENCODING)
@@ -248,7 +248,7 @@ class XBMCNotifier:
return result
except (urllib2.URLError, IOError), e:
- logger.log(u"Warning: Couldn't contact XBMC HTTP at " + fixStupidEncodings(url) + " " + ex(e),
+ logger.log(u"Warning: Couldn't contact XBMC HTTP at " + toUnicode(url) + " " + ex(e),
logger.WARNING)
return False
@@ -379,9 +379,9 @@ class XBMCNotifier:
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)
- logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+ logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
else:
- logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+ logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)
try:
response = urllib2.urlopen(req)
@@ -401,7 +401,7 @@ class XBMCNotifier:
return False
except IOError, e:
- logger.log(u"Warning: Couldn't contact XBMC JSON API at " + fixStupidEncodings(url) + " " + ex(e),
+ logger.log(u"Warning: Couldn't contact XBMC JSON API at " + toUnicode(url) + " " + ex(e),
logger.WARNING)
return False
diff --git a/sickbeard/nzbSplitter.py b/sickbeard/nzbSplitter.py
index 6b60c20c..39d1df63 100644
--- a/sickbeard/nzbSplitter.py
+++ b/sickbeard/nzbSplitter.py
@@ -29,7 +29,7 @@ from sickbeard import encodingKludge as ek
from sickbeard.exceptions import ex
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
def getSeasonNZBs(name, urlData, season):
@@ -85,7 +85,7 @@ def createNZBString(fileElements, xmlns):
for curFile in fileElements:
rootElement.append(stripNS(curFile, xmlns))
- return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))
+ return toUnicode(xml.etree.ElementTree.tostring(rootElement))
def saveNZB(nzbName, nzbString):
diff --git a/sickbeard/scene_exceptions.py b/sickbeard/scene_exceptions.py
index 7ca5f977..f4154449 100644
--- a/sickbeard/scene_exceptions.py
+++ b/sickbeard/scene_exceptions.py
@@ -27,7 +27,7 @@ from sickbeard import helpers
from sickbeard import name_cache
from sickbeard import logger
from sickbeard import db
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
exception_dict = {}
anidb_exception_dict = {}
@@ -234,7 +234,7 @@ def retrieve_exceptions():
# if this exception isn't already in the DB then add it
if cur_exception not in existing_exceptions:
- cur_exception = fixStupidEncodings(cur_exception)
+ cur_exception = toUnicode(cur_exception)
myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
[cur_indexer_id, cur_exception, curSeason])
@@ -267,7 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
exceptionsCache[indexer_id][season] = scene_exceptions
for cur_exception in scene_exceptions:
- cur_exception = fixStupidEncodings(cur_exception)
+ cur_exception = toUnicode(cur_exception)
myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
[indexer_id, cur_exception, season])
diff --git a/sickbeard/show_name_helpers.py b/sickbeard/show_name_helpers.py
index e408d535..736cbf42 100644
--- a/sickbeard/show_name_helpers.py
+++ b/sickbeard/show_name_helpers.py
@@ -234,7 +234,7 @@ def isGoodResult(name, show, log=True, season=-1):
all_show_names = allPossibleShowNames(show, season=season)
showNames = map(sanitizeSceneName, all_show_names) + all_show_names
- showNames += map(unidecode, all_show_names)
+ showNames += map(ek.toUnicode, all_show_names)
for curName in set(showNames):
if not show.is_anime:
diff --git a/sickbeard/tvcache.py b/sickbeard/tvcache.py
index 01d3453e..dec8280d 100644
--- a/sickbeard/tvcache.py
+++ b/sickbeard/tvcache.py
@@ -33,7 +33,7 @@ from sickbeard.exceptions import AuthException
from sickbeard.rssfeeds import RSSFeeds
from sickbeard import clients
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode
class CacheDBConnection(db.DBConnection):
def __init__(self, providerName):
@@ -263,7 +263,7 @@ class TVCache():
# get quality of release
quality = parse_result.quality
- name = fixStupidEncodings(name)
+ name = toUnicode(name)
# get release group
release_group = parse_result.release_group
diff --git a/sickbeard/webserve.py b/sickbeard/webserve.py
index bfc73f49..08277811 100644
--- a/sickbeard/webserve.py
+++ b/sickbeard/webserve.py
@@ -3288,7 +3288,7 @@ class ErrorLogs(MainHandler):
for x in reversed(data):
- x = ek.fixStupidEncodings(x)
+ x = ek.toUnicode(x)
match = re.match(regex, x)
if match:
diff --git a/tests/all_tests.py b/tests/all_tests.py
index dfe63c9e..28bdeb63 100644
--- a/tests/all_tests.py
+++ b/tests/all_tests.py
@@ -18,23 +18,27 @@
# You should have received a copy of the GNU General Public License
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.
+import glob
+import unittest
+import sys
+
+class AllTests(unittest.TestCase):
+ def setUp(self):
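+ # Collect every sibling *_tests.py module into a single test suite.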
+ self.test_file_strings = [x for x in glob.glob('*_tests.py') if x not in __file__]
+ self.module_strings = [file_string[0:len(file_string) - 3] for file_string in self.test_file_strings]
+ self.suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in self.module_strings]
+ self.testSuite = unittest.TestSuite(self.suites)
+
+ def testAll(self):
+ print "=================="
+ print "STARTING - ALL TESTS"
+ print "=================="
+ for includedfiles in self.test_file_strings:
+ print "- " + includedfiles
+
+ text_runner = unittest.TextTestRunner().run(self.testSuite)
+ if not text_runner.wasSuccessful():
+ sys.exit(-1)
+
if __name__ == "__main__":
- import glob
- import unittest
- import sys
-
- test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
- module_strings = [file_string[0:len(file_string) - 3] for file_string in test_file_strings]
- suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in module_strings]
- testSuite = unittest.TestSuite(suites)
-
- print "=================="
- print "STARTING - ALL TESTS"
- print "=================="
- print "this will include"
- for includedfiles in test_file_strings:
- print "- " + includedfiles
-
- text_runner = unittest.TextTestRunner().run(testSuite)
- if not text_runner.wasSuccessful():
- sys.exit(-1)
+ unittest.main()
\ No newline at end of file
diff --git a/tests/common_tests.py b/tests/common_tests.py
index de620965..19b4632e 100644
--- a/tests/common_tests.py
+++ b/tests/common_tests.py
@@ -8,7 +8,6 @@ sys.path.append(os.path.abspath('../lib'))
from sickbeard import common
-
class QualityTests(unittest.TestCase):
# TODO: repack / proper ? air-by-date ? season rip? multi-ep?
diff --git a/tests/test_lib.py b/tests/test_lib.py
index 7f956b67..201d2182 100644
--- a/tests/test_lib.py
+++ b/tests/test_lib.py
@@ -51,7 +51,6 @@ EPISODE = 2
FILENAME = u"show name - s0" + str(SEASON) + "e0" + str(EPISODE) + ".mkv"
FILEDIR = os.path.join(TESTDIR, SHOWNAME)
FILEPATH = os.path.join(FILEDIR, FILENAME)
-
SHOWDIR = os.path.join(TESTDIR, SHOWNAME + " final")
#sickbeard.logger.sb_log_instance = sickbeard.logger.SBRotatingLogHandler(os.path.join(TESTDIR, 'sickbeard.log'), sickbeard.logger.NUM_LOGS, sickbeard.logger.LOG_SIZE)