From 360c3afa08ffd6683a8b050e67b18adbb74a71af Mon Sep 17 00:00:00 2001 From: echel0n Date: Tue, 25 Nov 2014 17:22:31 -0800 Subject: [PATCH] Removed FTFY, python 2.6 compatibility issues. Re-coded encodingKludge encode/decode for unicode <-> utf-8 --- lib/ftfy/__init__.py | 351 ------------------ lib/ftfy/bad_codecs/__init__.py | 94 ----- lib/ftfy/bad_codecs/sloppy.py | 156 -------- lib/ftfy/bad_codecs/utf8_variants.py | 281 -------------- lib/ftfy/badness.py | 144 -------- lib/ftfy/build_data.py | 111 ------ lib/ftfy/char_classes.dat | Bin 3568 -> 0 bytes lib/ftfy/chardata.py | 81 ---- lib/ftfy/cli.py | 34 -- lib/ftfy/compatibility.py | 79 ---- lib/ftfy/fixes.py | 473 ------------------------ lib/ftfy/streamtester/__init__.py | 39 -- lib/ftfy/streamtester/oauth.py | 73 ---- lib/ftfy/streamtester/twitter_tester.py | 89 ----- sickbeard/encodingKludge.py | 68 ++-- sickbeard/exceptions.py | 6 +- sickbeard/failed_history.py | 4 +- sickbeard/history.py | 4 +- sickbeard/notifiers/emailnotify.py | 10 +- sickbeard/notifiers/plex.py | 2 +- sickbeard/notifiers/xbmc.py | 14 +- sickbeard/nzbSplitter.py | 4 +- sickbeard/scene_exceptions.py | 6 +- sickbeard/show_name_helpers.py | 2 +- sickbeard/tvcache.py | 4 +- sickbeard/webserve.py | 2 +- tests/all_tests.py | 42 ++- tests/common_tests.py | 1 - tests/test_lib.py | 1 - 29 files changed, 95 insertions(+), 2080 deletions(-) delete mode 100644 lib/ftfy/__init__.py delete mode 100644 lib/ftfy/bad_codecs/__init__.py delete mode 100644 lib/ftfy/bad_codecs/sloppy.py delete mode 100644 lib/ftfy/bad_codecs/utf8_variants.py delete mode 100644 lib/ftfy/badness.py delete mode 100644 lib/ftfy/build_data.py delete mode 100644 lib/ftfy/char_classes.dat delete mode 100644 lib/ftfy/chardata.py delete mode 100644 lib/ftfy/cli.py delete mode 100644 lib/ftfy/compatibility.py delete mode 100644 lib/ftfy/fixes.py delete mode 100644 lib/ftfy/streamtester/__init__.py delete mode 100644 lib/ftfy/streamtester/oauth.py delete mode 100644 lib/ftfy/streamtester/twitter_tester.py diff --git a/lib/ftfy/__init__.py b/lib/ftfy/__init__.py deleted file mode 100644 index 2887c5b9..00000000 --- a/lib/ftfy/__init__.py +++ /dev/null @@ -1,351 +0,0 @@ -# -*- coding: utf-8 -*- -""" -ftfy: fixes text for you - -This is a module for making text less broken. See the `fix_text` function -for more information. -""" - -from __future__ import unicode_literals - -# See the docstring for ftfy.bad_codecs to see what we're doing here. -import ftfy.bad_codecs -ftfy.bad_codecs.ok() - -from ftfy import fixes -from ftfy.fixes import fix_text_encoding -from ftfy.compatibility import PYTHON34_OR_LATER, is_printable -import unicodedata -import warnings - - -def fix_text(text, - remove_unsafe_private_use=(not PYTHON34_OR_LATER), - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - normalization='NFKC', - uncurl_quotes=True, - fix_line_breaks=True, - remove_control_chars=True, - remove_bom=True, - max_decode_length=2**16): - r""" - Given Unicode text as input, make its representation consistent and - possibly less broken. - - Let's start with some examples: - - >>> print(fix_text('ünicode')) - ünicode - - >>> print(fix_text('Broken text… it’s flubberific!')) - Broken text... it's flubberific! - - >>> print(fix_text('HTML entities <3')) - HTML entities <3 - - >>> print(fix_text('HTML entities <3')) - HTML entities <3 - - >>> print(fix_text('\001\033[36;44mI’m blue, da ba dee da ba ' - ... 'doo…\033[0m')) - I'm blue, da ba dee da ba doo... - - >>> # This example string starts with a byte-order mark, even if - >>> # you can't see it on the Web. - >>> print(fix_text('\ufeffParty like\nit’s 1999!')) - Party like - it's 1999! - - >>> len(fix_text('fi' * 100000)) - 200000 - - >>> len(fix_text('')) - 0 - - Based on the options you provide, ftfy applies these steps in order: - - - If `remove_unsafe_private_use` is True, it removes a range of private-use - characters that could trigger a Python bug. The bug is fixed in - the most recent versions of Python, so this will default to False - starting on Python 3.4. - - If `fix_entities` is True, replace HTML entities with their equivalent - characters. If it's "auto" (the default), then consider replacing HTML - entities, but don't do so in text where you have seen a pair of actual - angle brackets (that's probably actually HTML and you shouldn't mess - with the entities). - - If `remove_terminal_escapes` is True, remove sequences of bytes that are - instructions for Unix terminals, such as the codes that make text appear - in different colors. - - If `fix_encoding` is True, look for common mistakes that come from - encoding or decoding Unicode text incorrectly, and fix them if they are - reasonably fixable. See `fix_text_encoding` for details. - - If `normalization` is not None, apply the specified form of Unicode - normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'. - The default, 'NFKC', applies the following relevant transformations: - - - C: Combine characters and diacritics that are written using separate - code points, such as converting "e" plus an acute accent modifier - into "é", or converting "ka" (か) plus a dakuten into the - single character "ga" (が). - - K: Replace characters that are functionally equivalent with the most - common form. For example, half-width katakana will be replaced with - full-width versions, full-width Roman characters will be replaced with - ASCII characters, ellipsis characters will be replaced with three - periods, and the ligature 'fl' will be replaced with 'fl'. - - - If `uncurl_quotes` is True, replace various curly quotation marks with - plain-ASCII straight quotes. - - If `fix_line_breaks` is true, convert all line breaks to Unix style - (CRLF and CR line breaks become LF line breaks). - - If `fix_control_characters` is true, remove all C0 control characters - except the common useful ones: TAB, CR, LF, and FF. (CR characters - may have already been removed by the `fix_line_breaks` step.) - - If `remove_bom` is True, remove the Byte-Order Mark if it exists. - - If anything was changed, repeat all the steps, so that the function is - idempotent. "&amp;" will become "&", for example, not "&". - - `fix_text` will work one line at a time, with the possibility that some - lines are in different encodings. When it encounters lines longer than - `max_decode_length`, it will not run the `fix_encoding` step, to avoid - unbounded slowdowns. - - If you are certain your entire text is in the same encoding (though that - encoding is possibly flawed), and do not mind performing operations on - the whole text at once, use `fix_text_segment`. - """ - if isinstance(text, bytes): - raise UnicodeError(fixes.BYTES_ERROR_TEXT) - - out = [] - pos = 0 - while pos < len(text): - textbreak = text.find('\n', pos) + 1 - fix_encoding_this_time = fix_encoding - if textbreak == 0: - textbreak = len(text) - if (textbreak - pos) > max_decode_length: - fix_encoding_this_time = False - - substring = text[pos:textbreak] - - if fix_entities == 'auto' and '<' in substring and '>' in substring: - # we see angle brackets together; this could be HTML - fix_entities = False - - out.append( - fix_text_segment( - substring, - remove_unsafe_private_use=remove_unsafe_private_use, - fix_entities=fix_entities, - remove_terminal_escapes=remove_terminal_escapes, - fix_encoding=fix_encoding_this_time, - normalization=normalization, - uncurl_quotes=uncurl_quotes, - fix_line_breaks=fix_line_breaks, - remove_control_chars=remove_control_chars, - remove_bom=remove_bom - ) - ) - pos = textbreak - - return ''.join(out) - -ftfy = fix_text - - -def fix_file(input_file, - remove_unsafe_private_use=True, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - normalization='NFKC', - uncurl_quotes=True, - fix_line_breaks=True, - remove_control_chars=True, - remove_bom=True): - """ - Fix text that is found in a file. - - If the file is being read as Unicode text, use that. If it's being read as - bytes, then unfortunately, we have to guess what encoding it is. We'll try - a few common encodings, but we make no promises. See the `guess_bytes` - function for how this is done. - - The output is a stream of fixed lines of text. - """ - entities = fix_entities - for line in input_file: - if isinstance(line, bytes): - line, encoding = guess_bytes(line) - if fix_entities == 'auto' and '<' in line and '>' in line: - entities = False - yield fix_text_segment( - line, - remove_unsafe_private_use=remove_unsafe_private_use, - fix_entities=entities, - remove_terminal_escapes=remove_terminal_escapes, - fix_encoding=fix_encoding, - normalization=normalization, - uncurl_quotes=uncurl_quotes, - fix_line_breaks=fix_line_breaks, - remove_control_chars=remove_control_chars, - remove_bom=remove_bom - ) - - -def fix_text_segment(text, - remove_unsafe_private_use=True, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - normalization='NFKC', - uncurl_quotes=True, - fix_line_breaks=True, - remove_control_chars=True, - remove_bom=True): - """ - Apply fixes to text in a single chunk. This could be a line of text - within a larger run of `fix_text`, or it could be a larger amount - of text that you are certain is all in the same encoding. - - See `fix_text` for a description of the parameters. - """ - if isinstance(text, bytes): - raise UnicodeError(fixes.BYTES_ERROR_TEXT) - - if fix_entities == 'auto' and '<' in text and '>' in text: - fix_entities = False - while True: - origtext = text - if remove_unsafe_private_use: - text = fixes.remove_unsafe_private_use(text) - if fix_entities: - text = fixes.unescape_html(text) - if remove_terminal_escapes: - text = fixes.remove_terminal_escapes(text) - if fix_encoding: - text = fixes.fix_text_encoding(text) - if normalization is not None: - text = unicodedata.normalize(normalization, text) - if uncurl_quotes: - text = fixes.uncurl_quotes(text) - if fix_line_breaks: - text = fixes.fix_line_breaks(text) - if remove_control_chars: - text = fixes.remove_control_chars(text) - if remove_bom: - text = fixes.remove_bom(text) - if text == origtext: - return text - - -def guess_bytes(bstring): - """ - If you have some bytes in an unknown encoding, here's a reasonable - strategy for decoding them, by trying a few common encodings that - can be distinguished from each other. - - This is not a magic bullet. If the bytes are coming from some MySQL - database with the "character set" set to ISO Elbonian, this won't figure - it out. Perhaps more relevantly, this currently doesn't try East Asian - encodings. - - The encodings we try are: - - - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks - like nothing else - - UTF-8, because it's the global de facto standard - - "utf-8-variants", because it's what people actually implement when they - think they're doing UTF-8 - - MacRoman, because Microsoft Office thinks it's still a thing, and it - can be distinguished by its line breaks. (If there are no line breaks in - the string, though, you're out of luck.) - - "sloppy-windows-1252", the Latin-1-like encoding that is the most common - single-byte encoding - """ - if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'): - return bstring.decode('utf-16'), 'utf-16' - - byteset = set(bytes(bstring)) - byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n' - - try: - if byte_ed in byteset or byte_c0 in byteset: - # Byte 0xed can be used to encode a range of codepoints that - # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates, - # so when we see 0xed, it's very likely we're being asked to - # decode CESU-8, the variant that encodes UTF-16 surrogates - # instead of the original characters themselves. - # - # This will occasionally trigger on standard UTF-8, as there - # are some Korean characters that also use byte 0xed, but that's - # not harmful. - # - # Byte 0xc0 is impossible because, numerically, it would only - # encode characters lower than U+0040. Those already have - # single-byte representations, and UTF-8 requires using the - # shortest possible representation. However, Java hides the null - # codepoint, U+0000, in a non-standard longer representation -- it - # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00 - # will never appear in the encoded bytes. - # - # The 'utf-8-variants' decoder can handle both of these cases, as - # well as standard UTF-8, at the cost of a bit of speed. - return bstring.decode('utf-8-variants'), 'utf-8-variants' - else: - return bstring.decode('utf-8'), 'utf-8' - except UnicodeDecodeError: - pass - - if byte_CR in bstring and byte_LF not in bstring: - return bstring.decode('macroman'), 'macroman' - else: - return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252' - - -def explain_unicode(text): - """ - A utility method that's useful for debugging mysterious Unicode. - - It breaks down a string, showing you for each codepoint its number in - hexadecimal, its glyph, its category in the Unicode standard, and its name - in the Unicode standard. - - >>> explain_unicode('(╯°□°)╯︵ ┻━┻') - U+0028 ( [Ps] LEFT PARENTHESIS - U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT - U+00B0 ° [So] DEGREE SIGN - U+25A1 □ [So] WHITE SQUARE - U+00B0 ° [So] DEGREE SIGN - U+0029 ) [Pe] RIGHT PARENTHESIS - U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT - U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS - U+0020 [Zs] SPACE - U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL - U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL - U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL - """ - for char in text: - if is_printable(char): - display = char - else: - display = char.encode('unicode-escape').decode('ascii') - print('U+{code:04X} {display:<7} [{category}] {name}'.format( - display=display, - code=ord(char), - category=unicodedata.category(char), - name=unicodedata.name(char, '') - )) - - -def fix_bad_encoding(text): - """ - Kept for compatibility with previous versions of ftfy. - """ - warnings.warn( - 'fix_bad_encoding is now known as fix_text_encoding', - DeprecationWarning - ) - return fix_text_encoding(text) diff --git a/lib/ftfy/bad_codecs/__init__.py b/lib/ftfy/bad_codecs/__init__.py deleted file mode 100644 index 0984bd52..00000000 --- a/lib/ftfy/bad_codecs/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -# coding: utf-8 -r""" -Give Python the ability to decode some common, flawed encodings. - -Python does not want you to be sloppy with your text. Its encoders and decoders -("codecs") follow the relevant standards whenever possible, which means that -when you get text that *doesn't* follow those standards, you'll probably fail -to decode it. Or you might succeed at decoding it for implementation-specific -reasons, which is perhaps worse. - -There are some encodings out there that Python wishes didn't exist, which are -widely used outside of Python: - -- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the - ever-popular CESU-8 and "Java modified UTF-8". -- "Sloppy" versions of character map encodings, where bytes that don't map to - anything will instead map to the Unicode character with the same number. - -Simply importing this module, or in fact any part of the `ftfy` package, will -make these new "bad codecs" available to Python through the standard Codecs -API. You never have to actually call any functions inside `ftfy.bad_codecs`. - -However, if you want to call something because your code checker insists on it, -you can call ``ftfy.bad_codecs.ok()``. - -A quick example of decoding text that's encoded in CESU-8: - - >>> import ftfy.bad_codecs - >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants')) - 😍 -""" -from __future__ import unicode_literals -from encodings import normalize_encoding -import codecs - -_CACHE = {} - -# Define some aliases for 'utf-8-variants'. All hyphens get turned into -# underscores, because of `normalize_encoding`. -UTF8_VAR_NAMES = ( - 'utf_8_variants', 'utf8_variants', - 'utf_8_variant', 'utf8_variant', - 'utf_8_var', 'utf8_var', - 'cesu_8', 'cesu8', - 'java_utf_8', 'java_utf8' -) - - -def search_function(encoding): - """ - Register our "bad codecs" with Python's codecs API. This involves adding - a search function that takes in an encoding name, and returns a codec - for that encoding if it knows one, or None if it doesn't. - - The encodings this will match are: - - - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N', - where the non-sloppy version is an encoding that leaves some bytes - unmapped to characters. - - The 'utf-8-variants' encoding, which has the several aliases seen - above. - """ - if encoding in _CACHE: - return _CACHE[encoding] - - norm_encoding = normalize_encoding(encoding) - codec = None - if norm_encoding in UTF8_VAR_NAMES: - from ftfy.bad_codecs.utf8_variants import CODEC_INFO - codec = CODEC_INFO - elif norm_encoding.startswith('sloppy_'): - from ftfy.bad_codecs.sloppy import CODECS - codec = CODECS.get(norm_encoding) - - if codec is not None: - _CACHE[encoding] = codec - - return codec - - -def ok(): - """ - A feel-good function that gives you something to call after importing - this package. - - Why is this here? Pyflakes. Pyflakes gets upset when you import a module - and appear not to use it. It doesn't know that you're using it when - you use the ``unicode.encode`` and ``bytes.decode`` methods with certain - encodings. - """ - pass - - -codecs.register(search_function) diff --git a/lib/ftfy/bad_codecs/sloppy.py b/lib/ftfy/bad_codecs/sloppy.py deleted file mode 100644 index adca2213..00000000 --- a/lib/ftfy/bad_codecs/sloppy.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding: utf-8 -r""" -Decodes single-byte encodings, filling their "holes" in the same messy way that -everyone else does. - -A single-byte encoding maps each byte to a Unicode character, except that some -bytes are left unmapped. In the commonly-used Windows-1252 encoding, for -example, bytes 0x81 and 0x8D, among others, have no meaning. - -Python, wanting to preserve some sense of decorum, will handle these bytes -as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're -different from each other. It just hasn't defined what they are in terms of -Unicode. - -Software that has to interoperate with Windows-1252 and Unicode -- such as all -the common Web browsers -- will pick some Unicode characters for them to map -to, and the characters they pick are the Unicode characters with the same -numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the -resulting characters tend to fall into a range of Unicode that's set aside for -obselete Latin-1 control characters anyway. - -These sloppy codecs let Python do the same thing, thus interoperating with -other software that works this way. It defines a sloppy version of many -single-byte encodings with holes. (There is no need for a sloppy version of -an encoding without holes: for example, there is no such thing as -sloppy-iso-8859-2 or sloppy-macroman.) - -The following encodings will become defined: - -- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2) -- sloppy-windows-1251 (Cyrillic) -- sloppy-windows-1252 (Western European, based on Latin-1) -- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7) -- sloppy-windows-1254 (Turkish, based on ISO-8859-9) -- sloppy-windows-1255 (Hebrew, based on ISO-8859-8) -- sloppy-windows-1256 (Arabic) -- sloppy-windows-1257 (Baltic, based on ISO-8859-13) -- sloppy-windows-1258 (Vietnamese) -- sloppy-cp874 (Thai, based on ISO-8859-11) -- sloppy-iso-8859-3 (Maltese and Esperanto, I guess) -- sloppy-iso-8859-6 (different Arabic) -- sloppy-iso-8859-7 (Greek) -- sloppy-iso-8859-8 (Hebrew) -- sloppy-iso-8859-11 (Thai) - -Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be -defined. - -Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy; -the rest are rather uncommon. - -Here are some examples, using `ftfy.explain_unicode` to illustrate how -sloppy-windows-1252 merges Windows-1252 with Latin-1: - - >>> from ftfy import explain_unicode - >>> some_bytes = b'\x80\x81\x82' - >>> explain_unicode(some_bytes.decode('latin-1')) - U+0080 \x80 [Cc] - U+0081 \x81 [Cc] - U+0082 \x82 [Cc] - - >>> explain_unicode(some_bytes.decode('windows-1252', 'replace')) - U+20AC € [Sc] EURO SIGN - U+FFFD � [So] REPLACEMENT CHARACTER - U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK - - >>> explain_unicode(some_bytes.decode('sloppy-windows-1252')) - U+20AC € [Sc] EURO SIGN - U+0081 \x81 [Cc] - U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK -""" -from __future__ import unicode_literals -import codecs -from encodings import normalize_encoding - -REPLACEMENT_CHAR = '\ufffd' - - -def make_sloppy_codec(encoding): - """ - Take a codec name, and return a 'sloppy' version of that codec that can - encode and decode the unassigned bytes in that encoding. - - Single-byte encodings in the standard library are defined using some - boilerplate classes surrounding the functions that do the actual work, - `codecs.charmap_decode` and `charmap_encode`. This function, given an - encoding name, *defines* those boilerplate classes. - """ - # Make an array of all 256 possible bytes. - all_bytes = bytearray(range(256)) - - # Get a list of what they would decode to in Latin-1. - sloppy_chars = list(all_bytes.decode('latin-1')) - - # Get a list of what they decode to in the given encoding. Use the - # replacement character for unassigned bytes. - decoded_chars = all_bytes.decode(encoding, 'replace') - - # Update the sloppy_chars list. Each byte that was successfully decoded - # gets its decoded value in the list. The unassigned bytes are left as - # they are, which gives their decoding in Latin-1. - for i, char in enumerate(decoded_chars): - if char != REPLACEMENT_CHAR: - sloppy_chars[i] = char - - # Create the data structures that tell the charmap methods how to encode - # and decode in this sloppy encoding. - decoding_table = ''.join(sloppy_chars) - encoding_table = codecs.charmap_build(decoding_table) - - # Now produce all the class boilerplate. Look at the Python source for - # `encodings.cp1252` for comparison; this is almost exactly the same, - # except I made it follow pep8. - class Codec(codecs.Codec): - def encode(self, input, errors='strict'): - return codecs.charmap_encode(input, errors, encoding_table) - - def decode(self, input, errors='strict'): - return codecs.charmap_decode(input, errors, decoding_table) - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return codecs.charmap_encode(input, self.errors, encoding_table)[0] - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return codecs.charmap_decode(input, self.errors, decoding_table)[0] - - class StreamWriter(Codec, codecs.StreamWriter): - pass - - class StreamReader(Codec, codecs.StreamReader): - pass - - return codecs.CodecInfo( - name='sloppy-' + encoding, - encode=Codec().encode, - decode=Codec().decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamreader=StreamReader, - streamwriter=StreamWriter, - ) - -# Define a codec for each incomplete encoding. The resulting CODECS dictionary -# can be used by the main module of ftfy.bad_codecs. -CODECS = {} -INCOMPLETE_ENCODINGS = ( - ['windows-%s' % num for num in range(1250, 1259)] + - ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] + - ['cp%s' % num for num in range(1250, 1259)] + ['cp874'] -) - -for _encoding in INCOMPLETE_ENCODINGS: - _new_name = normalize_encoding('sloppy-' + _encoding) - CODECS[_new_name] = make_sloppy_codec(_encoding) diff --git a/lib/ftfy/bad_codecs/utf8_variants.py b/lib/ftfy/bad_codecs/utf8_variants.py deleted file mode 100644 index 565cb2b4..00000000 --- a/lib/ftfy/bad_codecs/utf8_variants.py +++ /dev/null @@ -1,281 +0,0 @@ -r""" -This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can -decode text that's been encoded with a popular non-standard version of UTF-8. -This includes CESU-8, the accidental encoding made by layering UTF-8 on top of -UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for -codepoint 0. - -This is particularly relevant in Python 3, which provides no other way of -decoding CESU-8 or Java's encoding. [1] - -The easiest way to use the codec is to simply import `ftfy.bad_codecs`: - - >>> import ftfy.bad_codecs - >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var') - >>> print(repr(result).lstrip('u')) - 'here comes a null! \x00' - -The codec does not at all enforce "correct" CESU-8. For example, the Unicode -Consortium's not-quite-standard describing CESU-8 requires that there is only -one possible encoding of any character, so it does not allow mixing of valid -UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8 -decoder does. - -Characters in the Basic Multilingual Plane still have only one encoding. This -codec still enforces the rule, within the BMP, that characters must appear in -their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`, -instead of just `0x00`, may be used to encode the null character `U+0000`, like -in Java. - -If you encode with this codec, you get legitimate UTF-8. Decoding with this -codec and then re-encoding is not idempotent, although encoding and then -decoding is. So this module won't produce CESU-8 for you. Look for that -functionality in the sister module, "Breaks Text For You", coming approximately -never. - -[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first -decode the bytes (incorrectly), then encode them, then decode them again, using -UTF-8 as the codec every time. -""" - -from __future__ import unicode_literals -from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2 -from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder, - IncrementalEncoder as UTF8IncrementalEncoder) -import re -import codecs - -NAME = 'utf-8-variants' -# This regular expression matches all possible six-byte CESU-8 sequences. -CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]') - - -class IncrementalDecoder(UTF8IncrementalDecoder): - """ - An incremental decoder that extends Python's built-in UTF-8 decoder. - - This encoder needs to take in bytes, possibly arriving in a stream, and - output the correctly decoded text. The general strategy for doing this - is to fall back on the real UTF-8 decoder whenever possible, because - the real UTF-8 decoder is way optimized, but to call specialized methods - we define here for the cases the real encoder isn't expecting. - """ - def _buffer_decode(self, input, errors, final): - """ - Decode bytes that may be arriving in a stream, following the Codecs - API. - - `input` is the incoming sequence of bytes. `errors` tells us how to - handle errors, though we delegate all error-handling cases to the real - UTF-8 decoder to ensure correct behavior. `final` indicates whether - this is the end of the sequence, in which case we should raise an - error given incomplete input. - - Returns as much decoded text as possible, and the number of bytes - consumed. - """ - # decoded_segments are the pieces of text we have decoded so far, - # and position is our current position in the byte string. (Bytes - # before this position have been consumed, and bytes after it have - # yet to be decoded.) - decoded_segments = [] - position = 0 - while True: - # Use _buffer_decode_step to decode a segment of text. - decoded, consumed = self._buffer_decode_step( - input[position:], - errors, - final - ) - if consumed == 0: - # Either there's nothing left to decode, or we need to wait - # for more input. Either way, we're done for now. - break - - # Append the decoded text to the list, and update our position. - decoded_segments.append(decoded) - position += consumed - - if final: - # _buffer_decode_step must consume all the bytes when `final` is - # true. - assert position == len(input) - - return ''.join(decoded_segments), position - - def _buffer_decode_step(self, input, errors, final): - """ - There are three possibilities for each decoding step: - - - Decode as much real UTF-8 as possible. - - Decode a six-byte CESU-8 sequence at the current position. - - Decode a Java-style null at the current position. - - This method figures out which step is appropriate, and does it. - """ - # Get a reference to the superclass method that we'll be using for - # most of the real work. - sup = UTF8IncrementalDecoder._buffer_decode - - # Find the next byte position that indicates a variant of UTF-8. - # CESU-8 sequences always start with 0xed, and Java nulls always - # start with 0xc0, both of which are conveniently impossible in - # real UTF-8. - cutoff1 = input.find(b'\xed') - cutoff2 = input.find(b'\xc0') - - # Set `cutoff` to whichever cutoff comes first. - if cutoff1 != -1 and cutoff2 != -1: - cutoff = min(cutoff1, cutoff2) - elif cutoff1 != -1: - cutoff = cutoff1 - elif cutoff2 != -1: - cutoff = cutoff2 - else: - # The entire input can be decoded as UTF-8, so just do so. - return sup(input, errors, final) - - if cutoff1 == 0: - # Decode a possible six-byte sequence starting with 0xed. - return self._buffer_decode_surrogates(sup, input, errors, final) - elif cutoff2 == 0: - # Decode a possible two-byte sequence, 0xc0 0x80. - return self._buffer_decode_null(sup, input, errors, final) - else: - # Decode the bytes up until the next weird thing as UTF-8. - # Set final=True because 0xc0 and 0xed don't make sense in the - # middle of a sequence, in any variant. - return sup(input[:cutoff], errors, True) - - @staticmethod - def _buffer_decode_null(sup, input, errors, final): - """ - Decode the bytes 0xc0 0x80 as U+0000, like Java does. - """ - nextbyte = input[1:2] - if nextbyte == b'': - if final: - # We found 0xc0 at the end of the stream, which is an error. - # Delegate to the superclass method to handle that error. - return sup(input, errors, final) - else: - # We found 0xc0 and we don't know what comes next, so consume - # no bytes and wait. - return '', 0 - elif nextbyte == b'\x80': - # We found the usual 0xc0 0x80 sequence, so decode it and consume - # two bytes. - return '\u0000', 2 - else: - # We found 0xc0 followed by something else, which is an error. - # Whatever should happen is equivalent to what happens when the - # superclass is given just the byte 0xc0, with final=True. - return sup(b'\xc0', errors, True) - - @staticmethod - def _buffer_decode_surrogates(sup, input, errors, final): - """ - When we have improperly encoded surrogates, we can still see the - bits that they were meant to represent. - - The surrogates were meant to encode a 20-bit number, to which we - add 0x10000 to get a codepoint. That 20-bit number now appears in - this form: - - 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst - - The CESU8_RE above matches byte sequences of this form. Then we need - to extract the bits and assemble a codepoint number from them. - """ - if len(input) < 6: - if final: - # We found 0xed near the end of the stream, and there aren't - # six bytes to decode. Delegate to the superclass method to - # handle it as normal UTF-8. It might be a Hangul character - # or an error. - if PYTHON2 and len(input) >= 3: - # We can't trust Python 2 to raise an error when it's - # asked to decode a surrogate, so let's force the issue. - input = mangle_surrogates(input) - return sup(input, errors, final) - else: - # We found 0xed, the stream isn't over yet, and we don't know - # enough of the following bytes to decode anything, so consume - # zero bytes and wait. - return '', 0 - else: - if CESU8_RE.match(input): - # If this is a CESU-8 sequence, do some math to pull out - # the intended 20-bit value, and consume six bytes. - bytenums = bytes_to_ints(input[:6]) - codepoint = ( - ((bytenums[1] & 0x0f) << 16) + - ((bytenums[2] & 0x3f) << 10) + - ((bytenums[4] & 0x0f) << 6) + - (bytenums[5] & 0x3f) + - 0x10000 - ) - return unichr(codepoint), 6 - else: - # This looked like a CESU-8 sequence, but it wasn't one. - # 0xed indicates the start of a three-byte sequence, so give - # three bytes to the superclass to decode as usual -- except - # for working around the Python 2 discrepancy as before. - if PYTHON2: - input = mangle_surrogates(input) - return sup(input[:3], errors, False) - - -def mangle_surrogates(bytestring): - """ - When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats - it as an error (which it is). In 'replace' mode, it will decode as three - replacement characters. But Python 2 will just output the surrogate - codepoint. - - To ensure consistency between Python 2 and Python 3, and protect downstream - applications from malformed strings, we turn surrogate sequences at the - start of the string into the bytes `ff ff ff`, which we're *sure* won't - decode, and which turn into three replacement characters in 'replace' mode. - """ - if PYTHON2: - if bytestring.startswith(b'\xed') and len(bytestring) >= 3: - decoded = bytestring[:3].decode('utf-8', 'replace') - if '\ud800' <= decoded <= '\udfff': - return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:]) - return bytestring - else: - # On Python 3, nothing needs to be done. - return bytestring - -# The encoder is identical to UTF-8. -IncrementalEncoder = UTF8IncrementalEncoder - - -# Everything below here is boilerplate that matches the modules in the -# built-in `encodings` package. -def encode(input, errors='strict'): - return IncrementalEncoder(errors).encode(input, final=True), len(input) - - -def decode(input, errors='strict'): - return IncrementalDecoder(errors).decode(input, final=True), len(input) - - -class StreamWriter(codecs.StreamWriter): - encode = encode - - -class StreamReader(codecs.StreamReader): - decode = decode - - -CODEC_INFO = codecs.CodecInfo( - name=NAME, - encode=encode, - decode=decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamreader=StreamReader, - streamwriter=StreamWriter, -) diff --git a/lib/ftfy/badness.py b/lib/ftfy/badness.py deleted file mode 100644 index f94fc552..00000000 --- a/lib/ftfy/badness.py +++ /dev/null @@ -1,144 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Heuristics to determine whether re-encoding text is actually making it -more reasonable. -""" - -from __future__ import unicode_literals -from ftfy.chardata import chars_to_classes -import re -import unicodedata - -# The following regex uses the mapping of character classes to ASCII -# characters defined in chardata.py and build_data.py: -# -# L = Latin capital letter -# l = Latin lowercase letter -# A = Non-latin capital or title-case letter -# a = Non-latin lowercase letter -# C = Non-cased letter (Lo) -# X = Control character (Cc) -# m = Letter modifier (Lm) -# M = Mark (Mc, Me, Mn) -# N = Miscellaneous numbers (No) -# 0 = Math symbol (Sm) -# 1 = Currency symbol (Sc) -# 2 = Symbol modifier (Sk) -# 3 = Other symbol (So) -# S = UTF-16 surrogate -# _ = Unassigned character -# = Whitespace -# o = Other - - -def _make_weirdness_regex(): - """ - Creates a list of regexes that match 'weird' character sequences. - The more matches there are, the weirder the text is. - """ - groups = [] - - # Match lowercase letters that are followed by non-ASCII uppercase letters - groups.append('lA') - - # Match diacritical marks, except when they modify a non-cased letter or - # another mark. - # - # You wouldn't put a diacritical mark on a digit or a space, for example. - # You might put it on a Latin letter, but in that case there will almost - # always be a pre-composed version, and we normalize to pre-composed - # versions first. The cases that can't be pre-composed tend to be in - # large scripts without case, which are in class C. - groups.append('[^CM]M') - - # Match non-Latin characters adjacent to Latin characters. - # - # This is a simplification from ftfy version 2, which compared all - # adjacent scripts. However, the ambiguities we need to resolve come from - # encodings designed to represent Latin characters. - groups.append('[Ll][AaC]') - groups.append('[AaC][Ll]') - - # Match C1 control characters, which are almost always the result of - # decoding Latin-1 that was meant to be Windows-1252. - groups.append('X') - - # Match private use and unassigned characters. - groups.append('P') - groups.append('_') - - # Match adjacent characters from any different pair of these categories: - # - Modifier marks (M) - # - Letter modifiers (m) - # - Miscellaneous numbers (N) - # - Symbols (0123) - - exclusive_categories = 'MmN0123' - for cat1 in exclusive_categories: - others_range = ''.join(c for c in exclusive_categories if c != cat1) - groups.append('{cat1}[{others_range}]'.format( - cat1=cat1, others_range=others_range - )) - regex = '|'.join('({0})'.format(group) for group in groups) - return re.compile(regex) - -WEIRDNESS_RE = _make_weirdness_regex() - -# A few characters are common ending punctuation that can show up at the end -# of a mojibake sequence. It's plausible that such a character could appear -# after an accented capital letter, for example, so we'll want to add a -# slight preference to leave these characters alone. -# -# The match ends with a + so that we only give the bonus once for a -# consecutive sequence of these characters. -ENDING_PUNCT_RE = re.compile( - '[' - '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}' - '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}' - '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}' - '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}' - ']+' -) - -def sequence_weirdness(text): - """ - Determine how often a text has unexpected characters or sequences of - characters. This metric is used to disambiguate when text should be - re-decoded or left as is. - - We start by normalizing text in NFC form, so that penalties for - diacritical marks don't apply to characters that know what to do with - them. - - The following things are deemed weird: - - - Lowercase letters followed by non-ASCII uppercase letters - - Non-Latin characters next to Latin characters - - Un-combined diacritical marks, unless they're stacking on non-alphabetic - characters (in languages that do that kind of thing a lot) or other - marks - - C1 control characters - - Adjacent symbols from any different pair of these categories: - - - Modifier marks - - Letter modifiers - - Non-digit numbers - - Symbols (including math and currency) - - The return value is the number of instances of weirdness. - """ - text2 = unicodedata.normalize('NFC', text) - weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2))) - punct_discount = len(ENDING_PUNCT_RE.findall(text2)) - return weirdness * 2 - punct_discount - - -def text_cost(text): - """ - An overall cost function for text. Weirder is worse, but all else being - equal, shorter strings are better. - - The overall cost is measured as the "weirdness" (see - :func:`sequence_weirdness`) plus the length. - """ - return sequence_weirdness(text) + len(text) diff --git a/lib/ftfy/build_data.py b/lib/ftfy/build_data.py deleted file mode 100644 index f556b306..00000000 --- a/lib/ftfy/build_data.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -A script to make the char_classes.dat file. - -This never needs to run in normal usage. It needs to be run if the character -classes we care about change, or if a new version of Python supports a new -Unicode standard and we want it to affect our string decoding. - -The file that we generate is based on Unicode 6.1, as supported by Python 3.3. -You can certainly use it in earlier versions. This simply makes sure that we -get consistent results from running ftfy on different versions of Python. - -The file will be written to the current directory. -""" -from __future__ import unicode_literals -import unicodedata -import sys -import zlib -if sys.hexversion >= 0x03000000: - unichr = chr - -# L = Latin capital letter -# l = Latin lowercase letter -# A = Non-latin capital or title-case letter -# a = Non-latin lowercase letter -# C = Non-cased letter (Lo) -# X = Control character (Cc) -# m = Letter modifier (Lm) -# M = Mark (Mc, Me, Mn) -# N = Miscellaneous numbers (No) -# P = Private use (Co) -# 0 = Math symbol (Sm) -# 1 = Currency symbol (Sc) -# 2 = Symbol modifier (Sk) -# 3 = Other symbol (So) -# S = UTF-16 surrogate -# _ = Unassigned character -# = Whitespace -# o = Other - - -def make_char_data_file(do_it_anyway=False): - """ - Build the compressed data file 'char_classes.dat' and write it to the - current directory. - - If you run this, run it in Python 3.3 or later. It will run in earlier - versions, but you won't get the current Unicode standard, leading to - inconsistent behavior. To protect against this, running this in the - wrong version of Python will raise an error unless you pass - `do_it_anyway=True`. - """ - if sys.hexversion < 0x03030000 and not do_it_anyway: - raise RuntimeError( - "This function should be run in Python 3.3 or later." - ) - - cclasses = [None] * 0x110000 - for codepoint in range(0x0, 0x110000): - char = unichr(codepoint) - category = unicodedata.category(char) - - if category.startswith('L'): # letters - is_latin = unicodedata.name(char).startswith('LATIN') - if is_latin and codepoint < 0x200: - if category == 'Lu': - cclasses[codepoint] = 'L' - else: - cclasses[codepoint] = 'l' - else: # non-Latin letter, or close enough - if category == 'Lu' or category == 'Lt': - cclasses[codepoint] = 'A' - elif category == 'Ll': - cclasses[codepoint] = 'a' - elif category == 'Lo': - cclasses[codepoint] = 'C' - elif category == 'Lm': - cclasses[codepoint] = 'm' - else: - raise ValueError('got some weird kind of letter') - elif category.startswith('M'): # marks - cclasses[codepoint] = 'M' - elif category == 'No': - cclasses[codepoint] = 'N' - elif category == 'Sm': - cclasses[codepoint] = '0' - elif category == 'Sc': - cclasses[codepoint] = '1' - elif category == 'Sk': - cclasses[codepoint] = '2' - elif category == 'So': - cclasses[codepoint] = '3' - elif category == 'Cn': - cclasses[codepoint] = '_' - elif category == 'Cc': - cclasses[codepoint] = 'X' - elif category == 'Cs': - cclasses[codepoint] = 'S' - elif category == 'Co': - cclasses[codepoint] = 'P' - elif category.startswith('Z'): - cclasses[codepoint] = ' ' - else: - cclasses[codepoint] = 'o' - - cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' ' - out = open('char_classes.dat', 'wb') - out.write(zlib.compress(''.join(cclasses).encode('ascii'))) - out.close() - -if __name__ == '__main__': - make_char_data_file() diff --git a/lib/ftfy/char_classes.dat b/lib/ftfy/char_classes.dat deleted file mode 100644 index 84155cd5201b19cbdc66d6a977a356a1bf1732e8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3568 zcmeHHYg7{07IyC3TaA@Yr8!yZ^wO-9EZ>^eG>uv+llXq2QkMD3%tQgXJ+*vLzAJ&z z@`WPhBN9X!O-WHH2uTr{3^XM~P(TIb-deNP%$hapuK9b{n*HPKv-h|6I_LZLItN-I z=AA68>i>t%EBwMn|G+3mtNa3Jk;~5WORW4doHR2PTep6j!88cV8W?3m>4rt*?N zeP%P9`c{4EAZ>^y%e&!03DnfUOH<(RF^d(r?ayv8y6mdCNd^{Gvu-fe255tEM68Uh z)KxHqEfpzl%Cf_g2V3@Ubu?UNrHeyG>WPmpAf7ZAG_#ft`LtBoW`sJUhcZrO$4TqX zx9m(D$JdckN35sjVoBm5Y&J>!lsj5Vr&z|Z2MRn0P!~uD{LyRD`Mvjv#yeRh^5V%# zz&4f-#S&o3J`^iCXk|_C!v)-1w==_Lz{1N#c5&Jpe)KZZ1=C%yh$k21KXU z8&h1TEw2{pXEeIHoO zw?e-3EXksx7U)>BY%~O-(Pt%>x#zQOBDHTHIX9j? zrSqHK`D)76n{6w)D+2~0VOjf;nbQGCZ@cfEBDFRiMAD|=E9uopv=`MEQN9vo5OQzD zv*4ysl7xf^lSAEa-ECDN_s#afTZyIS{@w>CYq!iZuRMlO!y?8;POGlLhS{5j3i4+A*Bp z#{m3$_QCXA^Fkb*I035eYirmO>vH~7MHccw`OxS*ramH@o6AYOmb4PD#3fxNrb~0{ zl^a93tjUYp=$-bZ(wePsgJ9&d4r?Sk-3~=OMMKq>W z*t(e}@HQ3-!V6yXU>QG>h1iQTU+=9mu3@krR_MtUA$mKpJ;pt6Dh)z+I~k-Fu2_yI zR0j@WD`Rl#>x0`6ItHQzyr*mI71b6QX_>jhNjj(*y(NZ7S-GJbD4lFo47tz>sq9IU z-uZkpL)Yz0;Q@VfnGy5^@>YwBs=F^@lGr?}YU{0;7hW*-Hwz3x$q?nb&5ie0WT_pv zqy5Bpc@74y>$)>Ko4;W@V^!|R#s(J=;ujnFJK&~oeIcSk!x5W~q+GK*a?7%a2rySPiw`G1hBKr_L#=+<_UX{FM$BI*OJK;Lg5)e?`2L z$U`_j&<*i$PeTgwWy28(WTmxRKD~-npAMK_Ejf~0Ua$+Reu8%DK@acgo3gO z^u2r79cQGzAvF~?Ybz>6E}8ONK#2q-Wpy0qti03A2XQVWGDA(O)rJEqyJk-8i)rb~!`Nc3jOrSRR7qZ4q0_&{I=h$8$hg z%N72m@C%6}iDko#(yO&BqXYO-Q&cwSqHB%Usq$kY=aDn*^giFyhoTFe~Q|t z8d%&Kh=lr&`iZW+VbK7d;7AiUJWBGOvW7`lZ_hNb?^xiryN5G2Ph5tn>N?4a)gB6- z@JXaO4xq8-(mG|p5?xX$uo_w`CSvRJkO}_1j>7{i4`~?73^<<@2cMZbeP)?L=3fF% zfy1@k+%0Vf-@(6Hv{V$pQV{P=be`LX0o|_f#wqRk30i4??6rk)e}5gF#VLyF`kFG|ha&;mDO!K5F6>hs~E?WH6^5^`tRP zM0wR9LDS3VP>KOl;T%`w?b41A*2g4Qp^|^-e+{nHLlXK%(iDP0=i_UEN>;bvMD`5> z?Nhtl(0&J)a`DSg=RfDHAri3UHpJ5YtJUa`=va+A4Zy~pG@LZ;yk}Gs*7peQNqT*m z)pV$Q$cq2r0@SZZv-@dQ{nBxMnI7}rOE9wcz;o|#$TE1q`b;7$j+8O-joJ|3;U;*Q zsR_p17aP4D)s+vdXWN=7{R%}BTjIF{wme~bk9ZA@f4SbpGY~#8St#f%MgP@g8u*))so== diff --git a/lib/ftfy/chardata.py b/lib/ftfy/chardata.py deleted file mode 100644 index e853ed3e..00000000 --- a/lib/ftfy/chardata.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -""" -This gives other modules access to the gritty details about characters and the -encodings that use them. -""" - -from __future__ import unicode_literals -import re -import zlib -from pkg_resources import resource_string -from ftfy.compatibility import unichr - -# These are the five encodings we will try to fix in ftfy, in the -# order that they should be tried. -CHARMAP_ENCODINGS = [ - 'latin-1', - 'sloppy-windows-1252', - 'macroman', - 'cp437', - 'sloppy-windows-1251', -] - - -def _build_regexes(): - """ - ENCODING_REGEXES contain reasonably fast ways to detect if we - could represent a given string in a given encoding. The simplest one is - the 'ascii' detector, which of course just determines if all characters - are between U+0000 and U+007F. - """ - # Define a regex that matches ASCII text. - encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')} - - for encoding in CHARMAP_ENCODINGS: - latin1table = ''.join(unichr(i) for i in range(128, 256)) - charlist = latin1table.encode('latin-1').decode(encoding) - - # Build a regex from the ASCII range, followed by the decodings of - # bytes 0x80-0xff in this character set. (This uses the fact that all - # regex special characters are ASCII, and therefore won't appear in the - # string.) - regex = '^[\x00-\x7f{0}]*$'.format(charlist) - encoding_regexes[encoding] = re.compile(regex) - return encoding_regexes -ENCODING_REGEXES = _build_regexes() - - -def possible_encoding(text, encoding): - """ - Given text and a single-byte encoding, check whether that text could have - been decoded from that single-byte encoding. - - In other words, check whether it can be encoded in that encoding, possibly - sloppily. - """ - return bool(ENCODING_REGEXES[encoding].match(text)) - - -CHAR_CLASS_STRING = zlib.decompress( - resource_string(__name__, 'char_classes.dat') -).decode('ascii') - -def chars_to_classes(string): - """ - Convert each Unicode character to a letter indicating which of many - classes it's in. - - See build_data.py for where this data comes from and what it means. - """ - return string.translate(CHAR_CLASS_STRING) - - -# A translate mapping that will strip all C0 control characters except -# those that represent whitespace. -CONTROL_CHARS = {} -for i in range(32): - CONTROL_CHARS[i] = None - -# Map whitespace control characters to themselves. -for char in '\t\n\f\r': - del CONTROL_CHARS[ord(char)] diff --git a/lib/ftfy/cli.py b/lib/ftfy/cli.py deleted file mode 100644 index 6ac83706..00000000 --- a/lib/ftfy/cli.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -A simple command-line utility for fixing text found in a file. - -Because files do not come with their encoding marked, it first runs the file -through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`. -""" -from ftfy import fix_file - -import sys -ENCODE_STDOUT = (sys.hexversion < 0x03000000) - - -def main(): - """ - Run ftfy as a command-line utility. (Requires Python 2.7 or later, or - the 'argparse' module.) - """ - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='file to transcode') - - args = parser.parse_args() - - file = open(args.filename) - for line in fix_file(file): - if ENCODE_STDOUT: - sys.stdout.write(line.encode('utf-8')) - else: - sys.stdout.write(line) - - -if __name__ == '__main__': - main() diff --git a/lib/ftfy/compatibility.py b/lib/ftfy/compatibility.py deleted file mode 100644 index 1246248c..00000000 --- a/lib/ftfy/compatibility.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Makes some function names and behavior consistent between Python 2 and -Python 3, and also between narrow and wide builds. -""" -from __future__ import unicode_literals -import sys -import re -import unicodedata - -if sys.hexversion >= 0x03000000: - from html import entities - unichr = chr - xrange = range - PYTHON2 = False -else: - import htmlentitydefs as entities - unichr = unichr - xrange = xrange - PYTHON2 = True -htmlentitydefs = entities - -PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000) - - -def _narrow_unichr_workaround(codepoint): - """ - A replacement for unichr() on narrow builds of Python. This will get - us the narrow representation of an astral character, which will be - a string of length two, containing two UTF-16 surrogates. - """ - escaped = b'\\U%08x' % codepoint - return escaped.decode('unicode-escape') - - -if sys.maxunicode < 0x10000: - unichr = _narrow_unichr_workaround - # In a narrow build of Python, we can't write a regex involving astral - # characters. If we want to write the regex: - # - # [\U00100000-\U0010ffff] - # - # The actual string that defines it quietly turns into: - # - # [\udbc0\udc00-\udbff\udfff] - # - # And now the range operator only applies to the middle two characters. - # It looks like a range that's going backwards from \dc00 to \dbff, - # which is an error. - # - # What we can do instead is rewrite the expression to be _about_ the two - # surrogates that make up the astral characters, instead of the characters - # themselves. This would be wrong on a wide build, but it works on a - # narrow build. - UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]') -else: - UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]') - - -def bytes_to_ints(bytestring): - """ - No matter what version of Python this is, make a sequence of integers from - a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a - sequence of integers. - """ - if PYTHON2: - return [ord(b) for b in bytestring] - else: - return bytestring - - -def is_printable(char): - """ - str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so - let's make a crude approximation in Python 2. - """ - if PYTHON2: - return not unicodedata.category(char).startswith('C') - else: - return char.isprintable() diff --git a/lib/ftfy/fixes.py b/lib/ftfy/fixes.py deleted file mode 100644 index 8da51aa4..00000000 --- a/lib/ftfy/fixes.py +++ /dev/null @@ -1,473 +0,0 @@ -# -*- coding: utf-8 -*- -""" -This module contains the individual fixes that the main fix_text function -can perform. -""" - -from __future__ import unicode_literals -from ftfy.chardata import (possible_encoding, - CHARMAP_ENCODINGS, CONTROL_CHARS) -from ftfy.badness import text_cost -from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE -import re -import sys -import codecs - - -BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. - -ftfy is designed to fix problems that were introduced by handling Unicode -incorrectly. It might be able to fix the bytes you just handed it, but the -fact that you just gave a pile of bytes to a function that fixes text means -that your code is *also* handling Unicode incorrectly. - -ftfy takes Unicode text as input. You should take these bytes and decode -them from the encoding you think they are in. If you're not sure what encoding -they're in: - -- First, try to find out. 'utf-8' is a good assumption. -- If the encoding is simply unknowable, try running your bytes through - ftfy.guess_bytes. As the name implies, this may not always be accurate. - -If you're confused by this, please read the Python Unicode HOWTO: - - http://docs.python.org/%d/howto/unicode.html -""" % sys.version_info[0] - - -def fix_text_encoding(text): - r""" - Fix text with incorrectly-decoded garbage ("mojibake") whenever possible. - - Something you will find all over the place, in real-world text, is text - that's mistakenly encoded as utf-8, decoded in some ugly format like - latin-1 or even Windows codepage 1252, and encoded as utf-8 again. - - This causes your perfectly good Unicode-aware code to end up with garbage - text because someone else (or maybe "someone else") made a mistake. - - This function looks for the evidence of that having happened and fixes it. - It determines whether it should replace nonsense sequences of single-byte - characters that were really meant to be UTF-8 characters, and if so, turns - them into the correctly-encoded Unicode character that they were meant to - represent. - - The input to the function must be Unicode. If you don't have Unicode text, - you're not using the right tool to solve your problem. - - .. note:: - The following examples are written using unmarked literal strings, - but they are Unicode text. In Python 2 we have "unicode_literals" - turned on, and in Python 3 this is always the case. - - ftfy decodes text that looks like it was decoded incorrectly. It leaves - alone text that doesn't. - - >>> print(fix_text_encoding('único')) - único - - >>> print(fix_text_encoding('This text is fine already :þ')) - This text is fine already :þ - - Because these characters often come from Microsoft products, we allow - for the possibility that we get not just Unicode characters 128-255, but - also Windows's conflicting idea of what characters 128-160 are. - - >>> print(fix_text_encoding('This — should be an em dash')) - This — should be an em dash - - We might have to deal with both Windows characters and raw control - characters at the same time, especially when dealing with characters like - 0x81 that have no mapping in Windows. This is a string that Python's - standard `.encode` and `.decode` methods cannot correct. - - >>> print(fix_text_encoding('This text is sad .â\x81”.')) - This text is sad .⁔. - - However, it has safeguards against fixing sequences of letters and - punctuation that can occur in valid text: - - >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”')) - not such a fan of Charlotte Brontë…” - - Cases of genuine ambiguity can sometimes be addressed by finding other - characters that are not double-encoded, and expecting the encoding to - be consistent: - - >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®')) - AHÅ™, the new sofa from IKEA® - - Finally, we handle the case where the text is in a single-byte encoding - that was intended as Windows-1252 all along but read as Latin-1: - - >>> print(fix_text_encoding('This text was never UTF-8 at all\x85')) - This text was never UTF-8 at all… - - The best version of the text is found using - :func:`ftfy.badness.text_cost`. - """ - text, _plan = fix_encoding_and_explain(text) - return text - - -def fix_encoding_and_explain(text): - """ - Re-decodes text that has been decoded incorrectly, and also return a - "plan" indicating all the steps required to fix it. - - To fix similar text in the same way, without having to detect anything, - you can use the ``apply_plan`` function. - """ - best_version = text - best_cost = text_cost(text) - best_plan = [] - plan_so_far = [] - while True: - prevtext = text - text, plan = fix_one_step_and_explain(text) - plan_so_far.extend(plan) - cost = text_cost(text) - - # Add a penalty if we used a particularly obsolete encoding. The result - # is that we won't use these encodings unless they can successfully - # replace multiple characters. - if ('encode', 'macroman') in plan_so_far or\ - ('encode', 'cp437') in plan_so_far: - cost += 2 - - # We need pretty solid evidence to decode from Windows-1251 (Cyrillic). - if ('encode', 'sloppy-windows-1251') in plan_so_far: - cost += 5 - - if cost < best_cost: - best_cost = cost - best_version = text - best_plan = list(plan_so_far) - if text == prevtext: - return best_version, best_plan - - -def fix_one_step_and_explain(text): - """ - Performs a single step of re-decoding text that's been decoded incorrectly. - - Returns the decoded text, plus a "plan" for how to reproduce what it - did. - """ - if isinstance(text, bytes): - raise UnicodeError(BYTES_ERROR_TEXT) - if len(text) == 0: - return text, [] - - # The first plan is to return ASCII text unchanged. - if possible_encoding(text, 'ascii'): - return text, [] - - # As we go through the next step, remember the possible encodings - # that we encounter but don't successfully fix yet. We may need them - # later. - possible_1byte_encodings = [] - - # Suppose the text was supposed to be UTF-8, but it was decoded using - # a single-byte encoding instead. When these cases can be fixed, they - # are usually the correct thing to do, so try them next. - for encoding in CHARMAP_ENCODINGS: - if possible_encoding(text, encoding): - encoded_bytes = text.encode(encoding) - - # Now, find out if it's UTF-8 (or close enough). Otherwise, - # remember the encoding for later. - try: - decoding = 'utf-8' - if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes: - decoding = 'utf-8-variants' - fixed = encoded_bytes.decode(decoding) - steps = [('encode', encoding), ('decode', decoding)] - return fixed, steps - except UnicodeDecodeError: - possible_1byte_encodings.append(encoding) - - # The next most likely case is that this is Latin-1 that was intended to - # be read as Windows-1252, because those two encodings in particular are - # easily confused. - if 'latin-1' in possible_1byte_encodings: - if 'windows-1252' in possible_1byte_encodings: - # This text is in the intersection of Latin-1 and - # Windows-1252, so it's probably legit. - return text, [] - else: - # Otherwise, it means we have characters that are in Latin-1 but - # not in Windows-1252. Those are C1 control characters. Nobody - # wants those. Assume they were meant to be Windows-1252. Don't - # use the sloppy codec, because bad Windows-1252 characters are - # a bad sign. - encoded = text.encode('latin-1') - try: - fixed = encoded.decode('windows-1252') - steps = [] - if fixed != text: - steps = [('encode', 'latin-1'), ('decode', 'windows-1252')] - return fixed, steps - except UnicodeDecodeError: - # This text contained characters that don't even make sense - # if you assume they were supposed to be Windows-1252. In - # that case, let's not assume anything. - pass - - # The cases that remain are mixups between two different single-byte - # encodings, and not the common case of Latin-1 vs. Windows-1252. - # - # Those cases are somewhat rare, and impossible to solve without false - # positives. If you're in one of these situations, you should try using - # the `ftfy.guess_bytes` function. - - # Return the text unchanged; the plan is empty. - return text, [] - - -def apply_plan(text, plan): - """ - Apply a plan for fixing the encoding of text. - - The plan is a list of tuples of the form (operation, encoding), where - `operation` is either 'encode' or 'decode', and `encoding` is an encoding - name such as 'utf-8' or 'latin-1'. - - Because only text can be encoded, and only bytes can be decoded, the plan - should alternate 'encode' and 'decode' steps, or else this function will - encounter an error. - """ - obj = text - for operation, encoding in plan: - if operation == 'encode': - obj = obj.encode(encoding) - elif operation == 'decode': - obj = obj.decode(encoding) - else: - raise ValueError("Unknown plan step: %s" % operation) - - return obj - - -HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};") - - -def unescape_html(text): - """ - Decode all three types of HTML entities/character references. - - Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change - to it for efficiency: it won't match entities longer than 8 characters, - because there are no valid entities like that. - - >>> print(unescape_html('<tag>')) - - """ - def fixup(match): - """ - Replace one matched HTML entity with the character it represents, - if possible. - """ - text = match.group(0) - if text[:2] == "&#": - # character reference - try: - if text[:3] == "&#x": - return unichr(int(text[3:-1], 16)) - else: - return unichr(int(text[2:-1])) - except ValueError: - pass - else: - # named entity - try: - text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) - except KeyError: - pass - return text # leave as is - return HTML_ENTITY_RE.sub(fixup, text) - - -ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])') - -def remove_terminal_escapes(text): - r""" - Strip out "ANSI" terminal escape sequences, such as those that produce - colored text on Unix. - - >>> print(remove_terminal_escapes( - ... "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m" - ... )) - I'm blue, da ba dee da ba doo... - """ - return ANSI_RE.sub('', text) - - -SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]') -DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]') - -def uncurl_quotes(text): - r""" - Replace curly quotation marks with straight equivalents. - - >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d')) - "here's a test" - """ - return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text)) - - -def fix_line_breaks(text): - r""" - Convert all line breaks to Unix style. - - This will convert the following sequences into the standard \\n - line break: - - - CRLF (\\r\\n), used on Windows and in some communication - protocols - - CR (\\r), once used on Mac OS Classic, and now kept alive - by misguided software such as Microsoft Office for Mac - - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), - defined by Unicode and used to sow confusion and discord - - NEXT LINE (\\x85), a C1 control character that is certainly - not what you meant - - The NEXT LINE character is a bit of an odd case, because it - usually won't show up if `fix_encoding` is also being run. - \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS. - - >>> print(fix_line_breaks( - ... "This string is made of two things:\u2029" - ... "1. Unicode\u2028" - ... "2. Spite" - ... )) - This string is made of two things: - 1. Unicode - 2. Spite - - For further testing and examples, let's define a function to make sure - we can see the control characters in their escaped form: - - >>> def eprint(text): - ... print(text.encode('unicode-escape').decode('ascii')) - - >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi.")) - Content-type: text/plain\n\nHi. - - >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users")) - This is how Microsoft \n trolls Mac users - - >>> eprint(fix_line_breaks("What is this \x85 I don't even")) - What is this \n I don't even - """ - return text.replace('\r\n', '\n').replace('\r', '\n')\ - .replace('\u2028', '\n').replace('\u2029', '\n')\ - .replace('\u0085', '\n') - - -def remove_control_chars(text): - """ - Remove all control characters except for the important ones. - - This removes characters in these ranges: - - - U+0000 to U+0008 - - U+000B - - U+000E to U+001F - - U+007F - - It leaves alone these characters that are commonly used for formatting: - - - TAB (U+0009) - - LF (U+000A) - - FF (U+000C) - - CR (U+000D) - """ - return text.translate(CONTROL_CHARS) - - -def remove_bom(text): - r""" - Remove a left-over byte-order mark. - - >>> print(remove_bom("\ufeffWhere do you want to go today?")) - Where do you want to go today? - """ - return text.lstrip(unichr(0xfeff)) - - -def remove_unsafe_private_use(text): - r""" - Python 3.3's Unicode support isn't perfect, and in fact there are certain - string operations that will crash some versions of it with a SystemError: - http://bugs.python.org/issue18183 - - The best solution is to remove all characters from Supplementary Private - Use Area B, using a regex that is known not to crash given those - characters. - - These are the characters from U+100000 to U+10FFFF. It's sad to lose an - entire plane of Unicode, but on the other hand, these characters are not - assigned and never will be. If you get one of these characters and don't - know what its purpose is, its purpose is probably to crash your code. - - If you were using these for actual private use, this might be inconvenient. - You can turn off this fixer, of course, but I kind of encourage using - Supplementary Private Use Area A instead. - - >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000')) - 💩 - - This fixer is off by default in Python 3.4 or later. (The bug is actually - fixed in 3.3.3 and 2.7.6, but I don't want the default behavior to change - based on a micro version upgrade of Python.) - """ - return UNSAFE_PRIVATE_USE_RE.sub('', text) - - -# Define a regex to match valid escape sequences in Python string literals. -ESCAPE_SEQUENCE_RE = re.compile(r''' - ( \\U........ # 8-digit hex escapes - | \\u.... # 4-digit hex escapes - | \\x.. # 2-digit hex escapes - | \\[0-7]{1,3} # Octal escapes - | \\N\{[^}]+\} # Unicode characters by name - | \\[\\'"abfnrtv] # Single-character escapes - )''', re.UNICODE | re.VERBOSE) - - -def decode_escapes(text): - r""" - Decode backslashed escape sequences, including \\x, \\u, and \\U character - references, even in the presence of other Unicode. - - This is what Python's "string-escape" and "unicode-escape" codecs were - meant to do, but in contrast, this actually works. It will decode the - string exactly the same way that the Python interpreter decodes its string - literals. - - >>> factoid = '\\u20a1 is the currency symbol for the colón.' - >>> print(factoid[1:]) - u20a1 is the currency symbol for the colón. - >>> print(decode_escapes(factoid)) - ₡ is the currency symbol for the colón. - - Even though Python itself can read string literals with a combination of - escapes and literal Unicode -- you're looking at one right now -- the - "unicode-escape" codec doesn't work on literal Unicode. (See - http://stackoverflow.com/a/24519338/773754 for more details.) - - Instead, this function searches for just the parts of a string that - represent escape sequences, and decodes them, leaving the rest alone. All - valid escape sequences are made of ASCII characters, and this allows - "unicode-escape" to work correctly. - - This fix cannot be automatically applied by the `ftfy.fix_text` function, - because escaped text is not necessarily a mistake, and there is no way - to distinguish text that's supposed to be escaped from text that isn't. - """ - def decode_match(match): - "Given a regex match, decode the escape sequence it contains." - return codecs.decode(match.group(0), 'unicode-escape') - - return ESCAPE_SEQUENCE_RE.sub(decode_match, text) diff --git a/lib/ftfy/streamtester/__init__.py b/lib/ftfy/streamtester/__init__.py deleted file mode 100644 index 4b5c0614..00000000 --- a/lib/ftfy/streamtester/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -This file defines a general method for evaluating ftfy using data that arrives -in a stream. A concrete implementation of it is found in `twitter_tester.py`. -""" -from __future__ import print_function, unicode_literals -from ftfy.fixes import fix_text_encoding -from ftfy.chardata import possible_encoding - - -class StreamTester: - """ - Take in a sequence of texts, and show the ones that will be changed by - ftfy. This will also periodically show updates, such as the proportion of - texts that changed. - """ - def __init__(self): - self.num_fixed = 0 - self.count = 0 - - def check_ftfy(self, text): - """ - Given a single text input, check whether `ftfy.fix_text_encoding` - would change it. If so, display the change. - """ - self.count += 1 - if not possible_encoding(text, 'ascii'): - fixed = fix_text_encoding(text) - if text != fixed: - # possibly filter common bots before printing - print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format( - text=text, fixed=fixed - )) - self.num_fixed += 1 - - # Print status updates once in a while - if self.count % 100 == 0: - print('.', end='', flush=True) - if self.count % 10000 == 0: - print('\n%d/%d fixed' % (self.num_fixed, self.count)) diff --git a/lib/ftfy/streamtester/oauth.py b/lib/ftfy/streamtester/oauth.py deleted file mode 100644 index 8e300ed7..00000000 --- a/lib/ftfy/streamtester/oauth.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -""" -Do what is necessary to authenticate this tester as a Twitter "app", using -somebody's Twitter account. -""" -from __future__ import unicode_literals -import os - - -AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth') - -def get_auth(): - """ - Twitter has some bizarre requirements about how to authorize an "app" to - use its API. - - The user of the app has to log in to get a secret token. That's fine. But - the app itself has its own "consumer secret" token. The app has to know it, - and the user of the app has to not know it. - - This is, of course, impossible. It's equivalent to DRM. Your computer can't - *really* make use of secret information while hiding the same information - from you. - - The threat appears to be that, if you have this super-sekrit token, you can - impersonate the app while doing something different. Well, of course you - can do that, because you *have the source code* and you can change it to do - what you want. You still have to log in as a particular user who has a - token that's actually secret, you know. - - Even developers of closed-source applications that use the Twitter API are - unsure what to do, for good reason. These "secrets" are not secret in any - cryptographic sense. A bit of Googling shows that the secret tokens for - every popular Twitter app are already posted on the Web. - - Twitter wants us to pretend this string can be kept secret, and hide this - secret behind a fig leaf like everybody else does. So that's what we've - done. - """ - - from twitter.oauth import OAuth - from twitter import oauth_dance, read_token_file - - def unhide(secret): - """ - Do something mysterious and exactly as secure as every other Twitter - app. - """ - return ''.join([chr(ord(c) - 0x2800) for c in secret]) - - fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁' - consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw' - - if os.path.exists(AUTH_TOKEN_PATH): - token, token_secret = read_token_file(AUTH_TOKEN_PATH) - else: - authdir = os.path.dirname(AUTH_TOKEN_PATH) - if not os.path.exists(authdir): - os.makedirs(authdir) - token, token_secret = oauth_dance( - app_name='ftfy-tester', - consumer_key=consumer_key, - consumer_secret=unhide(fig_leaf), - token_filename=AUTH_TOKEN_PATH - ) - - return OAuth( - token=token, - token_secret=token_secret, - consumer_key=consumer_key, - consumer_secret=unhide(fig_leaf) - ) - diff --git a/lib/ftfy/streamtester/twitter_tester.py b/lib/ftfy/streamtester/twitter_tester.py deleted file mode 100644 index 6ad125ee..00000000 --- a/lib/ftfy/streamtester/twitter_tester.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Implements a StreamTester that runs over Twitter data. See the class -docstring. - -This module is written for Python 3 only. The __future__ imports you see here -are just to let Python 2 scan the file without crashing with a SyntaxError. -""" -from __future__ import print_function, unicode_literals -import os -from collections import defaultdict -from ftfy.streamtester import StreamTester - - -class TwitterTester(StreamTester): - """ - This class uses the StreamTester code (defined in `__init__.py`) to - evaluate ftfy's real-world performance, by feeding it live data from - Twitter. - - This is a semi-manual evaluation. It requires a human to look at the - results and determine if they are good. The three possible cases we - can see here are: - - - Success: the process takes in mojibake and outputs correct text. - - False positive: the process takes in correct text, and outputs - mojibake. Every false positive should be considered a bug, and - reported on GitHub if it isn't already. - - Confusion: the process takes in mojibake and outputs different - mojibake. Not a great outcome, but not as dire as a false - positive. - - This tester cannot reveal false negatives. So far, that can only be - done by the unit tests. - """ - OUTPUT_DIR = './twitterlogs' - - def __init__(self): - self.lines_by_lang = defaultdict(list) - super().__init__() - - def save_files(self): - """ - When processing data from live Twitter, save it to log files so that - it can be replayed later. - """ - if not os.path.exists(self.OUTPUT_DIR): - os.makedirs(self.OUTPUT_DIR) - for lang, lines in self.lines_by_lang.items(): - filename = 'tweets.{}.txt'.format(lang) - fullname = os.path.join(self.OUTPUT_DIR, filename) - langfile = open(fullname, 'a') - for line in lines: - print(line.replace('\n', ' '), file=langfile) - langfile.close() - self.lines_by_lang = defaultdict(list) - - def run_sample(self): - """ - Listen to live data from Twitter, and pass on the fully-formed tweets - to `check_ftfy`. This requires the `twitter` Python package as a - dependency. - """ - from twitter import TwitterStream - from ftfy.streamtester.oauth import get_auth - twitter_stream = TwitterStream(auth=get_auth()) - iterator = twitter_stream.statuses.sample() - for tweet in iterator: - if 'text' in tweet: - self.check_ftfy(tweet['text']) - if 'user' in tweet: - lang = tweet['user'].get('lang', 'NONE') - self.lines_by_lang[lang].append(tweet['text']) - if self.count % 10000 == 100: - self.save_files() - - -def main(): - """ - When run from the command line, this script connects to the Twitter stream - and runs the TwitterTester on it forever. Or at least until the stream - drops. - """ - tester = TwitterTester() - tester.run_sample() - - -if __name__ == '__main__': - main() - diff --git a/sickbeard/encodingKludge.py b/sickbeard/encodingKludge.py index de1fd499..a3d94541 100644 --- a/sickbeard/encodingKludge.py +++ b/sickbeard/encodingKludge.py @@ -17,53 +17,71 @@ # along with SickRage. If not, see . import os +import traceback import sickbeard from sickbeard import logger -import ftfy -import ftfy.bad_codecs +import six +import chardet + # This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8 # encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions # which return something should always return unicode. -def fixStupidEncodings(x, silent=False): - if type(x) == str: - try: - return str(ftfy.fix_text(u'' + x)).decode(sickbeard.SYS_ENCODING) - except UnicodeDecodeError: - logger.log(u"Unable to decode value: " + repr(x), logger.ERROR) +def toUnicode(x): + try: + if isinstance(x, unicode): return x - except UnicodeEncodeError: - logger.log(u"Unable to encode value: " + repr(x), logger.ERROR) - return x - elif type(x) == unicode: - return x - else: - logger.log( - u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")", - logger.DEBUG if silent else logger.ERROR) + else: + try: + return six.text_type(x) + except: + try: + if chardet.detect(x).get('encoding') == 'utf-8': + return x.decode('utf-8') + if isinstance(x, str): + try: + return x.decode(sickbeard.SYS_ENCODING) + except UnicodeDecodeError: + raise + return x + except: + raise + except: + logger.log('Unable to decode value "%s..." : %s ' % (repr(x)[:20], traceback.format_exc()), logger.WARNING) + ascii_text = str(x).encode('string_escape') + return toUnicode(ascii_text) +def ss(x): + u_x = toUnicode(x) + + try: + return u_x.encode(sickbeard.SYS_ENCODING) + except Exception as e: + logger.log('Failed ss encoding char, force UTF8: %s' % e, logger.WARNING) + try: + return u_x.encode(sickbeard.SYS_ENCODING, 'replace') + except: + return u_x.encode('utf-8', 'replace') def fixListEncodings(x): - if type(x) != list and type(x) != tuple: + if not isinstance(x, (list, tuple)): return x else: - return filter(lambda x: x != None, map(fixStupidEncodings, x)) + return filter(lambda x: x != None, map(toUnicode, x)) def ek(func, *args, **kwargs): if os.name == 'nt': result = func(*args, **kwargs) else: - result = func( - *[fixStupidEncodings(x).encode(sickbeard.SYS_ENCODING) if type(x) in (str, unicode) else x for x in args], - **kwargs) + result = func(*[ss(x) if isinstance(x, (str, unicode)) else x for x in args], **kwargs) - if type(result) in (list, tuple): + if isinstance(result, (list, tuple)): return fixListEncodings(result) - elif type(result) == str: - return fixStupidEncodings(result) + elif isinstance(result, str): + return toUnicode(result) else: return result diff --git a/sickbeard/exceptions.py b/sickbeard/exceptions.py index 3a81bfa7..c209ee74 100644 --- a/sickbeard/exceptions.py +++ b/sickbeard/exceptions.py @@ -16,7 +16,7 @@ # You should have received a copy of the GNU General Public License # along with SickRage. If not, see . -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode def ex(e): """ @@ -32,11 +32,11 @@ def ex(e): if arg is not None: if isinstance(arg, (str, unicode)): - fixed_arg = fixStupidEncodings(arg, True) + fixed_arg = toUnicode(arg, True) else: try: - fixed_arg = u"error " + fixStupidEncodings(str(arg), True) + fixed_arg = u"error " + toUnicode(str(arg), True) except: fixed_arg = None diff --git a/sickbeard/failed_history.py b/sickbeard/failed_history.py index 32c3a8cd..0fc1484b 100644 --- a/sickbeard/failed_history.py +++ b/sickbeard/failed_history.py @@ -26,7 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException from sickbeard.history import dateFormat from sickbeard.common import Quality from sickbeard.common import WANTED, FAILED -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode def prepareFailedName(release): """Standardizes release name for failed DB""" @@ -36,7 +36,7 @@ def prepareFailedName(release): fixed = fixed.rpartition(".")[0] fixed = re.sub("[\.\-\+\ ]", "_", fixed) - fixed = fixStupidEncodings(fixed) + fixed = toUnicode(fixed) return fixed diff --git a/sickbeard/history.py b/sickbeard/history.py index 45f15f96..cb1a8486 100644 --- a/sickbeard/history.py +++ b/sickbeard/history.py @@ -20,7 +20,7 @@ import db import datetime from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode dateFormat = "%Y%m%d%H%M%S" @@ -28,7 +28,7 @@ dateFormat = "%Y%m%d%H%M%S" def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1): logDate = datetime.datetime.today().strftime(dateFormat) - resource = fixStupidEncodings(resource) + resource = toUnicode(resource) myDB = db.DBConnection() myDB.action( diff --git a/sickbeard/notifiers/emailnotify.py b/sickbeard/notifiers/emailnotify.py index ff412f2c..1dac6758 100644 --- a/sickbeard/notifiers/emailnotify.py +++ b/sickbeard/notifiers/emailnotify.py @@ -29,7 +29,7 @@ import sickbeard from sickbeard import logger, common from sickbeard import db -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode from sickbeard.exceptions import ex @@ -51,7 +51,7 @@ class EmailNotifier: ep_name: The name of the episode that was snatched title: The title of the notification (optional) """ - ep_name = fixStupidEncodings(ep_name) + ep_name = toUnicode(ep_name) if sickbeard.EMAIL_NOTIFY_ONSNATCH: show = self._parseEp(ep_name) @@ -86,7 +86,7 @@ class EmailNotifier: ep_name: The name of the episode that was downloaded title: The title of the notification (optional) """ - ep_name = fixStupidEncodings(ep_name) + ep_name = toUnicode(ep_name) if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD: show = self._parseEp(ep_name) @@ -121,7 +121,7 @@ class EmailNotifier: ep_name: The name of the episode that was downloaded lang: Subtitle language wanted """ - ep_name = fixStupidEncodings(ep_name) + ep_name = toUnicode(ep_name) if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD: show = self._parseEp(ep_name) @@ -198,7 +198,7 @@ class EmailNotifier: return False def _parseEp(self, ep_name): - ep_name = fixStupidEncodings(ep_name) + ep_name = toUnicode(ep_name) sep = " - " titles = ep_name.split(sep) diff --git a/sickbeard/notifiers/plex.py b/sickbeard/notifiers/plex.py index f3d51c9f..3d9f8717 100644 --- a/sickbeard/notifiers/plex.py +++ b/sickbeard/notifiers/plex.py @@ -25,7 +25,7 @@ import sickbeard from sickbeard import logger from sickbeard import common from sickbeard.exceptions import ex -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode from sickbeard.notifiers.xbmc import XBMCNotifier diff --git a/sickbeard/notifiers/xbmc.py b/sickbeard/notifiers/xbmc.py index 6feca826..49683bba 100644 --- a/sickbeard/notifiers/xbmc.py +++ b/sickbeard/notifiers/xbmc.py @@ -26,7 +26,7 @@ import sickbeard from sickbeard import logger from sickbeard import common from sickbeard.exceptions import ex -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode try: @@ -236,9 +236,9 @@ class XBMCNotifier: base64string = base64.encodestring('%s:%s' % (username, password))[:-1] authheader = "Basic %s" % base64string req.add_header("Authorization", authheader) - logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG) + logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG) else: - logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG) + logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG) response = urllib2.urlopen(req) result = response.read().decode(sickbeard.SYS_ENCODING) @@ -248,7 +248,7 @@ class XBMCNotifier: return result except (urllib2.URLError, IOError), e: - logger.log(u"Warning: Couldn't contact XBMC HTTP at " + fixStupidEncodings(url) + " " + ex(e), + logger.log(u"Warning: Couldn't contact XBMC HTTP at " + toUnicode(url) + " " + ex(e), logger.WARNING) return False @@ -379,9 +379,9 @@ class XBMCNotifier: base64string = base64.encodestring('%s:%s' % (username, password))[:-1] authheader = "Basic %s" % base64string req.add_header("Authorization", authheader) - logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG) + logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG) else: - logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG) + logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG) try: response = urllib2.urlopen(req) @@ -401,7 +401,7 @@ class XBMCNotifier: return False except IOError, e: - logger.log(u"Warning: Couldn't contact XBMC JSON API at " + fixStupidEncodings(url) + " " + ex(e), + logger.log(u"Warning: Couldn't contact XBMC JSON API at " + toUnicode(url) + " " + ex(e), logger.WARNING) return False diff --git a/sickbeard/nzbSplitter.py b/sickbeard/nzbSplitter.py index 6b60c20c..39d1df63 100644 --- a/sickbeard/nzbSplitter.py +++ b/sickbeard/nzbSplitter.py @@ -29,7 +29,7 @@ from sickbeard import encodingKludge as ek from sickbeard.exceptions import ex from name_parser.parser import NameParser, InvalidNameException, InvalidShowException -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode def getSeasonNZBs(name, urlData, season): @@ -85,7 +85,7 @@ def createNZBString(fileElements, xmlns): for curFile in fileElements: rootElement.append(stripNS(curFile, xmlns)) - return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement)) + return xml.etree.ElementTree.tostring(toUnicode(rootElement)) def saveNZB(nzbName, nzbString): diff --git a/sickbeard/scene_exceptions.py b/sickbeard/scene_exceptions.py index 7ca5f977..f4154449 100644 --- a/sickbeard/scene_exceptions.py +++ b/sickbeard/scene_exceptions.py @@ -27,7 +27,7 @@ from sickbeard import helpers from sickbeard import name_cache from sickbeard import logger from sickbeard import db -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode exception_dict = {} anidb_exception_dict = {} @@ -234,7 +234,7 @@ def retrieve_exceptions(): # if this exception isn't already in the DB then add it if cur_exception not in existing_exceptions: - cur_exception = fixStupidEncodings(cur_exception) + cur_exception = toUnicode(cur_exception) myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)", [cur_indexer_id, cur_exception, curSeason]) @@ -267,7 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1): exceptionsCache[indexer_id][season] = scene_exceptions for cur_exception in scene_exceptions: - cur_exception = fixStupidEncodings(cur_exception) + cur_exception = toUnicode(cur_exception) myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)", [indexer_id, cur_exception, season]) diff --git a/sickbeard/show_name_helpers.py b/sickbeard/show_name_helpers.py index e408d535..736cbf42 100644 --- a/sickbeard/show_name_helpers.py +++ b/sickbeard/show_name_helpers.py @@ -234,7 +234,7 @@ def isGoodResult(name, show, log=True, season=-1): all_show_names = allPossibleShowNames(show, season=season) showNames = map(sanitizeSceneName, all_show_names) + all_show_names - showNames += map(unidecode, all_show_names) + showNames += map(ek.toUnicode, all_show_names) for curName in set(showNames): if not show.is_anime: diff --git a/sickbeard/tvcache.py b/sickbeard/tvcache.py index 01d3453e..dec8280d 100644 --- a/sickbeard/tvcache.py +++ b/sickbeard/tvcache.py @@ -33,7 +33,7 @@ from sickbeard.exceptions import AuthException from sickbeard.rssfeeds import RSSFeeds from sickbeard import clients from name_parser.parser import NameParser, InvalidNameException, InvalidShowException -from sickbeard.encodingKludge import fixStupidEncodings +from sickbeard.encodingKludge import toUnicode class CacheDBConnection(db.DBConnection): def __init__(self, providerName): @@ -263,7 +263,7 @@ class TVCache(): # get quality of release quality = parse_result.quality - name = fixStupidEncodings(name) + name = toUnicode(name) # get release group release_group = parse_result.release_group diff --git a/sickbeard/webserve.py b/sickbeard/webserve.py index bfc73f49..08277811 100644 --- a/sickbeard/webserve.py +++ b/sickbeard/webserve.py @@ -3288,7 +3288,7 @@ class ErrorLogs(MainHandler): for x in reversed(data): - x = ek.fixStupidEncodings(x) + x = ek.toUnicode(x) match = re.match(regex, x) if match: diff --git a/tests/all_tests.py b/tests/all_tests.py index dfe63c9e..28bdeb63 100644 --- a/tests/all_tests.py +++ b/tests/all_tests.py @@ -18,23 +18,27 @@ # You should have received a copy of the GNU General Public License # along with SickRage. If not, see . +import glob +import unittest +import sys + +class AllTests(unittest.TestCase): + def setUp(self): + self.test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__] + self.module_strings = [file_string[0:len(file_string) - 3] for file_string in self.test_file_strings] + self.suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in self.module_strings] + self.testSuite = unittest.TestSuite(self.suites) + + def testAll(self): + print "==================" + print "STARTING - ALL TESTS" + print "==================" + for includedfiles in self.test_file_strings: + print "- " + includedfiles + + text_runner = unittest.TextTestRunner().run(self.testSuite) + if not text_runner.wasSuccessful(): + sys.exit(-1) + if __name__ == "__main__": - import glob - import unittest - import sys - - test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__] - module_strings = [file_string[0:len(file_string) - 3] for file_string in test_file_strings] - suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in module_strings] - testSuite = unittest.TestSuite(suites) - - print "==================" - print "STARTING - ALL TESTS" - print "==================" - print "this will include" - for includedfiles in test_file_strings: - print "- " + includedfiles - - text_runner = unittest.TextTestRunner().run(testSuite) - if not text_runner.wasSuccessful(): - sys.exit(-1) + unittest.main() \ No newline at end of file diff --git a/tests/common_tests.py b/tests/common_tests.py index de620965..19b4632e 100644 --- a/tests/common_tests.py +++ b/tests/common_tests.py @@ -8,7 +8,6 @@ sys.path.append(os.path.abspath('../lib')) from sickbeard import common - class QualityTests(unittest.TestCase): # TODO: repack / proper ? air-by-date ? season rip? multi-ep? diff --git a/tests/test_lib.py b/tests/test_lib.py index 7f956b67..201d2182 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -51,7 +51,6 @@ EPISODE = 2 FILENAME = u"show name - s0" + str(SEASON) + "e0" + str(EPISODE) + ".mkv" FILEDIR = os.path.join(TESTDIR, SHOWNAME) FILEPATH = os.path.join(FILEDIR, FILENAME) - SHOWDIR = os.path.join(TESTDIR, SHOWNAME + " final") #sickbeard.logger.sb_log_instance = sickbeard.logger.SBRotatingLogHandler(os.path.join(TESTDIR, 'sickbeard.log'), sickbeard.logger.NUM_LOGS, sickbeard.logger.LOG_SIZE)