mirror of
https://github.com/moparisthebest/SickRage
synced 2025-01-07 11:58:01 -05:00
Merge branch 'release/v3.2.1'
This commit is contained in:
commit
1510ac32ca
@ -1,351 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
ftfy: fixes text for you
|
||||
|
||||
This is a module for making text less broken. See the `fix_text` function
|
||||
for more information.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# See the docstring for ftfy.bad_codecs to see what we're doing here.
|
||||
import ftfy.bad_codecs
|
||||
ftfy.bad_codecs.ok()
|
||||
|
||||
from ftfy import fixes
|
||||
from ftfy.fixes import fix_text_encoding
|
||||
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
|
||||
import unicodedata
|
||||
import warnings
|
||||
|
||||
|
||||
def fix_text(text,
             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True,
             max_decode_length=2**16):
    r"""
    Given Unicode text as input, make its representation consistent and
    possibly less broken.

    Let's start with some examples:

        >>> print(fix_text('ünicode'))
        ünicode

        >>> print(fix_text('Broken text… it’s flubberific!'))
        Broken text... it's flubberific!

        >>> print(fix_text('HTML entities <3'))
        HTML entities <3

        >>> print(fix_text('<em>HTML entities <3</em>'))
        <em>HTML entities <3</em>

        >>> print(fix_text('\001\033[36;44mI’m blue, da ba dee da ba '
        ...                'doo…\033[0m'))
        I'm blue, da ba dee da ba doo...

        >>> # This example string starts with a byte-order mark, even if
        >>> # you can't see it on the Web.
        >>> print(fix_text('\ufeffParty like\nit’s 1999!'))
        Party like
        it's 1999!

        >>> len(fix_text('ﬁ' * 100000))
        200000

        >>> len(fix_text(''))
        0

    Based on the options you provide, ftfy applies these steps in order:

    - If `remove_unsafe_private_use` is True, it removes a range of
      private-use characters that could trigger a Python bug. The bug is
      fixed in the most recent versions of Python, so this defaults to False
      starting on Python 3.4.
    - If `fix_entities` is True, replace HTML entities with their equivalent
      characters. If it's "auto" (the default), then consider replacing HTML
      entities, but don't do so in text where you have seen a pair of actual
      angle brackets (that's probably actually HTML and you shouldn't mess
      with the entities). Once a line containing both '<' and '>' has been
      seen, entities are left alone on that line and on every later line.
    - If `remove_terminal_escapes` is True, remove sequences of bytes that are
      instructions for Unix terminals, such as the codes that make text appear
      in different colors.
    - If `fix_encoding` is True, look for common mistakes that come from
      encoding or decoding Unicode text incorrectly, and fix them if they are
      reasonably fixable. See `fix_text_encoding` for details.
    - If `normalization` is not None, apply the specified form of Unicode
      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
      The default, 'NFKC', applies the following relevant transformations:

      - C: Combine characters and diacritics that are written using separate
        code points, such as converting "e" plus an acute accent modifier
        into "é", or converting "ka" (か) plus a dakuten into the
        single character "ga" (が).
      - K: Replace characters that are functionally equivalent with the most
        common form. For example, half-width katakana will be replaced with
        full-width versions, full-width Roman characters will be replaced with
        ASCII characters, ellipsis characters will be replaced with three
        periods, and the ligature 'ﬂ' will be replaced with 'fl'.

    - If `uncurl_quotes` is True, replace various curly quotation marks with
      plain-ASCII straight quotes.
    - If `fix_line_breaks` is true, convert all line breaks to Unix style
      (CRLF and CR line breaks become LF line breaks).
    - If `remove_control_chars` is true, remove all C0 control characters
      except the common useful ones: TAB, CR, LF, and FF. (CR characters
      may have already been removed by the `fix_line_breaks` step.)
    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
    - If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".

    `fix_text` will work one line at a time, with the possibility that some
    lines are in different encodings. When it encounters lines longer than
    `max_decode_length`, it will not run the `fix_encoding` step, to avoid
    unbounded slowdowns.

    If you are certain your entire text is in the same encoding (though that
    encoding is possibly flawed), and do not mind performing operations on
    the whole text at once, use `fix_text_segment`.

    Raises UnicodeError if `text` is a byte string.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    fixed_pieces = []
    start = 0
    text_length = len(text)
    while start < text_length:
        # Each piece runs through its trailing newline. The last piece may
        # not have one, in which case it runs to the end of the text.
        newline_index = text.find('\n', start)
        end = text_length if newline_index == -1 else newline_index + 1
        piece = text[start:end]

        # Overlong lines skip the encoding fix to avoid unbounded slowdowns.
        if (end - start) > max_decode_length:
            fix_encoding_here = False
        else:
            fix_encoding_here = fix_encoding

        if fix_entities == 'auto' and '<' in piece and '>' in piece:
            # Angle brackets together suggest HTML. The setting is rebound,
            # so entities stay untouched for the rest of the text as well.
            fix_entities = False

        fixed_pieces.append(
            fix_text_segment(
                piece,
                remove_unsafe_private_use=remove_unsafe_private_use,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=fix_encoding_here,
                normalization=normalization,
                uncurl_quotes=uncurl_quotes,
                fix_line_breaks=fix_line_breaks,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom
            )
        )
        start = end

    return ''.join(fixed_pieces)
|
||||
|
||||
# Alias: expose `fix_text` under the name `ftfy` as well.
ftfy = fix_text
|
||||
|
||||
|
||||
def fix_file(input_file,
             remove_unsafe_private_use=True,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True):
    """
    Fix text that is found in a file, yielding one fixed line at a time.

    `input_file` is iterated line by line. Lines that are already Unicode
    text are used as they are. Lines that are bytes have to be decoded by
    guessing their encoding; we try a few common encodings, but make no
    promises. See the `guess_bytes` function for how this is done.

    With `fix_entities='auto'`, HTML entities stop being unescaped from the
    first line that contains both '<' and '>' onward. The remaining options
    are passed straight to `fix_text_segment`.

    This is a generator: the output is a stream of fixed lines of text.
    """
    unescape_entities = fix_entities
    for raw_line in input_file:
        if isinstance(raw_line, bytes):
            # Only the decoded text is needed; the guessed codec name is not.
            line, _ = guess_bytes(raw_line)
        else:
            line = raw_line

        if fix_entities == 'auto' and '<' in line and '>' in line:
            # Looks like HTML: stop unescaping for this and all later lines.
            unescape_entities = False

        yield fix_text_segment(
            line,
            remove_unsafe_private_use=remove_unsafe_private_use,
            fix_entities=unescape_entities,
            remove_terminal_escapes=remove_terminal_escapes,
            fix_encoding=fix_encoding,
            normalization=normalization,
            uncurl_quotes=uncurl_quotes,
            fix_line_breaks=fix_line_breaks,
            remove_control_chars=remove_control_chars,
            remove_bom=remove_bom
        )
|
||||
|
||||
|
||||
def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    The enabled fixes run in a fixed order, and the whole sequence is
    repeated until a pass leaves the text unchanged.

    See `fix_text` for a description of the parameters.

    Raises UnicodeError if `text` is a byte string.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        # Angle brackets together suggest HTML; leave its entities alone.
        fix_entities = False

    # Collect the enabled fixes once, in the order they must be applied.
    pipeline = []
    if remove_unsafe_private_use:
        pipeline.append(fixes.remove_unsafe_private_use)
    if fix_entities:
        pipeline.append(fixes.unescape_html)
    if remove_terminal_escapes:
        pipeline.append(fixes.remove_terminal_escapes)
    if fix_encoding:
        pipeline.append(fixes.fix_text_encoding)
    if normalization is not None:
        pipeline.append(
            lambda value: unicodedata.normalize(normalization, value)
        )
    if uncurl_quotes:
        pipeline.append(fixes.uncurl_quotes)
    if fix_line_breaks:
        pipeline.append(fixes.fix_line_breaks)
    if remove_control_chars:
        pipeline.append(fixes.remove_control_chars)
    if remove_bom:
        pipeline.append(fixes.remove_bom)

    # Keep making passes until one of them is a no-op.
    while True:
        before = text
        for apply_fix in pipeline:
            text = apply_fix(text)
        if text == before:
            return text
|
||||
|
||||
|
||||
def guess_bytes(bstring):
    """
    Decode bytes in an unknown encoding by trying a few common encodings
    that can be distinguished from each other.

    Returns a pair: the decoded text, and the name of the encoding that was
    used to decode it.

    This is not a magic bullet. If the bytes are coming from some MySQL
    database with the "character set" set to ISO Elbonian, this won't figure
    it out. Perhaps more relevantly, this currently doesn't try East Asian
    encodings.

    The encodings we try are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global de facto standard
    - "utf-8-variants", because it's what people actually implement when they
      think they're doing UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding
    """
    if bstring.startswith((b'\xfe\xff', b'\xff\xfe')):
        return bstring.decode('utf-16'), 'utf-16'

    # Unpacking a byte string gives values of the same type that iterating
    # over `bytes(bstring)` gives, so the membership tests below line up.
    byteset = set(bytes(bstring))
    byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'

    # Byte 0xed can be used to encode a range of codepoints that are UTF-16
    # surrogates. UTF-8 does not use UTF-16 surrogates, so when we see 0xed,
    # it's very likely we're being asked to decode CESU-8, the variant that
    # encodes UTF-16 surrogates instead of the original characters.
    #
    # This will occasionally trigger on standard UTF-8, as there are some
    # Korean characters that also use byte 0xed, but that's not harmful.
    #
    # Byte 0xc0 is impossible because, numerically, it would only encode
    # characters lower than U+0040. Those already have single-byte
    # representations, and UTF-8 requires using the shortest possible
    # representation. However, Java hides the null codepoint, U+0000, in a
    # non-standard longer representation -- it encodes it as 0xc0 0x80
    # instead of 0x00, guaranteeing that 0x00 will never appear in the
    # encoded bytes.
    #
    # The 'utf-8-variants' decoder can handle both of these cases, as well
    # as standard UTF-8, at the cost of a bit of speed.
    if byte_ed in byteset or byte_c0 in byteset:
        utf8_flavor = 'utf-8-variants'
    else:
        utf8_flavor = 'utf-8'

    try:
        return bstring.decode(utf8_flavor), utf8_flavor
    except UnicodeDecodeError:
        # Not any kind of UTF-8; fall through to the single-byte guesses.
        pass

    # A CR with no LF anywhere is taken as a sign of MacRoman line breaks.
    if byte_CR in bstring and byte_LF not in bstring:
        fallback = 'macroman'
    else:
        fallback = 'sloppy-windows-1252'
    return bstring.decode(fallback), fallback
|
||||
|
||||
|
||||
def explain_unicode(text):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It prints one line per codepoint of `text`, showing its number in
    hexadecimal, its glyph (or an escape sequence if it is not printable),
    its category in the Unicode standard, and its name in the Unicode
    standard. Nothing is returned.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028 (       [Ps] LEFT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0 °       [So] DEGREE SIGN
        U+25A1 □       [So] WHITE SQUARE
        U+00B0 °       [So] DEGREE SIGN
        U+0029 )       [Pe] RIGHT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35 ︵       [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020         [Zs] SPACE
        U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    line_template = 'U+{code:04X} {display:<7} [{category}] {name}'
    for char in text:
        # Unprintable characters are shown as an ASCII escape sequence.
        if is_printable(char):
            shown = char
        else:
            shown = char.encode('unicode-escape').decode('ascii')

        print(line_template.format(
            code=ord(char),
            display=shown,
            category=unicodedata.category(char),
            name=unicodedata.name(char, '<unknown>')
        ))
|
||||
|
||||
|
||||
def fix_bad_encoding(text):
    """
    Deprecated name for `fix_text_encoding`, kept for compatibility with
    previous versions of ftfy.

    Emits a DeprecationWarning, then returns `fix_text_encoding(text)`.
    """
    warnings.warn(
        'fix_bad_encoding is now known as fix_text_encoding',
        DeprecationWarning,
        # Attribute the warning to the caller's line rather than to this
        # shim, so users can find the call they need to update.
        stacklevel=2
    )
    return fix_text_encoding(text)
|
@ -1,94 +0,0 @@
|
||||
# coding: utf-8
|
||||
r"""
|
||||
Give Python the ability to decode some common, flawed encodings.
|
||||
|
||||
Python does not want you to be sloppy with your text. Its encoders and decoders
|
||||
("codecs") follow the relevant standards whenever possible, which means that
|
||||
when you get text that *doesn't* follow those standards, you'll probably fail
|
||||
to decode it. Or you might succeed at decoding it for implementation-specific
|
||||
reasons, which is perhaps worse.
|
||||
|
||||
There are some encodings out there that Python wishes didn't exist, which are
|
||||
widely used outside of Python:
|
||||
|
||||
- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
|
||||
ever-popular CESU-8 and "Java modified UTF-8".
|
||||
- "Sloppy" versions of character map encodings, where bytes that don't map to
|
||||
anything will instead map to the Unicode character with the same number.
|
||||
|
||||
Simply importing this module, or in fact any part of the `ftfy` package, will
|
||||
make these new "bad codecs" available to Python through the standard Codecs
|
||||
API. You never have to actually call any functions inside `ftfy.bad_codecs`.
|
||||
|
||||
However, if you want to call something because your code checker insists on it,
|
||||
you can call ``ftfy.bad_codecs.ok()``.
|
||||
|
||||
A quick example of decoding text that's encoded in CESU-8:
|
||||
|
||||
>>> import ftfy.bad_codecs
|
||||
>>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
|
||||
😍
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
from encodings import normalize_encoding
|
||||
import codecs
|
||||
|
||||
# Maps encoding names, exactly as passed to `search_function`, to the codec
# that was found for them, so repeated lookups skip the search.
_CACHE = {}
|
||||
|
||||
# Accepted spellings of 'utf-8-variants'. They are written with underscores
# because names are compared after going through `normalize_encoding`, which
# turns all hyphens into underscores.
UTF8_VAR_NAMES = (
    'utf_8_variants',
    'utf8_variants',
    'utf_8_variant',
    'utf8_variant',
    'utf_8_var',
    'utf8_var',
    'cesu_8',
    'cesu8',
    'java_utf_8',
    'java_utf8'
)
|
||||
|
||||
|
||||
def search_function(encoding):
    """
    Look up one of our "bad codecs" by name, for Python's codecs API.

    This is a codec search function: it takes in an encoding name, and
    returns a codec for that encoding if it knows one, or None if it doesn't.
    Successful lookups are remembered in `_CACHE` under the name as given.

    The encodings this will match are:

    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
      where the non-sloppy version is an encoding that leaves some bytes
      unmapped to characters.
    - The 'utf-8-variants' encoding, under any of the aliases listed in
      `UTF8_VAR_NAMES`.
    """
    try:
        return _CACHE[encoding]
    except KeyError:
        pass

    normalized = normalize_encoding(encoding)

    # The codec modules are imported only when a matching name is requested.
    if normalized in UTF8_VAR_NAMES:
        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
        found = CODEC_INFO
    elif normalized.startswith('sloppy_'):
        from ftfy.bad_codecs.sloppy import CODECS
        found = CODECS.get(normalized)
    else:
        found = None

    if found is None:
        # Misses are not cached.
        return None

    _CACHE[encoding] = found
    return found
|
||||
|
||||
|
||||
def ok():
    """
    Do nothing; this exists so there is something to call after importing
    this package.

    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
    and appear not to use it. It doesn't know that you're using it when
    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
    encodings.
    """
    return None
|
||||
|
||||
|
||||
# Runs at import time: install `search_function` as a codec search function.
codecs.register(search_function)
|
@ -1,156 +0,0 @@
|
||||
# coding: utf-8
|
||||
r"""
|
||||
Decodes single-byte encodings, filling their "holes" in the same messy way that
|
||||
everyone else does.
|
||||
|
||||
A single-byte encoding maps each byte to a Unicode character, except that some
|
||||
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
|
||||
example, bytes 0x81 and 0x8D, among others, have no meaning.
|
||||
|
||||
Python, wanting to preserve some sense of decorum, will handle these bytes
|
||||
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
|
||||
different from each other. It just hasn't defined what they are in terms of
|
||||
Unicode.
|
||||
|
||||
Software that has to interoperate with Windows-1252 and Unicode -- such as all
|
||||
the common Web browsers -- will pick some Unicode characters for them to map
|
||||
to, and the characters they pick are the Unicode characters with the same
|
||||
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
|
||||
resulting characters tend to fall into a range of Unicode that's set aside for
|
||||
obsolete Latin-1 control characters anyway.
|
||||
|
||||
These sloppy codecs let Python do the same thing, thus interoperating with
|
||||
other software that works this way. It defines a sloppy version of many
|
||||
single-byte encodings with holes. (There is no need for a sloppy version of
|
||||
an encoding without holes: for example, there is no such thing as
|
||||
sloppy-iso-8859-2 or sloppy-macroman.)
|
||||
|
||||
The following encodings will become defined:
|
||||
|
||||
- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
|
||||
- sloppy-windows-1251 (Cyrillic)
|
||||
- sloppy-windows-1252 (Western European, based on Latin-1)
|
||||
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
|
||||
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
|
||||
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
|
||||
- sloppy-windows-1256 (Arabic)
|
||||
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
|
||||
- sloppy-windows-1258 (Vietnamese)
|
||||
- sloppy-cp874 (Thai, based on ISO-8859-11)
|
||||
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
|
||||
- sloppy-iso-8859-6 (different Arabic)
|
||||
- sloppy-iso-8859-7 (Greek)
|
||||
- sloppy-iso-8859-8 (Hebrew)
|
||||
- sloppy-iso-8859-11 (Thai)
|
||||
|
||||
Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
|
||||
defined.
|
||||
|
||||
Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
|
||||
the rest are rather uncommon.
|
||||
|
||||
Here are some examples, using `ftfy.explain_unicode` to illustrate how
|
||||
sloppy-windows-1252 merges Windows-1252 with Latin-1:
|
||||
|
||||
>>> from ftfy import explain_unicode
|
||||
>>> some_bytes = b'\x80\x81\x82'
|
||||
>>> explain_unicode(some_bytes.decode('latin-1'))
|
||||
U+0080 \x80 [Cc] <unknown>
|
||||
U+0081 \x81 [Cc] <unknown>
|
||||
U+0082 \x82 [Cc] <unknown>
|
||||
|
||||
>>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
|
||||
U+20AC € [Sc] EURO SIGN
|
||||
U+FFFD <EFBFBD> [So] REPLACEMENT CHARACTER
|
||||
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
|
||||
|
||||
>>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
|
||||
U+20AC € [Sc] EURO SIGN
|
||||
U+0081 \x81 [Cc] <unknown>
|
||||
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
import codecs
|
||||
from encodings import normalize_encoding
|
||||
|
||||
# U+FFFD. `make_sloppy_codec` compares 'replace'-mode decoder output against
# this character to spot bytes the real encoding leaves unassigned.
REPLACEMENT_CHAR = '\ufffd'
|
||||
|
||||
|
||||
def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    A byte that `encoding` leaves unassigned is mapped to the character it
    would decode to in Latin-1; every other byte keeps its usual meaning.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes and returns them
    wrapped in a `codecs.CodecInfo` named 'sloppy-' + `encoding`.
    """
    # All 256 possible bytes, in order.
    every_byte = bytearray(range(256))

    # Start from the Latin-1 meaning of each byte, then overwrite it with the
    # real encoding's character wherever the real encoding assigns one.
    # Unassigned bytes decode to the replacement character and are skipped,
    # which leaves their Latin-1 value in place.
    table_chars = list(every_byte.decode('latin-1'))
    for position, char in enumerate(every_byte.decode(encoding, 'replace')):
        if char == REPLACEMENT_CHAR:
            continue
        table_chars[position] = char

    # The data structures that tell the charmap functions how to decode and
    # encode in this sloppy encoding.
    decoding_table = ''.join(table_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # The class boilerplate below follows the pattern of the standard
    # library's `encodings.cp1252`.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            encoded = codecs.charmap_encode(
                input, self.errors, encoding_table
            )
            return encoded[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            decoded = codecs.charmap_decode(
                input, self.errors, decoding_table
            )
            return decoded[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
|
||||
|
||||
# Build one sloppy codec per encoding that has unassigned bytes. CODECS maps
# each normalized 'sloppy_...' name to its codec, and can be used by the main
# module of ftfy.bad_codecs.
CODECS = {}

# The Windows code pages are listed under both their 'windows-NNNN' and
# 'cpNNNN' names, so both spellings get a sloppy counterpart.
INCOMPLETE_ENCODINGS = []
INCOMPLETE_ENCODINGS += ['windows-%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)]
INCOMPLETE_ENCODINGS += ['cp%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['cp874']

for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)
|
@ -1,281 +0,0 @@
|
||||
r"""
|
||||
This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
|
||||
decode text that's been encoded with a popular non-standard version of UTF-8.
|
||||
This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
|
||||
UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
|
||||
codepoint 0.
|
||||
|
||||
This is particularly relevant in Python 3, which provides no other way of
|
||||
decoding CESU-8 or Java's encoding. [1]
|
||||
|
||||
The easiest way to use the codec is to simply import `ftfy.bad_codecs`:
|
||||
|
||||
>>> import ftfy.bad_codecs
|
||||
>>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
|
||||
>>> print(repr(result).lstrip('u'))
|
||||
'here comes a null! \x00'
|
||||
|
||||
The codec does not at all enforce "correct" CESU-8. For example, the Unicode
|
||||
Consortium's not-quite-standard describing CESU-8 requires that there is only
|
||||
one possible encoding of any character, so it does not allow mixing of valid
|
||||
UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
|
||||
decoder does.
|
||||
|
||||
Characters in the Basic Multilingual Plane still have only one encoding. This
|
||||
codec still enforces the rule, within the BMP, that characters must appear in
|
||||
their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
|
||||
instead of just `0x00`, may be used to encode the null character `U+0000`, like
|
||||
in Java.
|
||||
|
||||
If you encode with this codec, you get legitimate UTF-8. Decoding with this
|
||||
codec and then re-encoding is not idempotent, although encoding and then
|
||||
decoding is. So this module won't produce CESU-8 for you. Look for that
|
||||
functionality in the sister module, "Breaks Text For You", coming approximately
|
||||
never.
|
||||
|
||||
[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
|
||||
decode the bytes (incorrectly), then encode them, then decode them again, using
|
||||
UTF-8 as the codec every time.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
|
||||
from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
|
||||
IncrementalEncoder as UTF8IncrementalEncoder)
|
||||
import re
|
||||
import codecs
|
||||
|
||||
# Primary name of the codec this module defines.
# NOTE(review): presumably used as the codec's registered name -- confirm
# against the CodecInfo definition later in the file.
NAME = 'utf-8-variants'
|
||||
# This regular expression matches all possible six-byte CESU-8 sequences.
|
||||
# Layout: the first three bytes are the UTF-8-style encoding of a high
# surrogate (U+D800 to U+DBFF), and the last three that of a low surrogate
# (U+DC00 to U+DFFF).
CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')
|
||||
|
||||
|
||||
class IncrementalDecoder(UTF8IncrementalDecoder):
|
||||
"""
|
||||
An incremental decoder that extends Python's built-in UTF-8 decoder.
|
||||
|
||||
This decoder needs to take in bytes, possibly arriving in a stream, and
|
||||
output the correctly decoded text. The general strategy for doing this
|
||||
is to fall back on the real UTF-8 decoder whenever possible, because
|
||||
the real UTF-8 decoder is way optimized, but to call specialized methods
|
||||
we define here for the cases the real decoder isn't expecting.
|
||||
"""
|
||||
def _buffer_decode(self, input, errors, final):
|
||||
"""
|
||||
Decode bytes that may be arriving in a stream, following the Codecs
|
||||
API.
|
||||
|
||||
`input` is the incoming sequence of bytes. `errors` tells us how to
|
||||
handle errors, though we delegate all error-handling cases to the real
|
||||
UTF-8 decoder to ensure correct behavior. `final` indicates whether
|
||||
this is the end of the sequence, in which case we should raise an
|
||||
error given incomplete input.
|
||||
|
||||
Returns as much decoded text as possible, and the number of bytes
|
||||
consumed.
|
||||
"""
|
||||
# decoded_segments are the pieces of text we have decoded so far,
|
||||
# and position is our current position in the byte string. (Bytes
|
||||
# before this position have been consumed, and bytes after it have
|
||||
# yet to be decoded.)
|
||||
decoded_segments = []
|
||||
position = 0
|
||||
while True:
|
||||
# Use _buffer_decode_step to decode a segment of text.
|
||||
decoded, consumed = self._buffer_decode_step(
|
||||
input[position:],
|
||||
errors,
|
||||
final
|
||||
)
|
||||
if consumed == 0:
|
||||
# Either there's nothing left to decode, or we need to wait
|
||||
# for more input. Either way, we're done for now.
|
||||
break
|
||||
|
||||
# Append the decoded text to the list, and update our position.
|
||||
decoded_segments.append(decoded)
|
||||
position += consumed
|
||||
|
||||
if final:
|
||||
# _buffer_decode_step must consume all the bytes when `final` is
|
||||
# true.
|
||||
assert position == len(input)
|
||||
|
||||
return ''.join(decoded_segments), position
|
||||
|
||||
def _buffer_decode_step(self, input, errors, final):
|
||||
"""
|
||||
There are three possibilities for each decoding step:
|
||||
|
||||
- Decode as much real UTF-8 as possible.
|
||||
- Decode a six-byte CESU-8 sequence at the current position.
|
||||
- Decode a Java-style null at the current position.
|
||||
|
||||
This method figures out which step is appropriate, and does it.
|
||||
"""
|
||||
# Get a reference to the superclass method that we'll be using for
|
||||
# most of the real work.
|
||||
sup = UTF8IncrementalDecoder._buffer_decode
|
||||
|
||||
# Find the next byte position that indicates a variant of UTF-8.
|
||||
# CESU-8 sequences always start with 0xed, and Java nulls always
|
||||
# start with 0xc0, both of which are conveniently impossible in
|
||||
# real UTF-8.
|
||||
cutoff1 = input.find(b'\xed')
|
||||
cutoff2 = input.find(b'\xc0')
|
||||
|
||||
# Set `cutoff` to whichever cutoff comes first.
|
||||
if cutoff1 != -1 and cutoff2 != -1:
|
||||
cutoff = min(cutoff1, cutoff2)
|
||||
elif cutoff1 != -1:
|
||||
cutoff = cutoff1
|
||||
elif cutoff2 != -1:
|
||||
cutoff = cutoff2
|
||||
else:
|
||||
# The entire input can be decoded as UTF-8, so just do so.
|
||||
return sup(input, errors, final)
|
||||
|
||||
if cutoff1 == 0:
|
||||
# Decode a possible six-byte sequence starting with 0xed.
|
||||
return self._buffer_decode_surrogates(sup, input, errors, final)
|
||||
elif cutoff2 == 0:
|
||||
# Decode a possible two-byte sequence, 0xc0 0x80.
|
||||
return self._buffer_decode_null(sup, input, errors, final)
|
||||
else:
|
||||
# Decode the bytes up until the next weird thing as UTF-8.
|
||||
# Set final=True because 0xc0 and 0xed don't make sense in the
|
||||
# middle of a sequence, in any variant.
|
||||
return sup(input[:cutoff], errors, True)
|
||||
|
||||
@staticmethod
|
||||
def _buffer_decode_null(sup, input, errors, final):
|
||||
"""
|
||||
Decode the bytes 0xc0 0x80 as U+0000, like Java does.
|
||||
"""
|
||||
nextbyte = input[1:2]
|
||||
if nextbyte == b'':
|
||||
if final:
|
||||
# We found 0xc0 at the end of the stream, which is an error.
|
||||
# Delegate to the superclass method to handle that error.
|
||||
return sup(input, errors, final)
|
||||
else:
|
||||
# We found 0xc0 and we don't know what comes next, so consume
|
||||
# no bytes and wait.
|
||||
return '', 0
|
||||
elif nextbyte == b'\x80':
|
||||
# We found the usual 0xc0 0x80 sequence, so decode it and consume
|
||||
# two bytes.
|
||||
return '\u0000', 2
|
||||
else:
|
||||
# We found 0xc0 followed by something else, which is an error.
|
||||
# Whatever should happen is equivalent to what happens when the
|
||||
# superclass is given just the byte 0xc0, with final=True.
|
||||
return sup(b'\xc0', errors, True)
|
||||
|
||||
@staticmethod
def _buffer_decode_surrogates(sup, input, errors, final):
    """
    Handle input that starts with the byte 0xed, which may begin a six-byte
    CESU-8 sequence (a surrogate pair encoded as two three-byte sequences).

    Improperly encoded surrogates still show the bits they were meant to
    represent. The pair stands for a 20-bit number, to which 0x10000 is
    added to get a codepoint. Those 20 bits appear in this form:

        11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

    CESU8_RE matches byte sequences of this form; when it matches, the bits
    are extracted and assembled into a codepoint number.

    Returns a pair of (decoded text, number of bytes consumed).
    """
    if len(input) < 6:
        if not final:
            # The stream isn't over and we don't have enough bytes to decide
            # anything yet, so consume zero bytes and wait.
            return '', 0

        # 0xed appears near the end of the stream without six bytes to
        # decode. Hand it to the superclass as normal UTF-8: it might be a
        # Hangul character, or an error.
        if PYTHON2 and len(input) >= 3:
            # Python 2 can't be trusted to raise an error when asked to
            # decode a surrogate, so force the issue.
            input = mangle_surrogates(input)
        return sup(input, errors, final)

    if not CESU8_RE.match(input):
        # Not a CESU-8 sequence after all. 0xed starts a three-byte
        # sequence, so give three bytes to the superclass to decode as
        # usual, with the same Python 2 workaround as above.
        if PYTHON2:
            input = mangle_surrogates(input)
        return sup(input[:3], errors, False)

    # A genuine CESU-8 sequence: pull the 20-bit value out of the six bytes
    # and consume them.
    six = bytes_to_ints(input[:6])
    high_part = ((six[1] & 0x0f) << 16) + ((six[2] & 0x3f) << 10)
    low_part = ((six[4] & 0x0f) << 6) + (six[5] & 0x3f)
    return unichr(high_part + low_part + 0x10000), 6
|
||||
|
||||
|
||||
def mangle_surrogates(bytestring):
    """
    On Python 2, replace UTF-8-encoded surrogates at the start of
    `bytestring` with bytes that are certain not to decode.

    When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
    it as an error (which it is). In 'replace' mode, it will decode as three
    replacement characters. But Python 2 will just output the surrogate
    codepoint.

    To ensure consistency between Python 2 and Python 3, and protect
    downstream applications from malformed strings, each surrogate sequence
    in the run at the start of the string is turned into the bytes
    `ff ff ff`, which won't decode, and which turn into three replacement
    characters in 'replace' mode. The rest of the string is returned as is.

    On Python 3, the input is returned unchanged.
    """
    if not PYTHON2:
        # On Python 3, nothing needs to be done.
        return bytestring

    # Scan forward three bytes at a time. This is done with a loop rather
    # than one recursive call per surrogate, so that a long run of encoded
    # surrogates cannot exceed the interpreter's recursion limit.
    position = 0
    while (bytestring.startswith(b'\xed', position)
           and len(bytestring) - position >= 3):
        decoded = bytestring[position:position + 3].decode('utf-8', 'replace')
        if not ('\ud800' <= decoded <= '\udfff'):
            break
        position += 3

    if position == 0:
        return bytestring
    return b'\xff' * position + bytestring[position:]
|
||||
|
||||
# The encoder is identical to UTF-8.
|
||||
IncrementalEncoder = UTF8IncrementalEncoder
|
||||
|
||||
|
||||
# Everything below here is boilerplate that matches the modules in the
|
||||
# built-in `encodings` package.
|
||||
def encode(input, errors='strict'):
    """
    Encode `input` in a single step with this codec's incremental encoder.

    Returns a pair of (encoded output, length of the input).
    """
    encoder = IncrementalEncoder(errors)
    output = encoder.encode(input, final=True)
    return output, len(input)
|
||||
|
||||
|
||||
def decode(input, errors='strict'):
    """
    Decode `input` in a single step with this codec's incremental decoder.

    Returns a pair of (decoded output, length of the input).
    """
    decoder = IncrementalDecoder(errors)
    output = decoder.decode(input, final=True)
    return output, len(input)
|
||||
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
    """
    Stream writer that encodes with this module's `encode` function.
    """
    # A plain function assigned in a class body becomes a method, so a call
    # made through a writer instance would pass the writer itself as `input`.
    # Wrapping it in `staticmethod` keeps the (input, errors) signature.
    encode = staticmethod(encode)
|
||||
|
||||
|
||||
class StreamReader(codecs.StreamReader):
    """
    Stream reader that decodes with this module's `decode` function.
    """
    # A plain function assigned in a class body becomes a method, so a call
    # made through a reader instance would pass the reader itself as `input`.
    # Wrapping it in `staticmethod` keeps the (input, errors) signature.
    decode = staticmethod(decode)
|
||||
|
||||
|
||||
CODEC_INFO = codecs.CodecInfo(
|
||||
name=NAME,
|
||||
encode=encode,
|
||||
decode=decode,
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
)
|
@ -1,144 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Heuristics to determine whether re-encoding text is actually making it
|
||||
more reasonable.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from ftfy.chardata import chars_to_classes
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
# The following regex uses the mapping of character classes to ASCII
|
||||
# characters defined in chardata.py and build_data.py:
|
||||
#
|
||||
# L = Latin capital letter
|
||||
# l = Latin lowercase letter
|
||||
# A = Non-latin capital or title-case letter
|
||||
# a = Non-latin lowercase letter
|
||||
# C = Non-cased letter (Lo)
|
||||
# X = Control character (Cc)
|
||||
# m = Letter modifier (Lm)
|
||||
# M = Mark (Mc, Me, Mn)
|
||||
# N = Miscellaneous numbers (No)
# P = Private use (Co)
|
||||
# 0 = Math symbol (Sm)
|
||||
# 1 = Currency symbol (Sc)
|
||||
# 2 = Symbol modifier (Sk)
|
||||
# 3 = Other symbol (So)
|
||||
# S = UTF-16 surrogate
|
||||
# _ = Unassigned character
|
||||
# = Whitespace
|
||||
# o = Other
|
||||
|
||||
|
||||
def _make_weirdness_regex():
    """
    Build a single compiled regex whose alternatives each match a 'weird'
    sequence of character classes.

    The regex is meant to be run over the output of `chars_to_classes`.
    The more matches there are, the weirder the text is.
    """
    # Categories that should not sit next to a *different* member of the
    # same list: modifier marks (M), letter modifiers (m), miscellaneous
    # numbers (N), and symbols (0123).
    exclusive_categories = 'MmN0123'

    patterns = [
        # Lowercase letters followed by non-ASCII uppercase letters.
        'lA',

        # Diacritical marks, except when they modify a non-cased letter or
        # another mark.
        #
        # You wouldn't put a diacritical mark on a digit or a space, for
        # example. You might put it on a Latin letter, but then there is
        # almost always a pre-composed version, and we normalize to
        # pre-composed versions first. The cases that can't be pre-composed
        # tend to be in large scripts without case, which are in class C.
        '[^CM]M',

        # Non-Latin characters adjacent to Latin characters.
        #
        # This is a simplification from ftfy version 2, which compared all
        # adjacent scripts. However, the ambiguities we need to resolve come
        # from encodings designed to represent Latin characters.
        '[Ll][AaC]',
        '[AaC][Ll]',

        # C1 control characters, which are almost always the result of
        # decoding Latin-1 that was meant to be Windows-1252.
        'X',

        # Private use and unassigned characters.
        'P',
        '_',
    ]

    # One alternative per exclusive category: that category followed by any
    # of the other exclusive categories.
    patterns.extend(
        '{0}[{1}]'.format(category, exclusive_categories.replace(category, ''))
        for category in exclusive_categories
    )

    return re.compile('|'.join('(' + pattern + ')' for pattern in patterns))
|
||||
|
||||
WEIRDNESS_RE = _make_weirdness_regex()
|
||||
|
||||
# A few characters are common ending punctuation that can show up at the end
|
||||
# of a mojibake sequence. It's plausible that such a character could appear
|
||||
# after an accented capital letter, for example, so we'll want to add a
|
||||
# slight preference to leave these characters alone.
|
||||
#
|
||||
# The match ends with a + so that we only give the bonus once for a
|
||||
# consecutive sequence of these characters.
|
||||
ENDING_PUNCT_RE = re.compile(
|
||||
'['
|
||||
'\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
|
||||
'\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
|
||||
'\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
|
||||
'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
|
||||
']+'
|
||||
)
|
||||
|
||||
def sequence_weirdness(text):
    """
    Measure how often a text contains unexpected characters or sequences of
    characters. This metric is used to decide whether text should be
    re-decoded or left as is.

    The text is first normalized to NFC form, so that penalties for
    diacritical marks don't apply to characters that know what to do with
    them.

    The following things are deemed weird (see `WEIRDNESS_RE`):

    - Lowercase letters followed by non-ASCII uppercase letters
    - Non-Latin characters next to Latin characters
    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
      characters (in languages that do that kind of thing a lot) or other
      marks
    - C1 control characters
    - Private use and unassigned characters
    - Adjacent symbols from any different pair of these categories:

        - Modifier marks
        - Letter modifiers
        - Non-digit numbers
        - Symbols (including math and currency)

    Each instance of weirdness counts for two points, and each run of
    characters matched by `ENDING_PUNCT_RE` takes one point off. The return
    value is that total.
    """
    normalized = unicodedata.normalize('NFC', text)
    class_string = chars_to_classes(normalized)
    weird_count = len(WEIRDNESS_RE.findall(class_string))
    punct_runs = len(ENDING_PUNCT_RE.findall(normalized))
    return weird_count * 2 - punct_runs
|
||||
|
||||
|
||||
def text_cost(text):
    """
    Compute an overall cost for a text: lower is better.

    Weirder text costs more, and all else being equal, longer text costs
    more than shorter text. The cost is the "weirdness" of the text (see
    :func:`sequence_weirdness`) plus its length.
    """
    weirdness = sequence_weirdness(text)
    length = len(text)
    return weirdness + length
|
@ -1,111 +0,0 @@
|
||||
"""
|
||||
A script to make the char_classes.dat file.
|
||||
|
||||
This never needs to run in normal usage. It needs to be run if the character
|
||||
classes we care about change, or if a new version of Python supports a new
|
||||
Unicode standard and we want it to affect our string decoding.
|
||||
|
||||
The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
|
||||
You can certainly use it in earlier versions. This simply makes sure that we
|
||||
get consistent results from running ftfy on different versions of Python.
|
||||
|
||||
The file will be written to the current directory.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
import unicodedata
|
||||
import sys
|
||||
import zlib
|
||||
if sys.hexversion >= 0x03000000:
|
||||
unichr = chr
|
||||
|
||||
# L = Latin capital letter
|
||||
# l = Latin lowercase letter
|
||||
# A = Non-latin capital or title-case letter
|
||||
# a = Non-latin lowercase letter
|
||||
# C = Non-cased letter (Lo)
|
||||
# X = Control character (Cc)
|
||||
# m = Letter modifier (Lm)
|
||||
# M = Mark (Mc, Me, Mn)
|
||||
# N = Miscellaneous numbers (No)
|
||||
# P = Private use (Co)
|
||||
# 0 = Math symbol (Sm)
|
||||
# 1 = Currency symbol (Sc)
|
||||
# 2 = Symbol modifier (Sk)
|
||||
# 3 = Other symbol (So)
|
||||
# S = UTF-16 surrogate
|
||||
# _ = Unassigned character
|
||||
# = Whitespace
|
||||
# o = Other
|
||||
|
||||
|
||||
def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    The file holds one ASCII class character per codepoint from U+0000 to
    U+10FFFF (see the legend at the top of this module), zlib-compressed.

    If you run this, run it in Python 3.3 or later. It will run in earlier
    versions, but you won't get the current Unicode standard, leading to
    inconsistent behavior. To protect against this, running this in the
    wrong version of Python raises RuntimeError unless you pass
    `do_it_anyway=True`.

    Raises ValueError if a letter has a category other than Lu, Lt, Ll, Lo
    or Lm.
    """
    if sys.hexversion < 0x03030000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.3 or later."
        )

    # Letter categories outside the Latin range handled below.
    letter_classes = {
        'Lu': 'A',
        'Lt': 'A',
        'Ll': 'a',
        'Lo': 'C',
        'Lm': 'm',
    }
    # Non-letter, non-mark categories that map directly to one class.
    simple_classes = {
        'No': 'N',
        'Sm': '0',
        'Sc': '1',
        'Sk': '2',
        'So': '3',
        'Cn': '_',
        'Cc': 'X',
        'Cs': 'S',
        'Co': 'P',
    }

    cclasses = [None] * 0x110000
    for codepoint in range(0x0, 0x110000):
        char = unichr(codepoint)
        category = unicodedata.category(char)

        if category.startswith('L'):  # letters
            # Some letters have no name in the Unicode database that Python
            # ships; without a default, `unicodedata.name` would raise
            # ValueError for them. A nameless letter is simply not Latin.
            is_latin = unicodedata.name(char, '').startswith('LATIN')
            if is_latin and codepoint < 0x200:
                cclasses[codepoint] = 'L' if category == 'Lu' else 'l'
            elif category in letter_classes:
                # non-Latin letter, or close enough
                cclasses[codepoint] = letter_classes[category]
            else:
                raise ValueError('got some weird kind of letter')
        elif category.startswith('M'):  # marks
            cclasses[codepoint] = 'M'
        elif category in simple_classes:
            cclasses[codepoint] = simple_classes[category]
        elif category.startswith('Z'):
            cclasses[codepoint] = ' '
        else:
            cclasses[codepoint] = 'o'

    # Tab, line feed, form feed and carriage return are control characters
    # by category, but are classed as whitespace here.
    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '

    # Use a context manager so the file is closed even if writing fails.
    with open('char_classes.dat', 'wb') as out:
        out.write(zlib.compress(''.join(cclasses).encode('ascii')))
|
||||
|
||||
if __name__ == '__main__':
|
||||
make_char_data_file()
|
Binary file not shown.
@ -1,81 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
This gives other modules access to the gritty details about characters and the
|
||||
encodings that use them.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
import zlib
|
||||
from pkg_resources import resource_string
|
||||
from ftfy.compatibility import unichr
|
||||
|
||||
# These are the five encodings we will try to fix in ftfy, in the
|
||||
# order that they should be tried.
|
||||
CHARMAP_ENCODINGS = [
|
||||
'latin-1',
|
||||
'sloppy-windows-1252',
|
||||
'macroman',
|
||||
'cp437',
|
||||
'sloppy-windows-1251',
|
||||
]
|
||||
|
||||
|
||||
def _build_regexes():
    """
    Build the dictionary that is stored in ENCODING_REGEXES.

    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.

    Returns a dict mapping 'ascii' and each name in CHARMAP_ENCODINGS to a
    compiled regex that matches strings made only of characters that the
    encoding can produce.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    # The bytes 0x80-0xff, obtained by encoding U+0080..U+00FF as Latin-1.
    # They are the same for every encoding, so build them once rather than
    # on each pass through the loop.
    latin1table = ''.join(unichr(i) for i in range(128, 256))
    high_bytes = latin1table.encode('latin-1')

    for encoding in CHARMAP_ENCODINGS:
        charlist = high_bytes.decode(encoding)

        # Build a regex from the ASCII range, followed by the decodings of
        # bytes 0x80-0xff in this character set. (This uses the fact that all
        # regex special characters are ASCII, and therefore won't appear in the
        # string.)
        regex = '^[\x00-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
|
||||
ENCODING_REGEXES = _build_regexes()
|
||||
|
||||
|
||||
def possible_encoding(text, encoding):
    """
    Check whether `text` could have been decoded from the single-byte
    encoding named `encoding`.

    Put another way: check whether the text can be encoded in that
    encoding, possibly sloppily. `encoding` must be a key of
    ENCODING_REGEXES.
    """
    pattern = ENCODING_REGEXES[encoding]
    return pattern.match(text) is not None
|
||||
|
||||
|
||||
CHAR_CLASS_STRING = zlib.decompress(
|
||||
resource_string(__name__, 'char_classes.dat')
|
||||
).decode('ascii')
|
||||
|
||||
def chars_to_classes(string):
    """
    Replace every Unicode character in `string` with the letter that names
    the class it belongs to.

    See build_data.py for where this data comes from and what it means.
    """
    class_table = CHAR_CLASS_STRING
    return string.translate(class_table)
|
||||
|
||||
|
||||
# A translate mapping that will strip all C0 control characters except
|
||||
# those that represent whitespace.
|
||||
# Start with every C0 control character (codepoints 0-31) mapped to None,
# which makes `translate` delete it.
CONTROL_CHARS = dict.fromkeys(range(32))

# Take the whitespace control characters back out of the mapping, so that
# `translate` leaves them alone.
for char in '\t\n\f\r':
    CONTROL_CHARS.pop(ord(char))
|
@ -1,34 +0,0 @@
|
||||
"""
|
||||
A simple command-line utility for fixing text found in a file.
|
||||
|
||||
Because files do not come with their encoding marked, it first runs the file
|
||||
through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
|
||||
"""
|
||||
from ftfy import fix_file
|
||||
|
||||
import sys
|
||||
ENCODE_STDOUT = (sys.hexversion < 0x03000000)
|
||||
|
||||
|
||||
def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)

    Takes one command-line argument, the name of the file to transcode,
    and writes each line produced by `fix_file` to standard output. When
    ENCODE_STDOUT is true, lines are encoded as UTF-8 before being written.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='file to transcode')
    args = parser.parse_args()

    # Use a context manager so the input file is closed when we finish, even
    # if fixing or writing a line raises an error.
    with open(args.filename) as infile:
        for line in fix_file(infile):
            if ENCODE_STDOUT:
                sys.stdout.write(line.encode('utf-8'))
            else:
                sys.stdout.write(line)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -1,79 +0,0 @@
|
||||
"""
|
||||
Makes some function names and behavior consistent between Python 2 and
|
||||
Python 3, and also between narrow and wide builds.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
import sys
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
if sys.hexversion >= 0x03000000:
|
||||
from html import entities
|
||||
unichr = chr
|
||||
xrange = range
|
||||
PYTHON2 = False
|
||||
else:
|
||||
import htmlentitydefs as entities
|
||||
unichr = unichr
|
||||
xrange = xrange
|
||||
PYTHON2 = True
|
||||
htmlentitydefs = entities
|
||||
|
||||
PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)
|
||||
|
||||
|
||||
def _narrow_unichr_workaround(codepoint):
    """
    A replacement for unichr() on narrow builds of Python. This will get
    us the narrow representation of an astral character, which will be
    a string of length two, containing two UTF-16 surrogates.

    `codepoint` is the integer codepoint. The result is produced by
    decoding a `\\UXXXXXXXX` escape with the 'unicode-escape' codec.
    """
    # Format the escape as text and then encode it, instead of applying `%`
    # to a bytes literal: %-formatting of bytes does not exist on Python
    # 3.0-3.4, which includes narrow Python 3 builds.
    escaped = ('\\U%08x' % codepoint).encode('ascii')
    return escaped.decode('unicode-escape')
|
||||
|
||||
|
||||
if sys.maxunicode < 0x10000:
|
||||
unichr = _narrow_unichr_workaround
|
||||
# In a narrow build of Python, we can't write a regex involving astral
|
||||
# characters. If we want to write the regex:
|
||||
#
|
||||
# [\U00100000-\U0010ffff]
|
||||
#
|
||||
# The actual string that defines it quietly turns into:
|
||||
#
|
||||
# [\udbc0\udc00-\udbff\udfff]
|
||||
#
|
||||
# And now the range operator only applies to the middle two characters.
|
||||
# It looks like a range that's going backwards from \dc00 to \dbff,
|
||||
# which is an error.
|
||||
#
|
||||
# What we can do instead is rewrite the expression to be _about_ the two
|
||||
# surrogates that make up the astral characters, instead of the characters
|
||||
# themselves. This would be wrong on a wide build, but it works on a
|
||||
# narrow build.
|
||||
UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
|
||||
else:
|
||||
UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')
|
||||
|
||||
|
||||
def bytes_to_ints(bytestring):
    """
    Turn a bytestring into a sequence of integers, whatever version of
    Python this is.

    On Python 3 the 'bytes' object already _is_ a sequence of integers, so
    it is returned as is. On Python 2 a list of the byte values is built.
    """
    if not PYTHON2:
        return bytestring
    return [ord(byte) for byte in bytestring]
|
||||
|
||||
|
||||
def is_printable(char):
    """
    Tell whether `char` is printable.

    str.isprintable() is new in Python 3. It's useful in `explain_unicode`,
    so on Python 2 we make a crude approximation: a character is printable
    unless its Unicode category starts with 'C'.
    """
    if not PYTHON2:
        return char.isprintable()
    category = unicodedata.category(char)
    return not category.startswith('C')
|
@ -1,473 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
This module contains the individual fixes that the main fix_text function
|
||||
can perform.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from ftfy.chardata import (possible_encoding,
|
||||
CHARMAP_ENCODINGS, CONTROL_CHARS)
|
||||
from ftfy.badness import text_cost
|
||||
from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
|
||||
import re
|
||||
import sys
|
||||
import codecs
|
||||
|
||||
|
||||
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
|
||||
|
||||
ftfy is designed to fix problems that were introduced by handling Unicode
|
||||
incorrectly. It might be able to fix the bytes you just handed it, but the
|
||||
fact that you just gave a pile of bytes to a function that fixes text means
|
||||
that your code is *also* handling Unicode incorrectly.
|
||||
|
||||
ftfy takes Unicode text as input. You should take these bytes and decode
|
||||
them from the encoding you think they are in. If you're not sure what encoding
|
||||
they're in:
|
||||
|
||||
- First, try to find out. 'utf-8' is a good assumption.
|
||||
- If the encoding is simply unknowable, try running your bytes through
|
||||
ftfy.guess_bytes. As the name implies, this may not always be accurate.
|
||||
|
||||
If you're confused by this, please read the Python Unicode HOWTO:
|
||||
|
||||
http://docs.python.org/%d/howto/unicode.html
|
||||
""" % sys.version_info[0]
|
||||
|
||||
|
||||
def fix_text_encoding(text):
    r"""
    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.

    All over the place in real-world text, you will find text that was
    encoded as UTF-8, mistakenly decoded in some single-byte format such as
    Latin-1 or Windows codepage 1252, and then encoded as UTF-8 again. Your
    perfectly good Unicode-aware code then ends up with garbage because
    someone else (or maybe "someone else") made a mistake.

    This function looks for evidence of that having happened. When it finds
    nonsense sequences of single-byte characters that were really meant to
    be UTF-8, it turns them back into the Unicode characters they were meant
    to represent.

    The input must be Unicode. If you don't have Unicode text, you're not
    using the right tool to solve your problem.

    .. note::
        The following examples are written using unmarked literal strings,
        but they are Unicode text. In Python 2 we have "unicode_literals"
        turned on, and in Python 3 this is always the case.

    ftfy decodes text that looks like it was decoded incorrectly. It leaves
    alone text that doesn't.

        >>> print(fix_text_encoding('Ãºnico'))
        único

        >>> print(fix_text_encoding('This text is fine already :þ'))
        This text is fine already :þ

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

        >>> print(fix_text_encoding('This â€” should be an em dash'))
        This — should be an em dash

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    0x81 that have no mapping in Windows. This is a string that Python's
    standard `.encode` and `.decode` methods cannot correct.

        >>> print(fix_text_encoding('This text is sad .â\x81”.'))
        This text is sad .⁔.

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

        >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
        not such a fan of Charlotte Brontë…”

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoded, and expecting the encoding to
    be consistent:

        >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
        AHÅ™, the new sofa from IKEA®

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

        >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
        This text was never UTF-8 at all…

    The best version of the text is found using
    :func:`ftfy.badness.text_cost`.
    """
    # Only the fixed text is wanted here; the plan that explains the fix is
    # discarded.
    return fix_encoding_and_explain(text)[0]
|
||||
|
||||
|
||||
def fix_encoding_and_explain(text):
    """
    Re-decode text that has been decoded incorrectly, and also return a
    "plan" listing all the steps that were required to fix it.

    Fixing steps are applied repeatedly until one leaves the text unchanged.
    The version returned is the cheapest one seen along the way, according
    to `text_cost` plus the penalties described below; if no step lowers the
    cost, the original text and an empty plan are returned.

    To fix similar text in the same way, without having to detect anything,
    you can use the ``apply_plan`` function.
    """
    best_version, best_cost, best_plan = text, text_cost(text), []
    steps_taken = []

    while True:
        previous = text
        text, new_steps = fix_one_step_and_explain(text)
        steps_taken.extend(new_steps)
        cost = text_cost(text)

        # Penalize particularly obsolete encodings, so that we won't use
        # them unless they can successfully replace multiple characters.
        used_obsolete = (
            ('encode', 'macroman') in steps_taken
            or ('encode', 'cp437') in steps_taken
        )
        if used_obsolete:
            cost += 2

        # We need pretty solid evidence to decode from Windows-1251
        # (Cyrillic).
        if ('encode', 'sloppy-windows-1251') in steps_taken:
            cost += 5

        if cost < best_cost:
            best_cost, best_version = cost, text
            best_plan = list(steps_taken)

        if text == previous:
            # The last step changed nothing, so there is nothing more to try.
            return best_version, best_plan
|
||||
|
||||
|
||||
def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded incorrectly.

    Returns the decoded text, plus a "plan" for how to reproduce what it
    did. The plan is a list of ('encode' or 'decode', encoding name) tuples,
    and is empty when the text is returned unchanged.

    Raises UnicodeError if `text` is a bytes object rather than Unicode text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
                    decoding = 'utf-8-variants'
                fixed = encoded_bytes.decode(decoding)
                steps = [('encode', encoding), ('decode', decoding)]
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        # The names collected above come from CHARMAP_ENCODINGS, where the
        # Windows-1252 entry is called 'sloppy-windows-1252'. Testing for
        # plain 'windows-1252' could never succeed.
        if 'sloppy-windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # Those cases are somewhat rare, and impossible to solve without false
    # positives. If you're in one of these situations, you should try using
    # the `ftfy.guess_bytes` function.

    # Return the text unchanged; the plan is empty.
    return text, []
|
||||
|
||||
|
||||
def apply_plan(text, plan):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, encoding), where
    `operation` is either 'encode' or 'decode', and `encoding` is an encoding
    name such as 'utf-8' or 'latin-1'.

    Because only text can be encoded, and only bytes can be decoded, the plan
    should alternate 'encode' and 'decode' steps, or else this function will
    encounter an error.

    Returns the result of the last step, or `text` itself if the plan is
    empty. Raises ValueError for an operation other than 'encode' or
    'decode'.
    """
    result = text
    for step in plan:
        operation, encoding = step
        if operation == 'encode':
            convert = result.encode
        elif operation == 'decode':
            convert = result.decode
        else:
            raise ValueError("Unknown plan step: %s" % operation)
        result = convert(encoding)
    return result
|
||||
|
||||
|
||||
HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")
|
||||
|
||||
|
||||
def unescape_html(text):
    """
    Decode all three types of HTML entities/character references: decimal
    (`&#38;`), hexadecimal (`&#x26;`), and named (`&amp;`).

    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
    to it for efficiency: it won't match entities longer than 8 characters,
    because there are no valid entities like that.

    Anything that looks like an entity but can't be decoded is left as is.

        >>> print(unescape_html('&lt;tag&gt;'))
        <tag>
    """
    def fixup(match):
        """
        Replace one matched HTML entity with the character it represents,
        if possible.
        """
        entity = match.group(0)

        if entity[:2] == "&#":
            # Numeric character reference: hexadecimal after "&#x",
            # decimal otherwise.
            if entity[:3] == "&#x":
                digits, base = entity[3:-1], 16
            else:
                digits, base = entity[2:-1], 10
            try:
                return unichr(int(digits, base))
            except ValueError:
                return entity  # leave as is

        # Named entity.
        codepoint = htmlentitydefs.name2codepoint.get(entity[1:-1])
        if codepoint is None:
            return entity  # leave as is
        return unichr(codepoint)

    return HTML_ENTITY_RE.sub(fixup, text)
|
||||
|
||||
|
||||
ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')
|
||||
|
||||
def remove_terminal_escapes(text):
    r"""
    Remove "ANSI" terminal escape sequences from text, such as the ones
    that produce colored text on Unix.

        >>> print(remove_terminal_escapes(
        ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
        ... ))
        I'm blue, da ba dee da ba doo...
    """
    without_escapes = ANSI_RE.sub('', text)
    return without_escapes
|
||||
|
||||
|
||||
# Character ranges U+2018..U+201B and U+201C..U+201F respectively.
SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')


def uncurl_quotes(text):
    r"""
    Turn curly quotation marks into their straight ASCII counterparts.

    Characters matched by DOUBLE_QUOTE_RE become '"' and characters matched
    by SINGLE_QUOTE_RE become "'".

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    straight_doubles = DOUBLE_QUOTE_RE.sub('"', text)
    return SINGLE_QUOTE_RE.sub("'", straight_doubles)
|
||||
|
||||
|
||||
def fix_line_breaks(text):
    r"""
    Normalize every kind of line break in `text` to the Unix line feed.

    The following sequences are each replaced by a single \n:

    - CRLF (\r\n), used on Windows and in some communication protocols
    - CR (\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\u2028) and PARAGRAPH SEPARATOR (\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\x85), a C1 control character that is certainly not what
      you meant

    NEXT LINE is an odd case: it usually won't show up if `fix_encoding` is
    also being run, because \x85 is very common mojibake for \u2026,
    HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    # CRLF must be handled before a lone CR so that it yields one line break,
    # not two.
    for line_break in ('\r\n', '\r', '\u2028', '\u2029', '\u0085'):
        text = text.replace(line_break, '\n')
    return text
|
||||
|
||||
|
||||
def remove_control_chars(text):
    """
    Strip control characters from `text`, keeping the ones that matter.

    The work is done by translating `text` through the CONTROL_CHARS table.
    As originally documented, the characters removed are:

    - U+0000 to U+0008
    - U+000B
    - U+000E to U+001F
    - U+007F

    and these common formatting characters are left alone:

    - TAB (U+0009)
    - LF (U+000A)
    - FF (U+000C)
    - CR (U+000D)
    """
    cleaned = text.translate(CONTROL_CHARS)
    return cleaned
|
||||
|
||||
|
||||
def remove_bom(text):
    r"""
    Strip a left-over byte-order mark (U+FEFF) from the start of `text`.

    Every consecutive leading U+FEFF is removed; occurrences elsewhere in the
    text are not touched.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    byte_order_mark = unichr(0xfeff)
    return text.lstrip(byte_order_mark)
|
||||
|
||||
|
||||
def remove_unsafe_private_use(text):
    r"""
    Delete every match of UNSAFE_PRIVATE_USE_RE from `text`.

    Python 3.3's Unicode support isn't perfect: certain string operations
    crash some versions of it with a SystemError
    (http://bugs.python.org/issue18183).

    The workaround is to remove all characters from Supplementary Private Use
    Area B (U+100000 to U+10FFFF), using a regex that is known not to crash
    given those characters. Losing a whole plane of Unicode is sad, but these
    characters are not assigned and never will be. If you get one of them and
    don't know what its purpose is, its purpose is probably to crash your
    code.

    If you were using these for actual private use, this might be
    inconvenient. You can turn off this fixer, but using Supplementary
    Private Use Area A instead is encouraged.

    >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
    💩

    This fixer is off by default in Python 3.4 or later. (The bug is actually
    fixed in 3.3.3 and 2.7.6, but the default behavior should not change
    based on a micro version upgrade of Python.)
    """
    safe_text = UNSAFE_PRIVATE_USE_RE.sub('', text)
    return safe_text
|
||||
|
||||
|
||||
# Define a regex to match valid escape sequences in Python string literals.
# The hex escapes require real hexadecimal digits, so text that merely looks
# like the start of an escape (such as the backslash-u in a Windows path like
# C:\users) is left alone instead of being handed to the codec, which would
# reject it with a UnicodeDecodeError.
ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U[0-9A-Fa-f]{8}  # 8-digit hex escapes
    | \\u[0-9A-Fa-f]{4}  # 4-digit hex escapes
    | \\x[0-9A-Fa-f]{2}  # 2-digit hex escapes
    | \\[0-7]{1,3}       # Octal escapes
    | \\N\{[^}]+\}       # Unicode characters by name
    | \\[\\'"abfnrtv]    # Single-character escapes
    )''', re.UNICODE | re.VERBOSE)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. Well-formed escape
    sequences are decoded the same way the Python interpreter decodes them in
    its string literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone.
    The hex, octal and single-character escapes consist only of ASCII
    characters, which allows "unicode-escape" to work correctly on them.

    A backslash followed by "x", "u" or "U" is only treated as an escape when
    it is followed by the full number of hexadecimal digits; anything else is
    left in the text unchanged rather than raising an error.

    This fix cannot be automatically applied by the `ftfy.fix_text` function,
    because escaped text is not necessarily a mistake, and there is no way
    to distinguish text that's supposed to be escaped from text that isn't.

    Raises whatever the "unicode-escape" codec raises for a matched sequence
    that it cannot decode, such as an unknown character name in \\N{...}.
    """
    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), 'unicode-escape')

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
|
@ -1,39 +0,0 @@
|
||||
"""
|
||||
This file defines a general method for evaluating ftfy using data that arrives
|
||||
in a stream. A concrete implementation of it is found in `twitter_tester.py`.
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals
|
||||
from ftfy.fixes import fix_text_encoding
|
||||
from ftfy.chardata import possible_encoding
|
||||
|
||||
|
||||
class StreamTester:
    """
    Consume texts one at a time and report the ones ftfy would change.

    Progress is also printed periodically, including how many of the texts
    seen so far were changed.
    """
    def __init__(self):
        # Number of texts that fix_text_encoding altered.
        self.num_fixed = 0
        # Number of texts passed to check_ftfy so far.
        self.count = 0

    def check_ftfy(self, text):
        """
        Check whether `ftfy.fix_text_encoding` would change `text`, and
        print the before and after versions if it does.

        Texts that `possible_encoding` reports as possibly ASCII are counted
        but not run through the fixer.
        """
        self.count += 1

        if not possible_encoding(text, 'ascii'):
            repaired = fix_text_encoding(text)
            if text != repaired:
                # possibly filter common bots before printing
                report = u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                    text=text, fixed=repaired
                )
                print(report)
                self.num_fixed += 1

        # Print status updates once in a while: a dot every 100 texts and a
        # summary line every 10000.
        seen = self.count
        if seen % 100 == 0:
            print('.', end='', flush=True)
        if seen % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, seen))
|
@ -1,73 +0,0 @@
|
||||
# coding: utf-8
|
||||
"""
|
||||
Do what is necessary to authenticate this tester as a Twitter "app", using
|
||||
somebody's Twitter account.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
import os
|
||||
|
||||
|
||||
# Where the user's OAuth token is cached between runs.
AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')


def get_auth():
    """
    Build the `OAuth` object that authenticates this tester as a Twitter
    "app" acting for a particular user.

    The user's token is read from AUTH_TOKEN_PATH when that file exists.
    Otherwise the directory for it is created if needed and `oauth_dance` is
    run with that path as its token file.

    About the "consumer secret": Twitter requires that the app knows this
    token while the user of the app does not. That is impossible, and
    equivalent to DRM, because a computer cannot use secret information while
    hiding it from its owner. The supposed threat is that someone with the
    token could impersonate the app, but anyone with the source code can
    already change what the app does, and still has to log in as a real user
    whose token is actually secret. These "secrets" are not secret in any
    cryptographic sense, and the tokens of every popular Twitter app are
    already posted on the Web. Twitter wants us to pretend the string can be
    kept secret and hide it behind a fig leaf like everybody else does, so
    that's what is done here.
    """
    from twitter.oauth import OAuth
    from twitter import oauth_dance, read_token_file

    def unhide(secret):
        """
        Do something mysterious and exactly as secure as every other Twitter
        app: shift every character of `secret` down by 0x2800.
        """
        return ''.join(chr(ord(symbol) - 0x2800) for symbol in secret)

    fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
    consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'
    consumer_secret = unhide(fig_leaf)

    if not os.path.exists(AUTH_TOKEN_PATH):
        # No cached token yet: make sure the cache directory exists, then
        # run the interactive authorization.
        authdir = os.path.dirname(AUTH_TOKEN_PATH)
        if not os.path.exists(authdir):
            os.makedirs(authdir)
        token, token_secret = oauth_dance(
            app_name='ftfy-tester',
            consumer_key=consumer_key,
            consumer_secret=consumer_secret,
            token_filename=AUTH_TOKEN_PATH
        )
    else:
        token, token_secret = read_token_file(AUTH_TOKEN_PATH)

    return OAuth(
        token=token,
        token_secret=token_secret,
        consumer_key=consumer_key,
        consumer_secret=consumer_secret
    )
|
||||
|
@ -1,89 +0,0 @@
|
||||
"""
|
||||
Implements a StreamTester that runs over Twitter data. See the class
|
||||
docstring.
|
||||
|
||||
This module is written for Python 3 only. The __future__ imports you see here
|
||||
are just to let Python 2 scan the file without crashing with a SyntaxError.
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from ftfy.streamtester import StreamTester
|
||||
|
||||
|
||||
class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (imported from `ftfy.streamtester`)
    to evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

    - Success: the process takes in mojibake and outputs correct text.
    - False positive: the process takes in correct text, and outputs
      mojibake. Every false positive should be considered a bug, and
      reported on GitHub if it isn't already.
    - Confusion: the process takes in mojibake and outputs different
      mojibake. Not a great outcome, but not as dire as a false
      positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """
    # Directory that receives one "tweets.<lang>.txt" log file per language.
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        # Tweet texts collected since the last save, keyed by the language
        # reported for the tweeting user.
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.

        Each collected text is appended to the file for its language as one
        line, with embedded newlines replaced by spaces. The in-memory buffer
        is emptied afterwards.
        """
        if not os.path.exists(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            # The context manager closes the file even if a write fails.
            with open(fullname, 'a') as langfile:
                for line in lines:
                    print(line.replace('\n', ' '), file=langfile)
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.
        """
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth
        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            # Stream items without a 'text' key are skipped.
            if 'text' in tweet:
                self.check_ftfy(tweet['text'])
                if 'user' in tweet:
                    lang = tweet['user'].get('lang', 'NONE')
                    self.lines_by_lang[lang].append(tweet['text'])
                # Flush the collected tweets to disk once every 10000 texts.
                if self.count % 10000 == 100:
                    self.save_files()
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: create a TwitterTester and run it on the
    Twitter sample stream forever, or at least until the stream drops.
    """
    TwitterTester().run_sample()


if __name__ == '__main__':
    main()
|
||||
|
@ -17,53 +17,71 @@
|
||||
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import traceback
|
||||
|
||||
import sickbeard
|
||||
from sickbeard import logger
|
||||
|
||||
import ftfy
|
||||
import ftfy.bad_codecs
|
||||
import six
|
||||
import chardet
|
||||
|
||||
|
||||
# This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8
|
||||
# encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions
|
||||
# which return something should always return unicode.
|
||||
|
||||
def fixStupidEncodings(x, silent=False):
|
||||
if type(x) == str:
|
||||
try:
|
||||
return str(ftfy.fix_text(u'' + x)).decode(sickbeard.SYS_ENCODING)
|
||||
except UnicodeDecodeError:
|
||||
logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
|
||||
def toUnicode(x):
|
||||
try:
|
||||
if isinstance(x, unicode):
|
||||
return x
|
||||
except UnicodeEncodeError:
|
||||
logger.log(u"Unable to encode value: " + repr(x), logger.ERROR)
|
||||
return x
|
||||
elif type(x) == unicode:
|
||||
return x
|
||||
else:
|
||||
logger.log(
|
||||
u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
|
||||
logger.DEBUG if silent else logger.ERROR)
|
||||
else:
|
||||
try:
|
||||
return six.text_type(x)
|
||||
except:
|
||||
try:
|
||||
if chardet.detect(x).get('encoding') == 'utf-8':
|
||||
return x.decode('utf-8')
|
||||
if isinstance(x, str):
|
||||
try:
|
||||
return x.decode(sickbeard.SYS_ENCODING)
|
||||
except UnicodeDecodeError:
|
||||
raise
|
||||
return x
|
||||
except:
|
||||
raise
|
||||
except:
|
||||
logger.log('Unable to decode value "%s..." : %s ' % (repr(x)[:20], traceback.format_exc()), logger.WARNING)
|
||||
ascii_text = str(x).encode('string_escape')
|
||||
return toUnicode(ascii_text)
|
||||
|
||||
def ss(x):
    """
    Convert `x` to a byte string in the system encoding.

    `x` is first passed through `toUnicode`. The result is encoded with
    `sickbeard.SYS_ENCODING`; if that fails, a warning is logged and the
    encoding is retried with unencodable characters replaced, and finally
    with UTF-8 if the system encoding cannot be used at all.
    """
    u_x = toUnicode(x)

    try:
        return u_x.encode(sickbeard.SYS_ENCODING)
    except Exception as e:
        logger.log('Failed ss encoding char, force UTF8: %s' % e, logger.WARNING)
        try:
            return u_x.encode(sickbeard.SYS_ENCODING, 'replace')
        except Exception:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt and SystemExit are never swallowed here.
            return u_x.encode('utf-8', 'replace')
|
||||
|
||||
def fixListEncodings(x):
|
||||
if type(x) != list and type(x) != tuple:
|
||||
if not isinstance(x, (list, tuple)):
|
||||
return x
|
||||
else:
|
||||
return filter(lambda x: x != None, map(fixStupidEncodings, x))
|
||||
return filter(lambda x: x != None, map(toUnicode, x))
|
||||
|
||||
|
||||
def ek(func, *args, **kwargs):
|
||||
if os.name == 'nt':
|
||||
result = func(*args, **kwargs)
|
||||
else:
|
||||
result = func(
|
||||
*[fixStupidEncodings(x).encode(sickbeard.SYS_ENCODING) if type(x) in (str, unicode) else x for x in args],
|
||||
**kwargs)
|
||||
result = func(*[ss(x) if isinstance(x, (str, unicode)) else x for x in args], **kwargs)
|
||||
|
||||
if type(result) in (list, tuple):
|
||||
if isinstance(result, (list, tuple)):
|
||||
return fixListEncodings(result)
|
||||
elif type(result) == str:
|
||||
return fixStupidEncodings(result)
|
||||
elif isinstance(result, str):
|
||||
return toUnicode(result)
|
||||
else:
|
||||
return result
|
||||
|
@ -16,7 +16,7 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
def ex(e):
|
||||
"""
|
||||
@ -32,11 +32,11 @@ def ex(e):
|
||||
|
||||
if arg is not None:
|
||||
if isinstance(arg, (str, unicode)):
|
||||
fixed_arg = fixStupidEncodings(arg, True)
|
||||
fixed_arg = toUnicode(arg, True)
|
||||
|
||||
else:
|
||||
try:
|
||||
fixed_arg = u"error " + fixStupidEncodings(str(arg), True)
|
||||
fixed_arg = u"error " + toUnicode(str(arg), True)
|
||||
|
||||
except:
|
||||
fixed_arg = None
|
||||
|
@ -26,7 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
|
||||
from sickbeard.history import dateFormat
|
||||
from sickbeard.common import Quality
|
||||
from sickbeard.common import WANTED, FAILED
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
def prepareFailedName(release):
|
||||
"""Standardizes release name for failed DB"""
|
||||
@ -36,7 +36,7 @@ def prepareFailedName(release):
|
||||
fixed = fixed.rpartition(".")[0]
|
||||
|
||||
fixed = re.sub("[\.\-\+\ ]", "_", fixed)
|
||||
fixed = fixStupidEncodings(fixed)
|
||||
fixed = toUnicode(fixed)
|
||||
|
||||
return fixed
|
||||
|
||||
|
@ -20,7 +20,7 @@ import db
|
||||
import datetime
|
||||
|
||||
from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
|
||||
dateFormat = "%Y%m%d%H%M%S"
|
||||
@ -28,7 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"
|
||||
|
||||
def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
|
||||
logDate = datetime.datetime.today().strftime(dateFormat)
|
||||
resource = fixStupidEncodings(resource)
|
||||
resource = toUnicode(resource)
|
||||
|
||||
myDB = db.DBConnection()
|
||||
myDB.action(
|
||||
|
@ -29,7 +29,7 @@ import sickbeard
|
||||
|
||||
from sickbeard import logger, common
|
||||
from sickbeard import db
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
from sickbeard.exceptions import ex
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ class EmailNotifier:
|
||||
ep_name: The name of the episode that was snatched
|
||||
title: The title of the notification (optional)
|
||||
"""
|
||||
ep_name = fixStupidEncodings(ep_name)
|
||||
ep_name = toUnicode(ep_name)
|
||||
|
||||
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
|
||||
show = self._parseEp(ep_name)
|
||||
@ -86,7 +86,7 @@ class EmailNotifier:
|
||||
ep_name: The name of the episode that was downloaded
|
||||
title: The title of the notification (optional)
|
||||
"""
|
||||
ep_name = fixStupidEncodings(ep_name)
|
||||
ep_name = toUnicode(ep_name)
|
||||
|
||||
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
|
||||
show = self._parseEp(ep_name)
|
||||
@ -121,7 +121,7 @@ class EmailNotifier:
|
||||
ep_name: The name of the episode that was downloaded
|
||||
lang: Subtitle language wanted
|
||||
"""
|
||||
ep_name = fixStupidEncodings(ep_name)
|
||||
ep_name = toUnicode(ep_name)
|
||||
|
||||
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
|
||||
show = self._parseEp(ep_name)
|
||||
@ -198,7 +198,7 @@ class EmailNotifier:
|
||||
return False
|
||||
|
||||
def _parseEp(self, ep_name):
|
||||
ep_name = fixStupidEncodings(ep_name)
|
||||
ep_name = toUnicode(ep_name)
|
||||
|
||||
sep = " - "
|
||||
titles = ep_name.split(sep)
|
||||
|
@ -25,7 +25,7 @@ import sickbeard
|
||||
from sickbeard import logger
|
||||
from sickbeard import common
|
||||
from sickbeard.exceptions import ex
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
from sickbeard.notifiers.xbmc import XBMCNotifier
|
||||
|
||||
|
@ -26,7 +26,7 @@ import sickbeard
|
||||
from sickbeard import logger
|
||||
from sickbeard import common
|
||||
from sickbeard.exceptions import ex
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
|
||||
try:
|
||||
@ -236,9 +236,9 @@ class XBMCNotifier:
|
||||
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
|
||||
authheader = "Basic %s" % base64string
|
||||
req.add_header("Authorization", authheader)
|
||||
logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
|
||||
logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
|
||||
else:
|
||||
logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
|
||||
logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)
|
||||
|
||||
response = urllib2.urlopen(req)
|
||||
result = response.read().decode(sickbeard.SYS_ENCODING)
|
||||
@ -248,7 +248,7 @@ class XBMCNotifier:
|
||||
return result
|
||||
|
||||
except (urllib2.URLError, IOError), e:
|
||||
logger.log(u"Warning: Couldn't contact XBMC HTTP at " + fixStupidEncodings(url) + " " + ex(e),
|
||||
logger.log(u"Warning: Couldn't contact XBMC HTTP at " + toUnicode(url) + " " + ex(e),
|
||||
logger.WARNING)
|
||||
return False
|
||||
|
||||
@ -379,9 +379,9 @@ class XBMCNotifier:
|
||||
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
|
||||
authheader = "Basic %s" % base64string
|
||||
req.add_header("Authorization", authheader)
|
||||
logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
|
||||
logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
|
||||
else:
|
||||
logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
|
||||
logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)
|
||||
|
||||
try:
|
||||
response = urllib2.urlopen(req)
|
||||
@ -401,7 +401,7 @@ class XBMCNotifier:
|
||||
return False
|
||||
|
||||
except IOError, e:
|
||||
logger.log(u"Warning: Couldn't contact XBMC JSON API at " + fixStupidEncodings(url) + " " + ex(e),
|
||||
logger.log(u"Warning: Couldn't contact XBMC JSON API at " + toUnicode(url) + " " + ex(e),
|
||||
logger.WARNING)
|
||||
return False
|
||||
|
||||
|
@ -29,7 +29,7 @@ from sickbeard import encodingKludge as ek
|
||||
from sickbeard.exceptions import ex
|
||||
|
||||
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
|
||||
def getSeasonNZBs(name, urlData, season):
|
||||
@ -85,7 +85,7 @@ def createNZBString(fileElements, xmlns):
|
||||
for curFile in fileElements:
|
||||
rootElement.append(stripNS(curFile, xmlns))
|
||||
|
||||
return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))
|
||||
return xml.etree.ElementTree.tostring(toUnicode(rootElement))
|
||||
|
||||
|
||||
def saveNZB(nzbName, nzbString):
|
||||
|
@ -166,95 +166,86 @@ class SCCProvider(generic.TorrentProvider):
|
||||
if not self._doLogin():
|
||||
return []
|
||||
|
||||
data = []
|
||||
searchURLS = []
|
||||
|
||||
for mode in search_params.keys():
|
||||
for search_string in search_params[mode]:
|
||||
|
||||
if isinstance(search_string, unicode):
|
||||
search_string = unidecode(search_string)
|
||||
|
||||
nonsceneSearchURL = None
|
||||
foreignSearchURL = None
|
||||
if mode == 'Season':
|
||||
searchURL = self.urls['archive'] % (search_string)
|
||||
data = [self.getURL(searchURL)]
|
||||
searchURLS += [self.urls['archive'] % (search_string)]
|
||||
else:
|
||||
searchURL = self.urls['search'] % (search_string, self.categories)
|
||||
nonsceneSearchURL = self.urls['nonscene'] % (search_string)
|
||||
foreignSearchURL = self.urls['foreign'] % (search_string)
|
||||
data = [self.getURL(searchURL),
|
||||
self.getURL(nonsceneSearchURL),
|
||||
self.getURL(foreignSearchURL)]
|
||||
logger.log(u"Search string: " + nonsceneSearchURL, logger.DEBUG)
|
||||
logger.log(u"Search string: " + foreignSearchURL, logger.DEBUG)
|
||||
searchURLS += [self.urls['search'] % (search_string, self.categories)]
|
||||
searchURLS += [self.urls['nonscene'] % (search_string)]
|
||||
searchURLS += [self.urls['foreign'] % (search_string)]
|
||||
|
||||
logger.log(u"Search string: " + searchURL, logger.DEBUG)
|
||||
for searchURL in searchURLS:
|
||||
logger.log(u"Search string: " + searchURL, logger.DEBUG)
|
||||
data += [x for x in [self.getURL(searchURL)] if x]
|
||||
|
||||
if not data:
|
||||
if not len(data):
|
||||
continue
|
||||
|
||||
try:
|
||||
for dataItem in data:
|
||||
with BS4Parser(dataItem, features=["html5lib", "permissive"]) as html:
|
||||
torrent_table = html.find('table', attrs={'id': 'torrents-table'})
|
||||
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
|
||||
try:
|
||||
for dataItem in data:
|
||||
with BS4Parser(dataItem, features=["html5lib", "permissive"]) as html:
|
||||
torrent_table = html.find('table', attrs={'id': 'torrents-table'})
|
||||
torrent_rows = torrent_table.find_all('tr') if torrent_table else []
|
||||
|
||||
#Continue only if at least one Release is found
|
||||
if len(torrent_rows) < 2:
|
||||
if html.title:
|
||||
source = self.name + " (" + html.title.string + ")"
|
||||
#Continue only if at least one Release is found
|
||||
if len(torrent_rows) < 2:
|
||||
if html.title:
|
||||
source = self.name + " (" + html.title.string + ")"
|
||||
else:
|
||||
source = self.name
|
||||
logger.log(u"The Data returned from " + source + " does not contain any torrent", logger.DEBUG)
|
||||
continue
|
||||
|
||||
for result in torrent_table.find_all('tr')[1:]:
|
||||
|
||||
try:
|
||||
link = result.find('td', attrs={'class': 'ttr_name'}).find('a')
|
||||
all_urls = result.find('td', attrs={'class': 'td_dl'}).find_all('a', limit=2)
|
||||
# Foreign section contain two links, the others one
|
||||
if self._isSection('Foreign', dataItem):
|
||||
url = all_urls[1]
|
||||
else:
|
||||
source = self.name
|
||||
logger.log(u"The Data returned from " + source + " does not contain any torrent", logger.DEBUG)
|
||||
url = all_urls[0]
|
||||
|
||||
title = link.string
|
||||
if re.search('\.\.\.', title):
|
||||
data = self.getURL(self.url + "/" + link['href'])
|
||||
if data:
|
||||
with BS4Parser(data) as details_html:
|
||||
title = re.search('(?<=").+(?<!")', details_html.title.string).group(0)
|
||||
download_url = self.urls['download'] % url['href']
|
||||
id = int(link['href'].replace('details?id=', ''))
|
||||
seeders = int(result.find('td', attrs={'class': 'ttr_seeders'}).string)
|
||||
leechers = int(result.find('td', attrs={'class': 'ttr_leechers'}).string)
|
||||
except (AttributeError, TypeError):
|
||||
continue
|
||||
|
||||
for result in torrent_table.find_all('tr')[1:]:
|
||||
if mode != 'RSS' and (seeders < self.minseed or leechers < self.minleech):
|
||||
continue
|
||||
|
||||
try:
|
||||
link = result.find('td', attrs={'class': 'ttr_name'}).find('a')
|
||||
all_urls = result.find('td', attrs={'class': 'td_dl'}).find_all('a', limit=2)
|
||||
# Foreign section contain two links, the others one
|
||||
if self._isSection('Foreign', dataItem):
|
||||
url = all_urls[1]
|
||||
else:
|
||||
url = all_urls[0]
|
||||
if not title or not download_url:
|
||||
continue
|
||||
|
||||
title = link.string
|
||||
if re.search('\.\.\.', title):
|
||||
data = self.getURL(self.url + "/" + link['href'])
|
||||
if data:
|
||||
with BS4Parser(data) as details_html:
|
||||
title = re.search('(?<=").+(?<!")', details_html.title.string).group(0)
|
||||
download_url = self.urls['download'] % url['href']
|
||||
id = int(link['href'].replace('details?id=', ''))
|
||||
seeders = int(result.find('td', attrs={'class': 'ttr_seeders'}).string)
|
||||
leechers = int(result.find('td', attrs={'class': 'ttr_leechers'}).string)
|
||||
except (AttributeError, TypeError):
|
||||
continue
|
||||
item = title, download_url, id, seeders, leechers
|
||||
#logger.log(u"Found result: " + title + "(" + searchURL + ")", logger.DEBUG)
|
||||
|
||||
if mode != 'RSS' and (seeders < self.minseed or leechers < self.minleech):
|
||||
continue
|
||||
items[mode].append(item)
|
||||
|
||||
if not title or not download_url:
|
||||
continue
|
||||
# for each search mode sort all the items by seeders
|
||||
items[mode].sort(key=lambda tup: tup[3], reverse=True)
|
||||
results += items[mode]
|
||||
|
||||
item = title, download_url, id, seeders, leechers
|
||||
|
||||
if self._isSection('Non-Scene', dataItem):
|
||||
logger.log(u"Found result: " + title + "(" + nonsceneSearchURL + ")", logger.DEBUG)
|
||||
elif self._isSection('Foreign', dataItem):
|
||||
logger.log(u"Found result: " + title + "(" + foreignSearchURL + ")", logger.DEBUG)
|
||||
else:
|
||||
logger.log(u"Found result: " + title + "(" + searchURL + ")", logger.DEBUG)
|
||||
|
||||
items[mode].append(item)
|
||||
|
||||
except Exception, e:
|
||||
logger.log(u"Failed parsing " + self.name + " Traceback: " + traceback.format_exc(), logger.ERROR)
|
||||
|
||||
#For each search mode sort all the items by seeders
|
||||
items[mode].sort(key=lambda tup: tup[3], reverse=True)
|
||||
|
||||
results += items[mode]
|
||||
except Exception, e:
|
||||
logger.log(u"Failed parsing " + self.name + " Traceback: " + traceback.format_exc(), logger.ERROR)
|
||||
continue
|
||||
|
||||
return results
|
||||
|
||||
|
@ -27,7 +27,7 @@ from sickbeard import helpers
|
||||
from sickbeard import name_cache
|
||||
from sickbeard import logger
|
||||
from sickbeard import db
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
exception_dict = {}
|
||||
anidb_exception_dict = {}
|
||||
@ -234,7 +234,7 @@ def retrieve_exceptions():
|
||||
# if this exception isn't already in the DB then add it
|
||||
if cur_exception not in existing_exceptions:
|
||||
|
||||
cur_exception = fixStupidEncodings(cur_exception)
|
||||
cur_exception = toUnicode(cur_exception)
|
||||
|
||||
myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
|
||||
[cur_indexer_id, cur_exception, curSeason])
|
||||
@ -267,7 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
|
||||
exceptionsCache[indexer_id][season] = scene_exceptions
|
||||
|
||||
for cur_exception in scene_exceptions:
|
||||
cur_exception = fixStupidEncodings(cur_exception)
|
||||
cur_exception = toUnicode(cur_exception)
|
||||
|
||||
myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
|
||||
[indexer_id, cur_exception, season])
|
||||
|
@ -234,7 +234,7 @@ def isGoodResult(name, show, log=True, season=-1):
|
||||
|
||||
all_show_names = allPossibleShowNames(show, season=season)
|
||||
showNames = map(sanitizeSceneName, all_show_names) + all_show_names
|
||||
showNames += map(unidecode, all_show_names)
|
||||
showNames += map(ek.toUnicode, all_show_names)
|
||||
|
||||
for curName in set(showNames):
|
||||
if not show.is_anime:
|
||||
|
@ -33,7 +33,7 @@ from sickbeard.exceptions import AuthException
|
||||
from sickbeard.rssfeeds import RSSFeeds
|
||||
from sickbeard import clients
|
||||
from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
|
||||
from sickbeard.encodingKludge import fixStupidEncodings
|
||||
from sickbeard.encodingKludge import toUnicode
|
||||
|
||||
class CacheDBConnection(db.DBConnection):
|
||||
def __init__(self, providerName):
|
||||
@ -263,7 +263,7 @@ class TVCache():
|
||||
# get quality of release
|
||||
quality = parse_result.quality
|
||||
|
||||
name = fixStupidEncodings(name)
|
||||
name = toUnicode(name)
|
||||
|
||||
# get release group
|
||||
release_group = parse_result.release_group
|
||||
|
@ -3288,7 +3288,7 @@ class ErrorLogs(MainHandler):
|
||||
|
||||
for x in reversed(data):
|
||||
|
||||
x = ek.fixStupidEncodings(x)
|
||||
x = ek.toUnicode(x)
|
||||
match = re.match(regex, x)
|
||||
|
||||
if match:
|
||||
|
@ -18,23 +18,27 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import glob
|
||||
import unittest
|
||||
import sys
|
||||
|
||||
class AllTests(unittest.TestCase):
    """Aggregate the ``*_tests.py`` modules found by ``glob`` into one suite.

    ``setUp`` discovers and loads the modules; ``testAll`` runs them with a
    ``TextTestRunner`` and calls ``sys.exit(-1)`` when the run is not
    successful.
    """

    def setUp(self):
        """Collect matching test files and load them into ``self.testSuite``."""
        # Skip any file whose name occurs inside this module's own path so the
        # aggregate runner does not load itself. This is a substring test on
        # __file__, not an exact file-name comparison.
        self.test_file_strings = [x for x in glob.glob('*_tests.py') if x not in __file__]
        # Drop the trailing ".py" to turn each file name into a module name.
        self.module_strings = [file_string[:-3] for file_string in self.test_file_strings]
        self.suites = [unittest.defaultTestLoader.loadTestsFromName(module_string)
                       for module_string in self.module_strings]
        self.testSuite = unittest.TestSuite(self.suites)

    def testAll(self):
        """Print the included files, run the combined suite, exit -1 on failure."""
        # Single-argument print(...) calls produce the same output under the
        # Python 2 print statement and the Python 3 print function.
        print("==================")
        print("STARTING - ALL TESTS")
        print("==================")
        for includedfiles in self.test_file_strings:
            print("- " + includedfiles)

        text_runner = unittest.TextTestRunner().run(self.testSuite)
        if not text_runner.wasSuccessful():
            sys.exit(-1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import glob
|
||||
import unittest
|
||||
import sys
|
||||
|
||||
test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
|
||||
module_strings = [file_string[0:len(file_string) - 3] for file_string in test_file_strings]
|
||||
suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in module_strings]
|
||||
testSuite = unittest.TestSuite(suites)
|
||||
|
||||
print "=================="
|
||||
print "STARTING - ALL TESTS"
|
||||
print "=================="
|
||||
print "this will include"
|
||||
for includedfiles in test_file_strings:
|
||||
print "- " + includedfiles
|
||||
|
||||
text_runner = unittest.TextTestRunner().run(testSuite)
|
||||
if not text_runner.wasSuccessful():
|
||||
sys.exit(-1)
|
||||
unittest.main()
|
@ -8,7 +8,6 @@ sys.path.append(os.path.abspath('../lib'))
|
||||
|
||||
from sickbeard import common
|
||||
|
||||
|
||||
class QualityTests(unittest.TestCase):
|
||||
|
||||
# TODO: repack / proper ? air-by-date ? season rip? multi-ep?
|
||||
|
@ -51,7 +51,6 @@ EPISODE = 2
|
||||
FILENAME = u"show name - s0" + str(SEASON) + "e0" + str(EPISODE) + ".mkv"
|
||||
FILEDIR = os.path.join(TESTDIR, SHOWNAME)
|
||||
FILEPATH = os.path.join(FILEDIR, FILENAME)
|
||||
|
||||
SHOWDIR = os.path.join(TESTDIR, SHOWNAME + " final")
|
||||
|
||||
#sickbeard.logger.sb_log_instance = sickbeard.logger.SBRotatingLogHandler(os.path.join(TESTDIR, 'sickbeard.log'), sickbeard.logger.NUM_LOGS, sickbeard.logger.LOG_SIZE)
|
||||
|
Loading…
Reference in New Issue
Block a user