Mirror of https://github.com/moparisthebest/SickRage (synced 2024-11-04 08:25:04 -05:00)
Removed FTFY due to Python 2.6 compatibility issues.
Re-coded the encodingKludge encode/decode helpers for unicode <-> UTF-8.
parent 468af14dfd · commit 360c3afa08
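The replacement encode/decode helpers themselves are not part of this diff. As a rough, hedged sketch of the unicode <-> UTF-8 kludge the commit message describes (the function names and fallback behavior here are assumptions, not SickRage's actual encodingKludge API):

    # Illustrative sketch only -- not the code from this commit.
    def to_unicode(value, encoding='utf-8'):
        # Decode bytes to unicode; fall back to replacement characters
        # rather than raising on malformed input.
        if isinstance(value, bytes):
            return value.decode(encoding, 'replace')
        return value

    def to_utf8(value):
        # Encode unicode to UTF-8 bytes; pass byte strings through as-is.
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')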
ftfy/__init__.py
@@ -1,351 +0,0 @@
# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

from __future__ import unicode_literals

# See the docstring for ftfy.bad_codecs to see what we're doing here.
import ftfy.bad_codecs
ftfy.bad_codecs.ok()

from ftfy import fixes
from ftfy.fixes import fix_text_encoding
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
import unicodedata
import warnings


def fix_text(text,
             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True,
             max_decode_length=2**16):
    r"""
    Given Unicode text as input, make its representation consistent and
    possibly less broken.

    Let's start with some examples:

        >>> print(fix_text('ünicode'))
        ünicode

        >>> print(fix_text('Broken text&hellip; it&#x2019;s flubberific!'))
        Broken text... it's flubberific!

        >>> print(fix_text('HTML entities &lt;3'))
        HTML entities <3

        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
        <em>HTML entities &lt;3</em>

        >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
        ...                'doo&#133;\033[0m'))
        I'm blue, da ba dee da ba doo...

        >>> # This example string starts with a byte-order mark, even if
        >>> # you can't see it on the Web.
        >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
        Party like
        it's 1999!

        >>> len(fix_text('ﬁ' * 100000))
        200000

        >>> len(fix_text(''))
        0

    Based on the options you provide, ftfy applies these steps in order:

    - If `remove_unsafe_private_use` is True, it removes a range of private-use
      characters that could trigger a Python bug. The bug is fixed in
      the most recent versions of Python, so this will default to False
      starting on Python 3.4.
    - If `fix_entities` is True, replace HTML entities with their equivalent
      characters. If it's "auto" (the default), then consider replacing HTML
      entities, but don't do so in text where you have seen a pair of actual
      angle brackets (that's probably actually HTML and you shouldn't mess
      with the entities).
    - If `remove_terminal_escapes` is True, remove sequences of bytes that are
      instructions for Unix terminals, such as the codes that make text appear
      in different colors.
    - If `fix_encoding` is True, look for common mistakes that come from
      encoding or decoding Unicode text incorrectly, and fix them if they are
      reasonably fixable. See `fix_text_encoding` for details.
    - If `normalization` is not None, apply the specified form of Unicode
      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
      The default, 'NFKC', applies the following relevant transformations:

      - C: Combine characters and diacritics that are written using separate
        code points, such as converting "e" plus an acute accent modifier
        into "é", or converting "ka" (か) plus a dakuten into the
        single character "ga" (が).
      - K: Replace characters that are functionally equivalent with the most
        common form. For example, half-width katakana will be replaced with
        full-width versions, full-width Roman characters will be replaced with
        ASCII characters, ellipsis characters will be replaced with three
        periods, and the ligature 'ﬂ' will be replaced with 'fl'.

    - If `uncurl_quotes` is True, replace various curly quotation marks with
      plain-ASCII straight quotes.
    - If `fix_line_breaks` is true, convert all line breaks to Unix style
      (CRLF and CR line breaks become LF line breaks).
    - If `remove_control_chars` is true, remove all C0 control characters
      except the common useful ones: TAB, CR, LF, and FF. (CR characters
      may have already been removed by the `fix_line_breaks` step.)
    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
    - If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&amp;", for example, not "&".

    `fix_text` will work one line at a time, with the possibility that some
    lines are in different encodings. When it encounters lines longer than
    `max_decode_length`, it will not run the `fix_encoding` step, to avoid
    unbounded slowdowns.

    If you are certain your entire text is in the same encoding (though that
    encoding is possibly flawed), and do not mind performing operations on
    the whole text at once, use `fix_text_segment`.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find('\n', pos) + 1
        fix_encoding_this_time = fix_encoding
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > max_decode_length:
            fix_encoding_this_time = False

        substring = text[pos:textbreak]

        if fix_entities == 'auto' and '<' in substring and '>' in substring:
            # we see angle brackets together; this could be HTML
            fix_entities = False

        out.append(
            fix_text_segment(
                substring,
                remove_unsafe_private_use=remove_unsafe_private_use,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=fix_encoding_this_time,
                normalization=normalization,
                uncurl_quotes=uncurl_quotes,
                fix_line_breaks=fix_line_breaks,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom
            )
        )
        pos = textbreak

    return ''.join(out)

ftfy = fix_text


def fix_file(input_file,
             remove_unsafe_private_use=True,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True):
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then unfortunately, we have to guess what encoding it is. We'll try
    a few common encodings, but we make no promises. See the `guess_bytes`
    function for how this is done.

    The output is a stream of fixed lines of text.
    """
    entities = fix_entities
    for line in input_file:
        if isinstance(line, bytes):
            line, encoding = guess_bytes(line)
        if fix_entities == 'auto' and '<' in line and '>' in line:
            entities = False
        yield fix_text_segment(
            line,
            remove_unsafe_private_use=remove_unsafe_private_use,
            fix_entities=entities,
            remove_terminal_escapes=remove_terminal_escapes,
            fix_encoding=fix_encoding,
            normalization=normalization,
            uncurl_quotes=uncurl_quotes,
            fix_line_breaks=fix_line_breaks,
            remove_control_chars=remove_control_chars,
            remove_bom=remove_bom
        )


def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text


def guess_bytes(bstring):
    """
    If you have some bytes in an unknown encoding, here's a reasonable
    strategy for decoding them, by trying a few common encodings that
    can be distinguished from each other.

    This is not a magic bullet. If the bytes are coming from some MySQL
    database with the "character set" set to ISO Elbonian, this won't figure
    it out. Perhaps more relevantly, this currently doesn't try East Asian
    encodings.

    The encodings we try are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global de facto standard
    - "utf-8-variants", because it's what people actually implement when they
      think they're doing UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding
    """
    if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
        return bstring.decode('utf-16'), 'utf-16'

    byteset = set(bytes(bstring))
    byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'

    try:
        if byte_ed in byteset or byte_c0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode('utf-8-variants'), 'utf-8-variants'
        else:
            return bstring.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass

    if byte_CR in bstring and byte_LF not in bstring:
        return bstring.decode('macroman'), 'macroman'
    else:
        return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'


def explain_unicode(text):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 (       [Ps] LEFT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 °       [So] DEGREE SIGN
    U+25A1 □       [So] WHITE SQUARE
    U+00B0 °       [So] DEGREE SIGN
    U+0029 )       [Pe] RIGHT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020         [Zs] SPACE
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if is_printable(char):
            display = char
        else:
            display = char.encode('unicode-escape').decode('ascii')
        print('U+{code:04X} {display:<7} [{category}] {name}'.format(
            display=display,
            code=ord(char),
            category=unicodedata.category(char),
            name=unicodedata.name(char, '<unknown>')
        ))


def fix_bad_encoding(text):
    """
    Kept for compatibility with previous versions of ftfy.
    """
    warnings.warn(
        'fix_bad_encoding is now known as fix_text_encoding',
        DeprecationWarning
    )
    return fix_text_encoding(text)
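`guess_bytes` has no doctest of its own; a brief illustrative call, following the strategy described in its docstring (plain UTF-8 input, so the second branch applies):

    # Illustrative use of guess_bytes (not part of the original file).
    from ftfy import guess_bytes
    text, encoding = guess_bytes(b'\xe2\x80\x9cquoted\xe2\x80\x9d')
    print(encoding)   # 'utf-8'
    print(text)       # '“quoted”'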
ftfy/bad_codecs/__init__.py
@@ -1,94 +0,0 @@
# coding: utf-8
r"""
Give Python the ability to decode some common, flawed encodings.

Python does not want you to be sloppy with your text. Its encoders and decoders
("codecs") follow the relevant standards whenever possible, which means that
when you get text that *doesn't* follow those standards, you'll probably fail
to decode it. Or you might succeed at decoding it for implementation-specific
reasons, which is perhaps worse.

There are some encodings out there that Python wishes didn't exist, which are
widely used outside of Python:

- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
  ever-popular CESU-8 and "Java modified UTF-8".
- "Sloppy" versions of character map encodings, where bytes that don't map to
  anything will instead map to the Unicode character with the same number.

Simply importing this module, or in fact any part of the `ftfy` package, will
make these new "bad codecs" available to Python through the standard Codecs
API. You never have to actually call any functions inside `ftfy.bad_codecs`.

However, if you want to call something because your code checker insists on it,
you can call ``ftfy.bad_codecs.ok()``.

A quick example of decoding text that's encoded in CESU-8:

    >>> import ftfy.bad_codecs
    >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
    😍
"""
from __future__ import unicode_literals
from encodings import normalize_encoding
import codecs

_CACHE = {}

# Define some aliases for 'utf-8-variants'. All hyphens get turned into
# underscores, because of `normalize_encoding`.
UTF8_VAR_NAMES = (
    'utf_8_variants', 'utf8_variants',
    'utf_8_variant', 'utf8_variant',
    'utf_8_var', 'utf8_var',
    'cesu_8', 'cesu8',
    'java_utf_8', 'java_utf8'
)


def search_function(encoding):
    """
    Register our "bad codecs" with Python's codecs API. This involves adding
    a search function that takes in an encoding name, and returns a codec
    for that encoding if it knows one, or None if it doesn't.

    The encodings this will match are:

    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
      where the non-sloppy version is an encoding that leaves some bytes
      unmapped to characters.
    - The 'utf-8-variants' encoding, which has the several aliases seen
      above.
    """
    if encoding in _CACHE:
        return _CACHE[encoding]

    norm_encoding = normalize_encoding(encoding)
    codec = None
    if norm_encoding in UTF8_VAR_NAMES:
        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
        codec = CODEC_INFO
    elif norm_encoding.startswith('sloppy_'):
        from ftfy.bad_codecs.sloppy import CODECS
        codec = CODECS.get(norm_encoding)

    if codec is not None:
        _CACHE[encoding] = codec

    return codec


def ok():
    """
    A feel-good function that gives you something to call after importing
    this package.

    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
    and appear not to use it. It doesn't know that you're using it when
    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
    encodings.
    """
    pass


codecs.register(search_function)
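Once the module is imported, the registered search function resolves the new names through the standard codec registry; a small illustrative check (not part of the original file):

    import codecs
    import ftfy.bad_codecs  # registers the search function on import

    print(codecs.lookup('utf-8-variants').name)           # 'utf-8-variants'
    print(b'\x80\x81\x82'.decode('sloppy-windows-1252'))  # € \x81 ‚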
ftfy/bad_codecs/sloppy.py
@@ -1,156 +0,0 @@
# coding: utf-8
r"""
Decodes single-byte encodings, filling their "holes" in the same messy way that
everyone else does.

A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.

Python, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.

Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.

These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. This module defines a sloppy version of
many single-byte encodings with holes. (There is no need for a sloppy version
of an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)

The following encodings will become defined:

- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)

Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.

Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
the rest are rather uncommon.

Here are some examples, using `ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:

    >>> from ftfy import explain_unicode
    >>> some_bytes = b'\x80\x81\x82'
    >>> explain_unicode(some_bytes.decode('latin-1'))
    U+0080 \x80    [Cc] <unknown>
    U+0081 \x81    [Cc] <unknown>
    U+0082 \x82    [Cc] <unknown>

    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
    U+20AC €       [Sc] EURO SIGN
    U+FFFD �       [So] REPLACEMENT CHARACTER
    U+201A ‚       [Ps] SINGLE LOW-9 QUOTATION MARK

    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
    U+20AC €       [Sc] EURO SIGN
    U+0081 \x81    [Cc] <unknown>
    U+201A ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
"""
from __future__ import unicode_literals
import codecs
from encodings import normalize_encoding

REPLACEMENT_CHAR = '\ufffd'


def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.
    """
    # Make an array of all 256 possible bytes.
    all_bytes = bytearray(range(256))

    # Get a list of what they would decode to in Latin-1.
    sloppy_chars = list(all_bytes.decode('latin-1'))

    # Get a list of what they decode to in the given encoding. Use the
    # replacement character for unassigned bytes.
    decoded_chars = all_bytes.decode(encoding, 'replace')

    # Update the sloppy_chars list. Each byte that was successfully decoded
    # gets its decoded value in the list. The unassigned bytes are left as
    # they are, which gives their decoding in Latin-1.
    for i, char in enumerate(decoded_chars):
        if char != REPLACEMENT_CHAR:
            sloppy_chars[i] = char

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = ''.join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except I made it follow pep8.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )

# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
CODECS = {}
INCOMPLETE_ENCODINGS = (
    ['windows-%s' % num for num in range(1250, 1259)] +
    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
)

for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)
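Because every unmapped byte borrows its Latin-1 meaning, a sloppy codec decodes any byte string at all, and re-encoding restores the original bytes; a short illustration (not part of the original file):

    import ftfy.bad_codecs  # make the sloppy codecs available

    raw = bytes(bytearray(range(256)))
    text = raw.decode('sloppy-windows-1252')           # never raises
    assert text.encode('sloppy-windows-1252') == raw   # lossless round trip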
ftfy/bad_codecs/utf8_variants.py
@@ -1,281 +0,0 @@
r"""
This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
decode text that's been encoded with a popular non-standard version of UTF-8.
This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
codepoint 0.

This is particularly relevant in Python 3, which provides no other way of
decoding CESU-8 or Java's encoding. [1]

The easiest way to use the codec is to simply import `ftfy.bad_codecs`:

    >>> import ftfy.bad_codecs
    >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
    >>> print(repr(result).lstrip('u'))
    'here comes a null! \x00'

The codec does not at all enforce "correct" CESU-8. For example, the Unicode
Consortium's not-quite-standard describing CESU-8 requires that there is only
one possible encoding of any character, so it does not allow mixing of valid
UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8
decoder does.

Characters in the Basic Multilingual Plane still have only one encoding. This
codec still enforces the rule, within the BMP, that characters must appear in
their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`,
instead of just `0x00`, may be used to encode the null character `U+0000`, like
in Java.

If you encode with this codec, you get legitimate UTF-8. Decoding with this
codec and then re-encoding is not idempotent, although encoding and then
decoding is. So this module won't produce CESU-8 for you. Look for that
functionality in the sister module, "Breaks Text For You", coming approximately
never.

[1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first
decode the bytes (incorrectly), then encode them, then decode them again, using
UTF-8 as the codec every time.
"""

from __future__ import unicode_literals
from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
                             IncrementalEncoder as UTF8IncrementalEncoder)
import re
import codecs

NAME = 'utf-8-variants'
# This regular expression matches all possible six-byte CESU-8 sequences.
CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')


class IncrementalDecoder(UTF8IncrementalDecoder):
    """
    An incremental decoder that extends Python's built-in UTF-8 decoder.

    This decoder needs to take in bytes, possibly arriving in a stream, and
    output the correctly decoded text. The general strategy for doing this
    is to fall back on the real UTF-8 decoder whenever possible, because
    the real UTF-8 decoder is way optimized, but to call specialized methods
    we define here for the cases the real decoder isn't expecting.
    """
    def _buffer_decode(self, input, errors, final):
        """
        Decode bytes that may be arriving in a stream, following the Codecs
        API.

        `input` is the incoming sequence of bytes. `errors` tells us how to
        handle errors, though we delegate all error-handling cases to the real
        UTF-8 decoder to ensure correct behavior. `final` indicates whether
        this is the end of the sequence, in which case we should raise an
        error given incomplete input.

        Returns as much decoded text as possible, and the number of bytes
        consumed.
        """
        # decoded_segments are the pieces of text we have decoded so far,
        # and position is our current position in the byte string. (Bytes
        # before this position have been consumed, and bytes after it have
        # yet to be decoded.)
        decoded_segments = []
        position = 0
        while True:
            # Use _buffer_decode_step to decode a segment of text.
            decoded, consumed = self._buffer_decode_step(
                input[position:],
                errors,
                final
            )
            if consumed == 0:
                # Either there's nothing left to decode, or we need to wait
                # for more input. Either way, we're done for now.
                break

            # Append the decoded text to the list, and update our position.
            decoded_segments.append(decoded)
            position += consumed

        if final:
            # _buffer_decode_step must consume all the bytes when `final` is
            # true.
            assert position == len(input)

        return ''.join(decoded_segments), position

    def _buffer_decode_step(self, input, errors, final):
        """
        There are three possibilities for each decoding step:

        - Decode as much real UTF-8 as possible.
        - Decode a six-byte CESU-8 sequence at the current position.
        - Decode a Java-style null at the current position.

        This method figures out which step is appropriate, and does it.
        """
        # Get a reference to the superclass method that we'll be using for
        # most of the real work.
        sup = UTF8IncrementalDecoder._buffer_decode

        # Find the next byte position that indicates a variant of UTF-8.
        # CESU-8 sequences always start with 0xed, and Java nulls always
        # start with 0xc0, both of which are conveniently impossible in
        # real UTF-8.
        cutoff1 = input.find(b'\xed')
        cutoff2 = input.find(b'\xc0')

        # Set `cutoff` to whichever cutoff comes first.
        if cutoff1 != -1 and cutoff2 != -1:
            cutoff = min(cutoff1, cutoff2)
        elif cutoff1 != -1:
            cutoff = cutoff1
        elif cutoff2 != -1:
            cutoff = cutoff2
        else:
            # The entire input can be decoded as UTF-8, so just do so.
            return sup(input, errors, final)

        if cutoff1 == 0:
            # Decode a possible six-byte sequence starting with 0xed.
            return self._buffer_decode_surrogates(sup, input, errors, final)
        elif cutoff2 == 0:
            # Decode a possible two-byte sequence, 0xc0 0x80.
            return self._buffer_decode_null(sup, input, errors, final)
        else:
            # Decode the bytes up until the next weird thing as UTF-8.
            # Set final=True because 0xc0 and 0xed don't make sense in the
            # middle of a sequence, in any variant.
            return sup(input[:cutoff], errors, True)

    @staticmethod
    def _buffer_decode_null(sup, input, errors, final):
        """
        Decode the bytes 0xc0 0x80 as U+0000, like Java does.
        """
        nextbyte = input[1:2]
        if nextbyte == b'':
            if final:
                # We found 0xc0 at the end of the stream, which is an error.
                # Delegate to the superclass method to handle that error.
                return sup(input, errors, final)
            else:
                # We found 0xc0 and we don't know what comes next, so consume
                # no bytes and wait.
                return '', 0
        elif nextbyte == b'\x80':
            # We found the usual 0xc0 0x80 sequence, so decode it and consume
            # two bytes.
            return '\u0000', 2
        else:
            # We found 0xc0 followed by something else, which is an error.
            # Whatever should happen is equivalent to what happens when the
            # superclass is given just the byte 0xc0, with final=True.
            return sup(b'\xc0', errors, True)

    @staticmethod
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

            11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error.
                if PYTHON2 and len(input) >= 3:
                    # We can't trust Python 2 to raise an error when it's
                    # asked to decode a surrogate, so let's force the issue.
                    input = mangle_surrogates(input)
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass to decode as usual -- except
                # for working around the Python 2 discrepancy as before.
                if PYTHON2:
                    input = mangle_surrogates(input)
                return sup(input[:3], errors, False)


def mangle_surrogates(bytestring):
    """
    When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
    it as an error (which it is). In 'replace' mode, it will decode as three
    replacement characters. But Python 2 will just output the surrogate
    codepoint.

    To ensure consistency between Python 2 and Python 3, and protect downstream
    applications from malformed strings, we turn surrogate sequences at the
    start of the string into the bytes `ff ff ff`, which we're *sure* won't
    decode, and which turn into three replacement characters in 'replace' mode.
    """
    if PYTHON2:
        if bytestring.startswith(b'\xed') and len(bytestring) >= 3:
            decoded = bytestring[:3].decode('utf-8', 'replace')
            if '\ud800' <= decoded <= '\udfff':
                return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:])
        return bytestring
    else:
        # On Python 3, nothing needs to be done.
        return bytestring

# The encoder is identical to UTF-8.
IncrementalEncoder = UTF8IncrementalEncoder


# Everything below here is boilerplate that matches the modules in the
# built-in `encodings` package.
def encode(input, errors='strict'):
    return IncrementalEncoder(errors).encode(input, final=True), len(input)


def decode(input, errors='strict'):
    return IncrementalDecoder(errors).decode(input, final=True), len(input)


class StreamWriter(codecs.StreamWriter):
    encode = encode


class StreamReader(codecs.StreamReader):
    decode = decode


CODEC_INFO = codecs.CodecInfo(
    name=NAME,
    encode=encode,
    decode=decode,
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
    streamreader=StreamReader,
    streamwriter=StreamWriter,
)
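As a worked instance of the surrogate arithmetic in `_buffer_decode_surrogates`, the six bytes from the module docstring carry the 20-bit value for U+1F60D:

    # ed a0 bd ed b8 8d -> U+1F60D, following the bit layout shown above.
    bytenums = [0xed, 0xa0, 0xbd, 0xed, 0xb8, 0x8d]
    codepoint = (((bytenums[1] & 0x0f) << 16) +
                 ((bytenums[2] & 0x3f) << 10) +
                 ((bytenums[4] & 0x0f) << 6) +
                 (bytenums[5] & 0x3f) +
                 0x10000)
    assert codepoint == 0x1F60D  # the emoji in the docstring example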
ftfy/badness.py
@@ -1,144 +0,0 @@
# -*- coding: utf-8 -*-
"""
Heuristics to determine whether re-encoding text is actually making it
more reasonable.
"""

from __future__ import unicode_literals
from ftfy.chardata import chars_to_classes
import re
import unicodedata

# The following regex uses the mapping of character classes to ASCII
# characters defined in chardata.py and build_data.py:
#
# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
#   = Whitespace
# o = Other


def _make_weirdness_regex():
    """
    Creates a list of regexes that match 'weird' character sequences.
    The more matches there are, the weirder the text is.
    """
    groups = []

    # Match lowercase letters that are followed by non-ASCII uppercase letters
    groups.append('lA')

    # Match diacritical marks, except when they modify a non-cased letter or
    # another mark.
    #
    # You wouldn't put a diacritical mark on a digit or a space, for example.
    # You might put it on a Latin letter, but in that case there will almost
    # always be a pre-composed version, and we normalize to pre-composed
    # versions first. The cases that can't be pre-composed tend to be in
    # large scripts without case, which are in class C.
    groups.append('[^CM]M')

    # Match non-Latin characters adjacent to Latin characters.
    #
    # This is a simplification from ftfy version 2, which compared all
    # adjacent scripts. However, the ambiguities we need to resolve come from
    # encodings designed to represent Latin characters.
    groups.append('[Ll][AaC]')
    groups.append('[AaC][Ll]')

    # Match C1 control characters, which are almost always the result of
    # decoding Latin-1 that was meant to be Windows-1252.
    groups.append('X')

    # Match private use and unassigned characters.
    groups.append('P')
    groups.append('_')

    # Match adjacent characters from any different pair of these categories:
    # - Modifier marks (M)
    # - Letter modifiers (m)
    # - Miscellaneous numbers (N)
    # - Symbols (0123)

    exclusive_categories = 'MmN0123'
    for cat1 in exclusive_categories:
        others_range = ''.join(c for c in exclusive_categories if c != cat1)
        groups.append('{cat1}[{others_range}]'.format(
            cat1=cat1, others_range=others_range
        ))
    regex = '|'.join('({0})'.format(group) for group in groups)
    return re.compile(regex)

WEIRDNESS_RE = _make_weirdness_regex()

# A few characters are common ending punctuation that can show up at the end
# of a mojibake sequence. It's plausible that such a character could appear
# after an accented capital letter, for example, so we'll want to add a
# slight preference to leave these characters alone.
#
# The match ends with a + so that we only give the bonus once for a
# consecutive sequence of these characters.
ENDING_PUNCT_RE = re.compile(
    '['
    '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
    '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
    '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
    '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
    ']+'
)


def sequence_weirdness(text):
    """
    Determine how often a text has unexpected characters or sequences of
    characters. This metric is used to disambiguate when text should be
    re-decoded or left as is.

    We start by normalizing text in NFC form, so that penalties for
    diacritical marks don't apply to characters that know what to do with
    them.

    The following things are deemed weird:

    - Lowercase letters followed by non-ASCII uppercase letters
    - Non-Latin characters next to Latin characters
    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
      characters (in languages that do that kind of thing a lot) or other
      marks
    - C1 control characters
    - Adjacent symbols from any different pair of these categories:

      - Modifier marks
      - Letter modifiers
      - Non-digit numbers
      - Symbols (including math and currency)

    The return value is the number of instances of weirdness.
    """
    text2 = unicodedata.normalize('NFC', text)
    weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
    punct_discount = len(ENDING_PUNCT_RE.findall(text2))
    return weirdness * 2 - punct_discount


def text_cost(text):
    """
    An overall cost function for text. Weirder is worse, but all else being
    equal, shorter strings are better.

    The overall cost is measured as the "weirdness" (see
    :func:`sequence_weirdness`) plus the length.
    """
    return sequence_weirdness(text) + len(text)
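A hedged illustration of how these functions are meant to be used; exact scores depend on the character-class data, so only the comparisons are asserted:

    from ftfy.badness import text_cost

    # The C1 control character U+0085 (a Windows-1252 ellipsis read as
    # Latin-1) counts as weirdness; the intended '…' gets a punctuation
    # discount instead.
    assert text_cost('caf\x85') > text_cost('caf…')

    # Mojibake is also longer than the text it was meant to be, so the
    # length term alone often favors the fixed version.
    assert text_cost('Ã©tude') > text_cost('étude')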
ftfy/build_data.py
@@ -1,111 +0,0 @@
"""
A script to make the char_classes.dat file.

This never needs to run in normal usage. It needs to be run if the character
classes we care about change, or if a new version of Python supports a new
Unicode standard and we want it to affect our string decoding.

The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
You can certainly use it in earlier versions. This simply makes sure that we
get consistent results from running ftfy on different versions of Python.

The file will be written to the current directory.
"""
from __future__ import unicode_literals
import unicodedata
import sys
import zlib
if sys.hexversion >= 0x03000000:
    unichr = chr

# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
#   = Whitespace
# o = Other


def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    If you run this, run it in Python 3.3 or later. It will run in earlier
    versions, but you won't get the current Unicode standard, leading to
    inconsistent behavior. To protect against this, running this in the
    wrong version of Python will raise an error unless you pass
    `do_it_anyway=True`.
    """
    if sys.hexversion < 0x03030000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.3 or later."
        )

    cclasses = [None] * 0x110000
    for codepoint in range(0x0, 0x110000):
        char = unichr(codepoint)
        category = unicodedata.category(char)

        if category.startswith('L'):  # letters
            is_latin = unicodedata.name(char).startswith('LATIN')
            if is_latin and codepoint < 0x200:
                if category == 'Lu':
                    cclasses[codepoint] = 'L'
                else:
                    cclasses[codepoint] = 'l'
            else:  # non-Latin letter, or close enough
                if category == 'Lu' or category == 'Lt':
                    cclasses[codepoint] = 'A'
                elif category == 'Ll':
                    cclasses[codepoint] = 'a'
                elif category == 'Lo':
                    cclasses[codepoint] = 'C'
                elif category == 'Lm':
                    cclasses[codepoint] = 'm'
                else:
                    raise ValueError('got some weird kind of letter')
        elif category.startswith('M'):  # marks
            cclasses[codepoint] = 'M'
        elif category == 'No':
            cclasses[codepoint] = 'N'
        elif category == 'Sm':
            cclasses[codepoint] = '0'
        elif category == 'Sc':
            cclasses[codepoint] = '1'
        elif category == 'Sk':
            cclasses[codepoint] = '2'
        elif category == 'So':
            cclasses[codepoint] = '3'
        elif category == 'Cn':
            cclasses[codepoint] = '_'
        elif category == 'Cc':
            cclasses[codepoint] = 'X'
        elif category == 'Cs':
            cclasses[codepoint] = 'S'
        elif category == 'Co':
            cclasses[codepoint] = 'P'
        elif category.startswith('Z'):
            cclasses[codepoint] = ' '
        else:
            cclasses[codepoint] = 'o'

    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
    out = open('char_classes.dat', 'wb')
    out.write(zlib.compress(''.join(cclasses).encode('ascii')))
    out.close()

if __name__ == '__main__':
    make_char_data_file()
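An illustrative spot-check of the classification rules above (not part of the original script); each Unicode category shown is the one the script maps to the single-letter class named in the comment:

    import unicodedata

    assert unicodedata.category('A') == 'Lu'       # Latin capital -> 'L'
    assert unicodedata.category('Я') == 'Lu'       # non-Latin capital -> 'A'
    assert unicodedata.category('€') == 'Sc'       # currency symbol -> '1'
    assert unicodedata.category('\u00ad') == 'Cf'  # falls through to 'o'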
ftfy/char_classes.dat: Binary file not shown.
ftfy/chardata.py
@@ -1,81 +0,0 @@
# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""

from __future__ import unicode_literals
import re
import zlib
from pkg_resources import resource_string
from ftfy.compatibility import unichr

# These are the five encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    'latin-1',
    'sloppy-windows-1252',
    'macroman',
    'cp437',
    'sloppy-windows-1251',
]


def _build_regexes():
    """
    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        latin1table = ''.join(unichr(i) for i in range(128, 256))
        charlist = latin1table.encode('latin-1').decode(encoding)

        # Build a regex from the ASCII range, followed by the decodings of
        # bytes 0x80-0xff in this character set. (This uses the fact that all
        # regex special characters are ASCII, and therefore won't appear in the
        # string.)
        regex = '^[\x00-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
ENCODING_REGEXES = _build_regexes()


def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))


CHAR_CLASS_STRING = zlib.decompress(
    resource_string(__name__, 'char_classes.dat')
).decode('ascii')

def chars_to_classes(string):
    """
    Convert each Unicode character to a letter indicating which of many
    classes it's in.

    See build_data.py for where this data comes from and what it means.
    """
    return string.translate(CHAR_CLASS_STRING)


# A translate mapping that will strip all C0 control characters except
# those that represent whitespace.
CONTROL_CHARS = {}
for i in range(32):
    CONTROL_CHARS[i] = None

# Map whitespace control characters to themselves.
for char in '\t\n\f\r':
    del CONTROL_CHARS[ord(char)]
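An illustrative use of `possible_encoding` (not part of the original file):

    from ftfy.chardata import possible_encoding

    assert possible_encoding('plain ASCII', 'ascii')
    assert possible_encoding('café', 'latin-1')      # é is representable
    assert not possible_encoding('文字', 'latin-1')  # CJK text is not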
ftfy/cli.py
@@ -1,34 +0,0 @@
"""
A simple command-line utility for fixing text found in a file.

Because files do not come with their encoding marked, it first runs the file
through `ftfy.guess_bytes`, then runs it through `ftfy.fix_text`.
"""
from ftfy import fix_file

import sys
ENCODE_STDOUT = (sys.hexversion < 0x03000000)


def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='file to transcode')

    args = parser.parse_args()

    file = open(args.filename)
    for line in fix_file(file):
        if ENCODE_STDOUT:
            sys.stdout.write(line.encode('utf-8'))
        else:
            sys.stdout.write(line)


if __name__ == '__main__':
    main()
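With the package importable, the module runs directly as a script via the `__main__` guard above; an illustrative invocation:

    python -m ftfy.cli broken_file.txt > fixed_file.txt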
@ -1,79 +0,0 @@
"""
Makes some function names and behavior consistent between Python 2 and
Python 3, and also between narrow and wide builds.
"""
from __future__ import unicode_literals
import sys
import re
import unicodedata

if sys.hexversion >= 0x03000000:
    from html import entities
    unichr = chr
    xrange = range
    PYTHON2 = False
else:
    import htmlentitydefs as entities
    unichr = unichr
    xrange = xrange
    PYTHON2 = True
htmlentitydefs = entities

PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)


def _narrow_unichr_workaround(codepoint):
    """
    A replacement for unichr() on narrow builds of Python. This will get
    us the narrow representation of an astral character, which will be
    a string of length two, containing two UTF-16 surrogates.
    """
    escaped = b'\\U%08x' % codepoint
    return escaped.decode('unicode-escape')


if sys.maxunicode < 0x10000:
    unichr = _narrow_unichr_workaround
    # In a narrow build of Python, we can't write a regex involving astral
    # characters. If we want to write the regex:
    #
    #     [\U00100000-\U0010ffff]
    #
    # The actual string that defines it quietly turns into:
    #
    #     [\udbc0\udc00-\udbff\udfff]
    #
    # And now the range operator only applies to the middle two characters.
    # It looks like a range that's going backwards from \dc00 to \dbff,
    # which is an error.
    #
    # What we can do instead is rewrite the expression to be _about_ the two
    # surrogates that make up the astral characters, instead of the characters
    # themselves. This would be wrong on a wide build, but it works on a
    # narrow build.
    UNSAFE_PRIVATE_USE_RE = re.compile('[\udbc0-\udbff][\udc00-\udfff]')
else:
    UNSAFE_PRIVATE_USE_RE = re.compile('[\U00100000-\U0010ffff]')


def bytes_to_ints(bytestring):
    """
    No matter what version of Python this is, make a sequence of integers from
    a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a
    sequence of integers.
    """
    if PYTHON2:
        return [ord(b) for b in bytestring]
    else:
        return bytestring


def is_printable(char):
    """
    str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so
    let's make a crude approximation in Python 2.
    """
    if PYTHON2:
        return not unicodedata.category(char).startswith('C')
    else:
        return char.isprintable()
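The narrow-build workaround above leans on the 'unicode-escape' codec; a short sketch of what it produces (the codepoint is an arbitrary astral character chosen for illustration):

escaped = b'\\U%08x' % 0x1F4A9            # -> b'\\U0001f4a9'
char = escaped.decode('unicode-escape')   # the character PILE OF POO
print(len(char))   # 1 on a wide build; 2 (a UTF-16 surrogate pair) on a narrow build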
@ -1,473 +0,0 @@
# -*- coding: utf-8 -*-
"""
This module contains the individual fixes that the main fix_text function
can perform.
"""

from __future__ import unicode_literals
from ftfy.chardata import (possible_encoding,
                           CHARMAP_ENCODINGS, CONTROL_CHARS)
from ftfy.badness import text_cost
from ftfy.compatibility import htmlentitydefs, unichr, UNSAFE_PRIVATE_USE_RE
import re
import sys
import codecs


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems that were introduced by handling Unicode
incorrectly. It might be able to fix the bytes you just handed it, but the
fact that you just gave a pile of bytes to a function that fixes text means
that your code is *also* handling Unicode incorrectly.

ftfy takes Unicode text as input. You should take these bytes and decode
them from the encoding you think they are in. If you're not sure what encoding
they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

If you're confused by this, please read the Python Unicode HOWTO:

    http://docs.python.org/%d/howto/unicode.html
""" % sys.version_info[0]


def fix_text_encoding(text):
    r"""
    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.

    Something you will find all over the place, in real-world text, is text
    that's mistakenly encoded as utf-8, decoded in some ugly format like
    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.

    This causes your perfectly good Unicode-aware code to end up with garbage
    text because someone else (or maybe "someone else") made a mistake.

    This function looks for the evidence of that having happened and fixes it.
    It determines whether it should replace nonsense sequences of single-byte
    characters that were really meant to be UTF-8 characters, and if so, turns
    them into the correctly-encoded Unicode character that they were meant to
    represent.

    The input to the function must be Unicode. If you don't have Unicode text,
    you're not using the right tool to solve your problem.

    .. note::
        The following examples are written using unmarked literal strings,
        but they are Unicode text. In Python 2 we have "unicode_literals"
        turned on, and in Python 3 this is always the case.

    ftfy decodes text that looks like it was decoded incorrectly. It leaves
    alone text that doesn't.

    >>> print(fix_text_encoding('único'))
    único

    >>> print(fix_text_encoding('This text is fine already :þ'))
    This text is fine already :þ

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

    >>> print(fix_text_encoding('This — should be an em dash'))
    This — should be an em dash

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    0x81 that have no mapping in Windows. This is a string that Python's
    standard `.encode` and `.decode` methods cannot correct.

    >>> print(fix_text_encoding('This text is sad .â\x81”.'))
    This text is sad .⁔.

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

    >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
    not such a fan of Charlotte Brontë…”

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoded, and expecting the encoding to
    be consistent:

    >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
    AHÅ™, the new sofa from IKEA®

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

    >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
    This text was never UTF-8 at all…

    The best version of the text is found using
    :func:`ftfy.badness.text_cost`.
    """
    text, _plan = fix_encoding_and_explain(text)
    return text


def fix_encoding_and_explain(text):
    """
    Re-decodes text that has been decoded incorrectly, and also returns a
    "plan" indicating all the steps required to fix it.

    To fix similar text in the same way, without having to detect anything,
    you can use the ``apply_plan`` function.
    """
    best_version = text
    best_cost = text_cost(text)
    best_plan = []
    plan_so_far = []
    while True:
        prevtext = text
        text, plan = fix_one_step_and_explain(text)
        plan_so_far.extend(plan)
        cost = text_cost(text)

        # Add a penalty if we used a particularly obsolete encoding. The result
        # is that we won't use these encodings unless they can successfully
        # replace multiple characters.
        if ('encode', 'macroman') in plan_so_far or\
           ('encode', 'cp437') in plan_so_far:
            cost += 2

        # We need pretty solid evidence to decode from Windows-1251 (Cyrillic).
        if ('encode', 'sloppy-windows-1251') in plan_so_far:
            cost += 5

        if cost < best_cost:
            best_cost = cost
            best_version = text
            best_plan = list(plan_so_far)
        if text == prevtext:
            return best_version, best_plan


def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded incorrectly.

    Returns the decoded text, plus a "plan" for how to reproduce what it
    did.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
                    decoding = 'utf-8-variants'
                fixed = encoded_bytes.decode(decoding)
                steps = [('encode', encoding), ('decode', decoding)]
                return fixed, steps
            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1'), ('decode', 'windows-1252')]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # Those cases are somewhat rare, and impossible to solve without false
    # positives. If you're in one of these situations, you should try using
    # the `ftfy.guess_bytes` function.

    # Return the text unchanged; the plan is empty.
    return text, []


def apply_plan(text, plan):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, encoding), where
    `operation` is either 'encode' or 'decode', and `encoding` is an encoding
    name such as 'utf-8' or 'latin-1'.

    Because only text can be encoded, and only bytes can be decoded, the plan
    should alternate 'encode' and 'decode' steps, or else this function will
    encounter an error.
    """
    obj = text
    for operation, encoding in plan:
        if operation == 'encode':
            obj = obj.encode(encoding)
        elif operation == 'decode':
            obj = obj.decode(encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj


HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")


def unescape_html(text):
    """
    Decode all three types of HTML entities/character references.

    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
    to it for efficiency: it won't match entities longer than 8 characters,
    because there are no valid entities like that.

    >>> print(unescape_html('&lt;tag&gt;'))
    <tag>
    """
    def fixup(match):
        """
        Replace one matched HTML entity with the character it represents,
        if possible.
        """
        text = match.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return HTML_ENTITY_RE.sub(fixup, text)


ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub('', text)


SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication
      protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive
      by misguided software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
      defined by Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly
      not what you meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return text.replace('\r\n', '\n').replace('\r', '\n')\
               .replace('\u2028', '\n').replace('\u2029', '\n')\
               .replace('\u0085', '\n')


def remove_control_chars(text):
    """
    Remove all control characters except for the important ones.

    This removes characters in these ranges:

    - U+0000 to U+0008
    - U+000B
    - U+000E to U+001F
    - U+007F

    It leaves alone these characters that are commonly used for formatting:

    - TAB (U+0009)
    - LF (U+000A)
    - FF (U+000C)
    - CR (U+000D)
    """
    return text.translate(CONTROL_CHARS)


def remove_bom(text):
    r"""
    Remove a left-over byte-order mark.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))


def remove_unsafe_private_use(text):
    r"""
    Python 3.3's Unicode support isn't perfect, and in fact there are certain
    string operations that will crash some versions of it with a SystemError:
    http://bugs.python.org/issue18183

    The best solution is to remove all characters from Supplementary Private
    Use Area B, using a regex that is known not to crash given those
    characters.

    These are the characters from U+100000 to U+10FFFF. It's sad to lose an
    entire plane of Unicode, but on the other hand, these characters are not
    assigned and never will be. If you get one of these characters and don't
    know what its purpose is, its purpose is probably to crash your code.

    If you were using these for actual private use, this might be inconvenient.
    You can turn off this fixer, of course, but I kind of encourage using
    Supplementary Private Use Area A instead.

    >>> print(remove_unsafe_private_use('\U0001F4A9\U00100000'))
    💩

    This fixer is off by default in Python 3.4 or later. (The bug is actually
    fixed in 3.3.3 and 2.7.6, but I don't want the default behavior to change
    based on a micro version upgrade of Python.)
    """
    return UNSAFE_PRIVATE_USE_RE.sub('', text)


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )''', re.UNICODE | re.VERBOSE)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.

    This fix cannot be automatically applied by the `ftfy.fix_text` function,
    because escaped text is not necessarily a mistake, and there is no way
    to distinguish text that's supposed to be escaped from text that isn't.
    """
    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), 'unicode-escape')

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
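For reference, the plan machinery above was typically used like this; the mojibake sample is invented, and the printed plan depends on ftfy's heuristics:

from ftfy.fixes import fix_encoding_and_explain, apply_plan

fixed, plan = fix_encoding_and_explain(u'This â€” should be an em dash')
print(fixed)   # the repaired text, with a real em dash
print(plan)    # e.g. [('encode', 'sloppy-windows-1252'), ('decode', 'utf-8')]

# Replay the same steps on similar text without re-running detection:
print(apply_plan(u'more â€” of the same', plan))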
@ -1,39 +0,0 @@
"""
This file defines a general method for evaluating ftfy using data that arrives
in a stream. A concrete implementation of it is found in `twitter_tester.py`.
"""
from __future__ import print_function, unicode_literals
from ftfy.fixes import fix_text_encoding
from ftfy.chardata import possible_encoding


class StreamTester:
    """
    Take in a sequence of texts, and show the ones that will be changed by
    ftfy. This will also periodically show updates, such as the proportion of
    texts that changed.
    """
    def __init__(self):
        self.num_fixed = 0
        self.count = 0

    def check_ftfy(self, text):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.
        """
        self.count += 1
        if not possible_encoding(text, 'ascii'):
            fixed = fix_text_encoding(text)
            if text != fixed:
                # possibly filter common bots before printing
                print(u'\nText:\t{text}\nFixed:\t{fixed}\n'.format(
                    text=text, fixed=fixed
                ))
                self.num_fixed += 1

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
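A minimal sketch of driving the tester by hand (Python 3; the sample strings are invented, and the live Twitter feed below does the same thing at scale):

from ftfy.streamtester import StreamTester

tester = StreamTester()
for text in [u'plain ascii', u'broken â€œquotesâ€']:
    tester.check_ftfy(text)   # prints any text that ftfy would change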
@ -1,73 +0,0 @@
# coding: utf-8
"""
Do what is necessary to authenticate this tester as a Twitter "app", using
somebody's Twitter account.
"""
from __future__ import unicode_literals
import os


AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')


def get_auth():
    """
    Twitter has some bizarre requirements about how to authorize an "app" to
    use its API.

    The user of the app has to log in to get a secret token. That's fine. But
    the app itself has its own "consumer secret" token. The app has to know it,
    and the user of the app has to not know it.

    This is, of course, impossible. It's equivalent to DRM. Your computer can't
    *really* make use of secret information while hiding the same information
    from you.

    The threat appears to be that, if you have this super-sekrit token, you can
    impersonate the app while doing something different. Well, of course you
    can do that, because you *have the source code* and you can change it to do
    what you want. You still have to log in as a particular user who has a
    token that's actually secret, you know.

    Even developers of closed-source applications that use the Twitter API are
    unsure what to do, for good reason. These "secrets" are not secret in any
    cryptographic sense. A bit of Googling shows that the secret tokens for
    every popular Twitter app are already posted on the Web.

    Twitter wants us to pretend this string can be kept secret, and hide this
    secret behind a fig leaf like everybody else does. So that's what we've
    done.
    """

    from twitter.oauth import OAuth
    from twitter import oauth_dance, read_token_file

    def unhide(secret):
        """
        Do something mysterious and exactly as secure as every other Twitter
        app.
        """
        return ''.join([chr(ord(c) - 0x2800) for c in secret])

    fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
    consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'

    if os.path.exists(AUTH_TOKEN_PATH):
        token, token_secret = read_token_file(AUTH_TOKEN_PATH)
    else:
        authdir = os.path.dirname(AUTH_TOKEN_PATH)
        if not os.path.exists(authdir):
            os.makedirs(authdir)
        token, token_secret = oauth_dance(
            app_name='ftfy-tester',
            consumer_key=consumer_key,
            consumer_secret=unhide(fig_leaf),
            token_filename=AUTH_TOKEN_PATH
        )

    return OAuth(
        token=token,
        token_secret=token_secret,
        consumer_key=consumer_key,
        consumer_secret=unhide(fig_leaf)
    )
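For reference, the fig-leaf obfuscation above is just a fixed offset into the Braille Patterns block; a sketch with an invented secret (Python 3, where chr covers the full Unicode range):

def hide(secret):
    # Shift each ASCII character up by 0x2800, landing in the Braille block.
    return ''.join([chr(ord(c) + 0x2800) for c in secret])

def unhide(secret):
    # The inverse shift, as used in the module above.
    return ''.join([chr(ord(c) - 0x2800) for c in secret])

print(hide('hunter2'))           # a run of Braille patterns
print(unhide(hide('hunter2')))   # -> 'hunter2'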
@ -1,89 +0,0 @@
"""
Implements a StreamTester that runs over Twitter data. See the class
docstring.

This module is written for Python 3 only. The __future__ imports you see here
are just to let Python 2 scan the file without crashing with a SyntaxError.
"""
from __future__ import print_function, unicode_literals
import os
from collections import defaultdict
from ftfy.streamtester import StreamTester


class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (defined in `__init__.py`) to
    evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

    - Success: the process takes in mojibake and outputs correct text.
    - False positive: the process takes in correct text, and outputs
      mojibake. Every false positive should be considered a bug, and
      reported on GitHub if it isn't already.
    - Confusion: the process takes in mojibake and outputs different
      mojibake. Not a great outcome, but not as dire as a false
      positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.
        """
        if not os.path.exists(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            langfile = open(fullname, 'a')
            for line in lines:
                print(line.replace('\n', ' '), file=langfile)
            langfile.close()
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.
        """
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth
        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            if 'text' in tweet:
                self.check_ftfy(tweet['text'])
                if 'user' in tweet:
                    lang = tweet['user'].get('lang', 'NONE')
                    self.lines_by_lang[lang].append(tweet['text'])
                if self.count % 10000 == 100:
                    self.save_files()


def main():
    """
    When run from the command line, this script connects to the Twitter stream
    and runs the TwitterTester on it forever. Or at least until the stream
    drops.
    """
    tester = TwitterTester()
    tester.run_sample()


if __name__ == '__main__':
    main()
@ -17,53 +17,71 @@
 # along with SickRage. If not, see <http://www.gnu.org/licenses/>.

 import os
+import traceback

 import sickbeard
 from sickbeard import logger

-import ftfy
-import ftfy.bad_codecs
+import six
+import chardet


 # This module tries to deal with the apparently random behavior of python when dealing with unicode <-> utf-8
 # encodings. It tries to just use unicode, but if that fails then it tries forcing it to utf-8. Any functions
 # which return something should always return unicode.

-def fixStupidEncodings(x, silent=False):
-    if type(x) == str:
-        try:
-            return str(ftfy.fix_text(u'' + x)).decode(sickbeard.SYS_ENCODING)
-        except UnicodeDecodeError:
-            logger.log(u"Unable to decode value: " + repr(x), logger.ERROR)
-            return x
-        except UnicodeEncodeError:
-            logger.log(u"Unable to encode value: " + repr(x), logger.ERROR)
-            return x
-    elif type(x) == unicode:
-        return x
-    else:
-        logger.log(
-            u"Unknown value passed in, ignoring it: " + str(type(x)) + " (" + repr(x) + ":" + repr(type(x)) + ")",
-            logger.DEBUG if silent else logger.ERROR)
+def toUnicode(x):
+    try:
+        if isinstance(x, unicode):
+            return x
+        else:
+            try:
+                return six.text_type(x)
+            except:
+                try:
+                    if chardet.detect(x).get('encoding') == 'utf-8':
+                        return x.decode('utf-8')
+                    if isinstance(x, str):
+                        try:
+                            return x.decode(sickbeard.SYS_ENCODING)
+                        except UnicodeDecodeError:
+                            raise
+                    return x
+                except:
+                    raise
+    except:
+        logger.log('Unable to decode value "%s..." : %s ' % (repr(x)[:20], traceback.format_exc()), logger.WARNING)
+        ascii_text = str(x).encode('string_escape')
+        return toUnicode(ascii_text)
+
+
+def ss(x):
+    u_x = toUnicode(x)
+
+    try:
+        return u_x.encode(sickbeard.SYS_ENCODING)
+    except Exception as e:
+        logger.log('Failed ss encoding char, force UTF8: %s' % e, logger.WARNING)
+        try:
+            return u_x.encode(sickbeard.SYS_ENCODING, 'replace')
+        except:
+            return u_x.encode('utf-8', 'replace')
+

 def fixListEncodings(x):
-    if type(x) != list and type(x) != tuple:
+    if not isinstance(x, (list, tuple)):
         return x
     else:
-        return filter(lambda x: x != None, map(fixStupidEncodings, x))
+        return filter(lambda x: x != None, map(toUnicode, x))


 def ek(func, *args, **kwargs):
     if os.name == 'nt':
         result = func(*args, **kwargs)
     else:
-        result = func(
-            *[fixStupidEncodings(x).encode(sickbeard.SYS_ENCODING) if type(x) in (str, unicode) else x for x in args],
-            **kwargs)
+        result = func(*[ss(x) if isinstance(x, (str, unicode)) else x for x in args], **kwargs)

-    if type(result) in (list, tuple):
+    if isinstance(result, (list, tuple)):
         return fixListEncodings(result)
-    elif type(result) == str:
-        return fixStupidEncodings(result)
+    elif isinstance(result, str):
+        return toUnicode(result)
     else:
         return result
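For reference, the intended call pattern for the reworked kludge; the path and show title below are invented:

from sickbeard import encodingKludge as ek
import os

title = ek.toUnicode('Show Name')    # bytes or unicode in, unicode out
syspath = ek.ss(u'/tmp/Show Name')   # unicode in, SYS_ENCODING bytes out
files = ek.ek(os.listdir, u'/tmp')   # wrap an os call: args pass through ss(),
                                     # results come back through toUnicode()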
@ -16,7 +16,7 @@
 # You should have received a copy of the GNU General Public License
 # along with SickRage. If not, see <http://www.gnu.org/licenses/>.

-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 def ex(e):
     """
@ -32,11 +32,11 @@ def ex(e):

         if arg is not None:
             if isinstance(arg, (str, unicode)):
-                fixed_arg = fixStupidEncodings(arg, True)
+                fixed_arg = toUnicode(arg, True)

             else:
                 try:
-                    fixed_arg = u"error " + fixStupidEncodings(str(arg), True)
+                    fixed_arg = u"error " + toUnicode(str(arg), True)

                 except:
                     fixed_arg = None
@ -26,7 +26,7 @@ from sickbeard.exceptions import ex, EpisodeNotFoundException
 from sickbeard.history import dateFormat
 from sickbeard.common import Quality
 from sickbeard.common import WANTED, FAILED
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 def prepareFailedName(release):
     """Standardizes release name for failed DB"""
@ -36,7 +36,7 @@ def prepareFailedName(release):
     fixed = fixed.rpartition(".")[0]

     fixed = re.sub("[\.\-\+\ ]", "_", fixed)
-    fixed = fixStupidEncodings(fixed)
+    fixed = toUnicode(fixed)

     return fixed

@ -20,7 +20,7 @@ import db
 import datetime

 from sickbeard.common import SNATCHED, SUBTITLED, FAILED, Quality
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 dateFormat = "%Y%m%d%H%M%S"
@ -28,7 +28,7 @@ dateFormat = "%Y%m%d%H%M%S"

 def _logHistoryItem(action, showid, season, episode, quality, resource, provider, version=-1):
     logDate = datetime.datetime.today().strftime(dateFormat)
-    resource = fixStupidEncodings(resource)
+    resource = toUnicode(resource)

     myDB = db.DBConnection()
     myDB.action(
@ -29,7 +29,7 @@ import sickbeard
|
|||||||
|
|
||||||
from sickbeard import logger, common
|
from sickbeard import logger, common
|
||||||
from sickbeard import db
|
from sickbeard import db
|
||||||
from sickbeard.encodingKludge import fixStupidEncodings
|
from sickbeard.encodingKludge import toUnicode
|
||||||
from sickbeard.exceptions import ex
|
from sickbeard.exceptions import ex
|
||||||
|
|
||||||
|
|
||||||
@ -51,7 +51,7 @@ class EmailNotifier:
|
|||||||
ep_name: The name of the episode that was snatched
|
ep_name: The name of the episode that was snatched
|
||||||
title: The title of the notification (optional)
|
title: The title of the notification (optional)
|
||||||
"""
|
"""
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
|
if sickbeard.EMAIL_NOTIFY_ONSNATCH:
|
||||||
show = self._parseEp(ep_name)
|
show = self._parseEp(ep_name)
|
||||||
@ -86,7 +86,7 @@ class EmailNotifier:
|
|||||||
ep_name: The name of the episode that was downloaded
|
ep_name: The name of the episode that was downloaded
|
||||||
title: The title of the notification (optional)
|
title: The title of the notification (optional)
|
||||||
"""
|
"""
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
|
if sickbeard.EMAIL_NOTIFY_ONDOWNLOAD:
|
||||||
show = self._parseEp(ep_name)
|
show = self._parseEp(ep_name)
|
||||||
@ -121,7 +121,7 @@ class EmailNotifier:
|
|||||||
ep_name: The name of the episode that was downloaded
|
ep_name: The name of the episode that was downloaded
|
||||||
lang: Subtitle language wanted
|
lang: Subtitle language wanted
|
||||||
"""
|
"""
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
|
if sickbeard.EMAIL_NOTIFY_ONSUBTITLEDOWNLOAD:
|
||||||
show = self._parseEp(ep_name)
|
show = self._parseEp(ep_name)
|
||||||
@ -198,7 +198,7 @@ class EmailNotifier:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def _parseEp(self, ep_name):
|
def _parseEp(self, ep_name):
|
||||||
ep_name = fixStupidEncodings(ep_name)
|
ep_name = toUnicode(ep_name)
|
||||||
|
|
||||||
sep = " - "
|
sep = " - "
|
||||||
titles = ep_name.split(sep)
|
titles = ep_name.split(sep)
|
||||||
|
@ -25,7 +25,7 @@ import sickbeard
 from sickbeard import logger
 from sickbeard import common
 from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 from sickbeard.notifiers.xbmc import XBMCNotifier

@ -26,7 +26,7 @@ import sickbeard
 from sickbeard import logger
 from sickbeard import common
 from sickbeard.exceptions import ex
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 try:
@ -236,9 +236,9 @@ class XBMCNotifier:
             base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
             authheader = "Basic %s" % base64string
             req.add_header("Authorization", authheader)
-            logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
         else:
-            logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)

         response = urllib2.urlopen(req)
         result = response.read().decode(sickbeard.SYS_ENCODING)
@ -248,7 +248,7 @@ class XBMCNotifier:
             return result

         except (urllib2.URLError, IOError), e:
-            logger.log(u"Warning: Couldn't contact XBMC HTTP at " + fixStupidEncodings(url) + " " + ex(e),
+            logger.log(u"Warning: Couldn't contact XBMC HTTP at " + toUnicode(url) + " " + ex(e),
                        logger.WARNING)
             return False

@ -379,9 +379,9 @@ class XBMCNotifier:
             base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
             authheader = "Basic %s" % base64string
             req.add_header("Authorization", authheader)
-            logger.log(u"Contacting XBMC (with auth header) via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC (with auth header) via url: " + toUnicode(url), logger.DEBUG)
         else:
-            logger.log(u"Contacting XBMC via url: " + fixStupidEncodings(url), logger.DEBUG)
+            logger.log(u"Contacting XBMC via url: " + toUnicode(url), logger.DEBUG)

         try:
             response = urllib2.urlopen(req)
@ -401,7 +401,7 @@ class XBMCNotifier:
             return False

         except IOError, e:
-            logger.log(u"Warning: Couldn't contact XBMC JSON API at " + fixStupidEncodings(url) + " " + ex(e),
+            logger.log(u"Warning: Couldn't contact XBMC JSON API at " + toUnicode(url) + " " + ex(e),
                        logger.WARNING)
             return False

@ -29,7 +29,7 @@ from sickbeard import encodingKludge as ek
 from sickbeard.exceptions import ex

 from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode


 def getSeasonNZBs(name, urlData, season):
@ -85,7 +85,7 @@ def createNZBString(fileElements, xmlns):
     for curFile in fileElements:
         rootElement.append(stripNS(curFile, xmlns))

-    return xml.etree.ElementTree.tostring(fixStupidEncodings(rootElement))
+    return xml.etree.ElementTree.tostring(toUnicode(rootElement))


 def saveNZB(nzbName, nzbString):
@ -27,7 +27,7 @@ from sickbeard import helpers
 from sickbeard import name_cache
 from sickbeard import logger
 from sickbeard import db
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 exception_dict = {}
 anidb_exception_dict = {}
@ -234,7 +234,7 @@ def retrieve_exceptions():
                # if this exception isn't already in the DB then add it
                if cur_exception not in existing_exceptions:

-                    cur_exception = fixStupidEncodings(cur_exception)
+                    cur_exception = toUnicode(cur_exception)

                    myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                                [cur_indexer_id, cur_exception, curSeason])
@ -267,7 +267,7 @@ def update_scene_exceptions(indexer_id, scene_exceptions, season=-1):
     exceptionsCache[indexer_id][season] = scene_exceptions

     for cur_exception in scene_exceptions:
-        cur_exception = fixStupidEncodings(cur_exception)
+        cur_exception = toUnicode(cur_exception)

         myDB.action("INSERT INTO scene_exceptions (indexer_id, show_name, season) VALUES (?,?,?)",
                     [indexer_id, cur_exception, season])
@ -234,7 +234,7 @@ def isGoodResult(name, show, log=True, season=-1):

     all_show_names = allPossibleShowNames(show, season=season)
     showNames = map(sanitizeSceneName, all_show_names) + all_show_names
-    showNames += map(unidecode, all_show_names)
+    showNames += map(ek.toUnicode, all_show_names)

     for curName in set(showNames):
         if not show.is_anime:
@ -33,7 +33,7 @@ from sickbeard.exceptions import AuthException
 from sickbeard.rssfeeds import RSSFeeds
 from sickbeard import clients
 from name_parser.parser import NameParser, InvalidNameException, InvalidShowException
-from sickbeard.encodingKludge import fixStupidEncodings
+from sickbeard.encodingKludge import toUnicode

 class CacheDBConnection(db.DBConnection):
     def __init__(self, providerName):
@ -263,7 +263,7 @@ class TVCache():
         # get quality of release
         quality = parse_result.quality

-        name = fixStupidEncodings(name)
+        name = toUnicode(name)

         # get release group
         release_group = parse_result.release_group
@ -3288,7 +3288,7 @@ class ErrorLogs(MainHandler):

         for x in reversed(data):

-            x = ek.fixStupidEncodings(x)
+            x = ek.toUnicode(x)
             match = re.match(regex, x)

             if match:
@ -18,23 +18,27 @@
 # You should have received a copy of the GNU General Public License
 # along with SickRage. If not, see <http://www.gnu.org/licenses/>.

-if __name__ == "__main__":
-    import glob
-    import unittest
-    import sys
-
-    test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
-    module_strings = [file_string[0:len(file_string) - 3] for file_string in test_file_strings]
-    suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in module_strings]
-    testSuite = unittest.TestSuite(suites)
-
-    print "=================="
-    print "STARTING - ALL TESTS"
-    print "=================="
-    print "this will include"
-    for includedfiles in test_file_strings:
-        print "- " + includedfiles
-
-    text_runner = unittest.TextTestRunner().run(testSuite)
-    if not text_runner.wasSuccessful():
-        sys.exit(-1)
+import glob
+import unittest
+import sys
+
+
+class AllTests(unittest.TestCase):
+    def setUp(self):
+        self.test_file_strings = [ x for x in glob.glob('*_tests.py') if not x in __file__]
+        self.module_strings = [file_string[0:len(file_string) - 3] for file_string in self.test_file_strings]
+        self.suites = [unittest.defaultTestLoader.loadTestsFromName(file_string) for file_string in self.module_strings]
+        self.testSuite = unittest.TestSuite(self.suites)
+
+    def testAll(self):
+        print "=================="
+        print "STARTING - ALL TESTS"
+        print "=================="
+        for includedfiles in self.test_file_strings:
+            print "- " + includedfiles
+
+        text_runner = unittest.TextTestRunner().run(self.testSuite)
+        if not text_runner.wasSuccessful():
+            sys.exit(-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
@ -8,7 +8,6 @@ sys.path.append(os.path.abspath('../lib'))

 from sickbeard import common

-
 class QualityTests(unittest.TestCase):

     # TODO: repack / proper ? air-by-date ? season rip? multi-ep?
@ -51,7 +51,6 @@ EPISODE = 2
 FILENAME = u"show name - s0" + str(SEASON) + "e0" + str(EPISODE) + ".mkv"
 FILEDIR = os.path.join(TESTDIR, SHOWNAME)
 FILEPATH = os.path.join(FILEDIR, FILENAME)

 SHOWDIR = os.path.join(TESTDIR, SHOWNAME + " final")
-
 #sickbeard.logger.sb_log_instance = sickbeard.logger.SBRotatingLogHandler(os.path.join(TESTDIR, 'sickbeard.log'), sickbeard.logger.NUM_LOGS, sickbeard.logger.LOG_SIZE)