mirror of
https://github.com/moparisthebest/SickRage
synced 2024-11-10 11:25:05 -05:00
352 lines
14 KiB
Python
352 lines
14 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
ftfy: fixes text for you
|
||
|
||
This is a module for making text less broken. See the `fix_text` function
|
||
for more information.
|
||
"""
|
||
|
||
from __future__ import unicode_literals
|
||
|
||
# See the docstring for ftfy.bad_codecs to see what we're doing here.
|
||
import ftfy.bad_codecs
|
||
ftfy.bad_codecs.ok()
|
||
|
||
from ftfy import fixes
|
||
from ftfy.fixes import fix_text_encoding
|
||
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
|
||
import unicodedata
|
||
import warnings
|
||
|
||
|
||
def fix_text(text,
             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True,
             max_decode_length=2**16):
    r"""
    Make the representation of Unicode `text` consistent, undoing common
    forms of damage along the way.

    >>> print(fix_text('ünicode'))
    ünicode

    >>> print(fix_text('Broken text… it’s flubberific!'))
    Broken text... it's flubberific!

    The text is processed one line at a time, so different lines may be
    repaired as if they had been damaged by different encodings. Each
    line goes through `fix_text_segment`, which applies the enabled
    steps below, in order, repeatedly until nothing changes — making the
    whole function idempotent:

    - `remove_unsafe_private_use`: strip a range of private-use
      codepoints that can trigger a bug in Python builds older than 3.4
      (hence the version-dependent default).
    - `fix_entities`: decode HTML entities such as "&amp;". The default
      'auto' decodes them except in text where a pair of actual angle
      brackets has been seen, which is probably real HTML that should
      be left alone.
    - `remove_terminal_escapes`: drop byte sequences that are
      instructions for Unix terminals, such as color codes.
    - `fix_encoding`: detect and undo common encode/decode mistakes;
      see `fix_text_encoding`. Lines longer than `max_decode_length`
      skip this step to avoid unbounded slowdowns.
    - `normalization`: apply this Unicode normalization form ('NFC',
      'NFKC', 'NFD', or 'NFKD'), or None to skip. The default 'NFKC'
      composes characters with their diacritics and replaces
      compatibility characters (half-width katakana, full-width Roman
      letters, ellipses, ligatures) with their common equivalents.
    - `uncurl_quotes`: replace curly quotation marks with plain ASCII
      straight quotes.
    - `fix_line_breaks`: convert CRLF and CR line breaks to LF.
    - `remove_control_chars`: delete C0 control characters other than
      the common useful ones (TAB, CR, LF, FF).
    - `remove_bom`: strip a leading byte-order mark.

    Raises UnicodeError if given bytes instead of Unicode text. If the
    entire text is known to share a single (possibly flawed) encoding,
    `fix_text_segment` can process it in one pass instead.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    fixed_pieces = []
    start = 0
    while start < len(text):
        # Take everything up to and including the next newline; the
        # final piece may have no trailing newline at all.
        end = text.find('\n', start) + 1
        if end == 0:
            end = len(text)
        piece = text[start:end]

        # Repairing the encoding of an extremely long line could be
        # extremely slow, so skip that step past the length limit.
        fix_encoding_here = fix_encoding and (end - start) <= max_decode_length

        if fix_entities == 'auto' and '<' in piece and '>' in piece:
            # Paired angle brackets suggest actual HTML, whose entities
            # should be left alone from this point on.
            fix_entities = False

        fixed_pieces.append(
            fix_text_segment(
                piece,
                remove_unsafe_private_use=remove_unsafe_private_use,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=fix_encoding_here,
                normalization=normalization,
                uncurl_quotes=uncurl_quotes,
                fix_line_breaks=fix_line_breaks,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom
            )
        )
        start = end

    return ''.join(fixed_pieces)
|
||
|
||
# Convenience alias: `ftfy.ftfy(text)` behaves exactly like
# `ftfy.fix_text(text)`. Note this rebinds the name `ftfy` inside the
# module namespace, shadowing the package itself from here on.
ftfy = fix_text
|
||
|
||
|
||
def fix_file(input_file,
             remove_unsafe_private_use=True,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True):
    """
    Fix text as it is read from `input_file`, yielding one repaired
    line at a time.

    A file opened in text mode supplies Unicode directly. A file opened
    in binary mode supplies bytes whose encoding we unfortunately have
    to guess; `guess_bytes` tries a few common encodings per line and
    makes no further promises.

    The keyword parameters have the same meanings as in `fix_text`.
    """
    entity_setting = fix_entities
    for line in input_file:
        if isinstance(line, bytes):
            # Decode each bytes line independently; the guessed
            # encoding itself is not needed past this point.
            line, _encoding = guess_bytes(line)
        if fix_entities == 'auto' and '<' in line and '>' in line:
            # Paired angle brackets look like HTML markup, so stop
            # decoding entities from here on.
            entity_setting = False
        yield fix_text_segment(
            line,
            remove_unsafe_private_use=remove_unsafe_private_use,
            fix_entities=entity_setting,
            remove_terminal_escapes=remove_terminal_escapes,
            fix_encoding=fix_encoding,
            normalization=normalization,
            uncurl_quotes=uncurl_quotes,
            fix_line_breaks=fix_line_breaks,
            remove_control_chars=remove_control_chars,
            remove_bom=remove_bom
        )
|
||
|
||
|
||
def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply every enabled fix to `text` as a single chunk, repeating
    until the result stops changing.

    This could be one line within a larger run of `fix_text`, or a
    larger amount of text that is known to all share one encoding. The
    parameters are described under `fix_text`. Raises UnicodeError when
    given bytes rather than Unicode text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        # Paired angle brackets mean this is probably HTML; leave its
        # entities alone.
        fix_entities = False

    # The enabled steps, in the order they must run. Note that
    # `fix_entities` may still be the string 'auto' here, which is
    # truthy and therefore enables the unescaping step.
    pipeline = [
        (remove_unsafe_private_use, fixes.remove_unsafe_private_use),
        (fix_entities, fixes.unescape_html),
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_text_encoding),
        (normalization is not None,
         lambda chunk: unicodedata.normalize(normalization, chunk)),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (remove_control_chars, fixes.remove_control_chars),
        (remove_bom, fixes.remove_bom),
    ]

    while True:
        previous = text
        for enabled, step in pipeline:
            if enabled:
                text = step(text)
        if text == previous:
            # A full pass changed nothing, so the result is stable and
            # the function is idempotent.
            return text
|
||
|
||
|
||
def guess_bytes(bstring):
    """
    Decode `bstring`, whose encoding is unknown, by trying a few common
    encodings that can be distinguished from each other. Returns a pair
    of (decoded_text, encoding_name).

    This is not a magic bullet: bytes from a MySQL database whose
    "character set" is ISO Elbonian will not be figured out, and East
    Asian encodings are not currently attempted.

    The candidates, in order:

    - UTF-16 with a byte order mark, because a UTF-16 BOM looks like
      nothing else
    - UTF-8, the global de facto standard
    - "utf-8-variants", which is what people actually implement when
      they think they're doing UTF-8
    - MacRoman, which Microsoft Office still produces, recognizable by
      its bare-CR line breaks (no line breaks means no luck)
    - "sloppy-windows-1252", the most common Latin-1-like single-byte
      encoding
    """
    # A UTF-16 byte order mark is unambiguous, so check for it first.
    if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
        return bstring.decode('utf-16'), 'utf-16'

    byte_values = set(bytes(bstring))
    # Unpacking a bytes literal yields one-character strings on Python 2
    # and ints on Python 3; either way the pieces compare correctly
    # against the members of `byte_values` and against `bstring`.
    ed_byte, c0_byte, cr_byte, lf_byte = b'\xed\xc0\r\n'

    try:
        if ed_byte not in byte_values and c0_byte not in byte_values:
            return bstring.decode('utf-8'), 'utf-8'
        # Byte 0xed encodes the codepoint range used by UTF-16
        # surrogates, which standard UTF-8 never contains, so seeing it
        # very likely means CESU-8 — the variant that encodes UTF-16
        # surrogates instead of the original characters. (Some Korean
        # characters also use 0xed, so this occasionally triggers on
        # standard UTF-8, harmlessly.)
        #
        # Byte 0xc0 is impossible in standard UTF-8: it could only
        # start an over-long encoding of a character below U+0040,
        # and UTF-8 requires the shortest representation. Java,
        # however, hides the null codepoint U+0000 as the non-standard
        # sequence 0xc0 0x80 so that 0x00 never appears in the output.
        #
        # The 'utf-8-variants' codec handles both cases, as well as
        # standard UTF-8, at the cost of a bit of speed.
        return bstring.decode('utf-8-variants'), 'utf-8-variants'
    except UnicodeDecodeError:
        pass

    # MacRoman text is recognizable by CR line breaks with no LF.
    if cr_byte in bstring and lf_byte not in bstring:
        return bstring.decode('macroman'), 'macroman'
    return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
|
||
|
||
|
||
def explain_unicode(text):
    """
    A debugging utility for mysterious Unicode: print one line per
    codepoint of `text`, showing its number in hexadecimal, its glyph
    (or a backslash escape when it isn't printable), its category in
    the Unicode standard, and its name in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 ( [Ps] LEFT PARENTHESIS
    U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 ° [So] DEGREE SIGN
    U+25A1 □ [So] WHITE SQUARE
    U+00B0 ° [So] DEGREE SIGN
    U+0029 ) [Pe] RIGHT PARENTHESIS
    U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020   [Zs] SPACE
    U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for character in text:
        # Show unprintable characters as their backslash escapes
        # instead of emitting them raw.
        if is_printable(character):
            shown = character
        else:
            shown = character.encode('unicode-escape').decode('ascii')
        print('U+{code:04X} {display:<7} [{category}] {name}'.format(
            display=shown,
            code=ord(character),
            category=unicodedata.category(character),
            name=unicodedata.name(character, '<unknown>')
        ))
|
||
|
||
|
||
def fix_bad_encoding(text):
    """
    Deprecated alias of `fix_text_encoding`, kept so that code written
    against previous versions of ftfy keeps working. Emits a
    DeprecationWarning on every call.
    """
    warnings.warn('fix_bad_encoding is now known as fix_text_encoding',
                  DeprecationWarning)
    return fix_text_encoding(text)
|