# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you
This is a module for making text less broken. See the `fix_text` function
for more information.
"""
from __future__ import unicode_literals
# See the docstring for ftfy.bad_codecs to see what we're doing here.
import ftfy.bad_codecs
ftfy.bad_codecs.ok()
from ftfy import fixes
from ftfy.fixes import fix_text_encoding
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
import unicodedata
import warnings


def fix_text(text,
remove_unsafe_private_use=(not PYTHON34_OR_LATER),
fix_entities='auto',
remove_terminal_escapes=True,
fix_encoding=True,
normalization='NFKC',
uncurl_quotes=True,
fix_line_breaks=True,
remove_control_chars=True,
remove_bom=True,
max_decode_length=2**16):
r"""
Given Unicode text as input, make its representation consistent and
possibly less broken.
Let's start with some examples:
    >>> print(fix_text('uÌˆnicode'))
    ünicode
    >>> print(fix_text('Broken text&hellip; it&#x2019;s flubberific!'))
    Broken text... it's flubberific!
    >>> print(fix_text('HTML entities &lt;3'))
    HTML entities <3
    >>> print(fix_text('<em>HTML entities &lt;3</em>'))
    <em>HTML entities &lt;3</em>
    >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
    ...                'doo&#133;\033[0m'))
    I'm blue, da ba dee da ba doo...
>>> # This example string starts with a byte-order mark, even if
>>> # you can't see it on the Web.
>>> print(fix_text('\ufeffParty like\nit’s 1999!'))
Party like
it's 1999!
    >>> len(fix_text('ﬁ' * 100000))
    200000
>>> len(fix_text(''))
0

    Based on the options you provide, ftfy applies these steps in order:

- If `remove_unsafe_private_use` is True, it removes a range of private-use
characters that could trigger a Python bug. The bug is fixed in
      the most recent versions of Python, so this option defaults to False
      on Python 3.4 and later.
- If `fix_entities` is True, replace HTML entities with their equivalent
characters. If it's "auto" (the default), then consider replacing HTML
entities, but don't do so in text where you have seen a pair of actual
angle brackets (that's probably actually HTML and you shouldn't mess
with the entities).
- If `remove_terminal_escapes` is True, remove sequences of bytes that are
instructions for Unix terminals, such as the codes that make text appear
in different colors.
- If `fix_encoding` is True, look for common mistakes that come from
encoding or decoding Unicode text incorrectly, and fix them if they are
reasonably fixable. See `fix_text_encoding` for details.
- If `normalization` is not None, apply the specified form of Unicode
normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
The default, 'NFKC', applies the following relevant transformations:
- C: Combine characters and diacritics that are written using separate
code points, such as converting "e" plus an acute accent modifier
into "é", or converting "ka" (か) plus a dakuten into the
single character "ga" (が).
- K: Replace characters that are functionally equivalent with the most
common form. For example, half-width katakana will be replaced with
full-width versions, full-width Roman characters will be replaced with
ASCII characters, ellipsis characters will be replaced with three
        periods, and the ligature 'ﬂ' will be replaced with 'fl'.
- If `uncurl_quotes` is True, replace various curly quotation marks with
plain-ASCII straight quotes.
- If `fix_line_breaks` is true, convert all line breaks to Unix style
(CRLF and CR line breaks become LF line breaks).
    - If `remove_control_chars` is true, remove all C0 control characters
except the common useful ones: TAB, CR, LF, and FF. (CR characters
may have already been removed by the `fix_line_breaks` step.)
- If `remove_bom` is True, remove the Byte-Order Mark if it exists.
- If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".
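
    For example, the idempotence rule and the NFKC step described above
    imply the following results (illustrative doctests, derived only from
    the behavior this list documents):

    >>> print(fix_text('&amp;amp;'))
    &
    >>> print(fix_text('ﬂuffy'))
    fluffy
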
`fix_text` will work one line at a time, with the possibility that some
lines are in different encodings. When it encounters lines longer than
`max_decode_length`, it will not run the `fix_encoding` step, to avoid
unbounded slowdowns.
If you are certain your entire text is in the same encoding (though that
encoding is possibly flawed), and do not mind performing operations on
the whole text at once, use `fix_text_segment`.
"""
if isinstance(text, bytes):
raise UnicodeError(fixes.BYTES_ERROR_TEXT)
out = []
pos = 0
while pos < len(text):
textbreak = text.find('\n', pos) + 1
fix_encoding_this_time = fix_encoding
if textbreak == 0:
textbreak = len(text)
if (textbreak - pos) > max_decode_length:
fix_encoding_this_time = False
substring = text[pos:textbreak]
if fix_entities == 'auto' and '<' in substring and '>' in substring:
# we see angle brackets together; this could be HTML
fix_entities = False
out.append(
fix_text_segment(
substring,
remove_unsafe_private_use=remove_unsafe_private_use,
fix_entities=fix_entities,
remove_terminal_escapes=remove_terminal_escapes,
fix_encoding=fix_encoding_this_time,
normalization=normalization,
uncurl_quotes=uncurl_quotes,
fix_line_breaks=fix_line_breaks,
remove_control_chars=remove_control_chars,
remove_bom=remove_bom
)
)
pos = textbreak
return ''.join(out)


ftfy = fix_text


def fix_file(input_file,
remove_unsafe_private_use=True,
fix_entities='auto',
remove_terminal_escapes=True,
fix_encoding=True,
normalization='NFKC',
uncurl_quotes=True,
fix_line_breaks=True,
remove_control_chars=True,
remove_bom=True):
"""
Fix text that is found in a file.
If the file is being read as Unicode text, use that. If it's being read as
bytes, then unfortunately, we have to guess what encoding it is. We'll try
a few common encodings, but we make no promises. See the `guess_bytes`
function for how this is done.
The output is a stream of fixed lines of text.
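
    As an illustrative sketch, here is `fix_file` on an in-memory text file
    (the input reuses the `&hellip;` example from `fix_text`, so the
    expected output follows from that documentation):

    >>> import io
    >>> print(''.join(fix_file(io.StringIO('Broken text&hellip;'))))
    Broken text...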
"""
entities = fix_entities
for line in input_file:
if isinstance(line, bytes):
line, encoding = guess_bytes(line)
if fix_entities == 'auto' and '<' in line and '>' in line:
entities = False
yield fix_text_segment(
line,
remove_unsafe_private_use=remove_unsafe_private_use,
fix_entities=entities,
remove_terminal_escapes=remove_terminal_escapes,
fix_encoding=fix_encoding,
normalization=normalization,
uncurl_quotes=uncurl_quotes,
fix_line_breaks=fix_line_breaks,
remove_control_chars=remove_control_chars,
remove_bom=remove_bom
)


def fix_text_segment(text,
remove_unsafe_private_use=True,
fix_entities='auto',
remove_terminal_escapes=True,
fix_encoding=True,
normalization='NFKC',
uncurl_quotes=True,
fix_line_breaks=True,
remove_control_chars=True,
remove_bom=True):
"""
Apply fixes to text in a single chunk. This could be a line of text
within a larger run of `fix_text`, or it could be a larger amount
of text that you are certain is all in the same encoding.
See `fix_text` for a description of the parameters.
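
    An illustrative example, reusing fixes documented under `fix_text`
    (here, uncurling a curly apostrophe):

    >>> print(fix_text_segment('The Mona Lisa doesn\u2019t have eyebrows.'))
    The Mona Lisa doesn't have eyebrows.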
"""
if isinstance(text, bytes):
raise UnicodeError(fixes.BYTES_ERROR_TEXT)
if fix_entities == 'auto' and '<' in text and '>' in text:
fix_entities = False
while True:
origtext = text
if remove_unsafe_private_use:
text = fixes.remove_unsafe_private_use(text)
if fix_entities:
text = fixes.unescape_html(text)
if remove_terminal_escapes:
text = fixes.remove_terminal_escapes(text)
if fix_encoding:
text = fixes.fix_text_encoding(text)
if normalization is not None:
text = unicodedata.normalize(normalization, text)
if uncurl_quotes:
text = fixes.uncurl_quotes(text)
if fix_line_breaks:
text = fixes.fix_line_breaks(text)
if remove_control_chars:
text = fixes.remove_control_chars(text)
if remove_bom:
text = fixes.remove_bom(text)
if text == origtext:
return text


def guess_bytes(bstring):
    r"""
If you have some bytes in an unknown encoding, here's a reasonable
strategy for decoding them, by trying a few common encodings that
can be distinguished from each other.
This is not a magic bullet. If the bytes are coming from some MySQL
database with the "character set" set to ISO Elbonian, this won't figure
it out. Perhaps more relevantly, this currently doesn't try East Asian
encodings.
The encodings we try are:
- UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
like nothing else
- UTF-8, because it's the global de facto standard
- "utf-8-variants", because it's what people actually implement when they
think they're doing UTF-8
- MacRoman, because Microsoft Office thinks it's still a thing, and it
can be distinguished by its line breaks. (If there are no line breaks in
the string, though, you're out of luck.)
- "sloppy-windows-1252", the Latin-1-like encoding that is the most common
single-byte encoding
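
    A few illustrative calls, following the strategy above. The byte strings
    are made-up examples (a UTF-16 byte order mark, a UTF-8 snowman, and a
    MacRoman 'é' next to an old-Mac line break); only the guessed encoding
    is shown:

    >>> print(guess_bytes(b'\xff\xfeh\x00i\x00')[1])
    utf-16
    >>> print(guess_bytes(b'\xe2\x98\x83')[1])
    utf-8
    >>> print(guess_bytes(b'caf\x8e\r')[1])
    macroman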
"""
if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
return bstring.decode('utf-16'), 'utf-16'
byteset = set(bytes(bstring))
byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'
try:
if byte_ed in byteset or byte_c0 in byteset:
# Byte 0xed can be used to encode a range of codepoints that
# are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
# so when we see 0xed, it's very likely we're being asked to
# decode CESU-8, the variant that encodes UTF-16 surrogates
# instead of the original characters themselves.
#
# This will occasionally trigger on standard UTF-8, as there
# are some Korean characters that also use byte 0xed, but that's
# not harmful.
#
# Byte 0xc0 is impossible because, numerically, it would only
# encode characters lower than U+0040. Those already have
# single-byte representations, and UTF-8 requires using the
# shortest possible representation. However, Java hides the null
# codepoint, U+0000, in a non-standard longer representation -- it
# encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
# will never appear in the encoded bytes.
#
# The 'utf-8-variants' decoder can handle both of these cases, as
# well as standard UTF-8, at the cost of a bit of speed.
return bstring.decode('utf-8-variants'), 'utf-8-variants'
else:
return bstring.decode('utf-8'), 'utf-8'
except UnicodeDecodeError:
pass
if byte_CR in bstring and byte_LF not in bstring:
return bstring.decode('macroman'), 'macroman'
else:
return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'


def explain_unicode(text):
"""
A utility method that's useful for debugging mysterious Unicode.
It breaks down a string, showing you for each codepoint its number in
hexadecimal, its glyph, its category in the Unicode standard, and its name
in the Unicode standard.
>>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 (       [Ps] LEFT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 °       [So] DEGREE SIGN
    U+25A1 □       [So] WHITE SQUARE
    U+00B0 °       [So] DEGREE SIGN
    U+0029 )       [Pe] RIGHT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵       [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020         [Zs] SPACE
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
"""
for char in text:
if is_printable(char):
display = char
else:
display = char.encode('unicode-escape').decode('ascii')
print('U+{code:04X} {display:<7} [{category}] {name}'.format(
display=display,
code=ord(char),
category=unicodedata.category(char),
name=unicodedata.name(char, '')
))


def fix_bad_encoding(text):
"""
Kept for compatibility with previous versions of ftfy.
"""
warnings.warn(
'fix_bad_encoding is now known as fix_text_encoding',
DeprecationWarning
)
return fix_text_encoding(text)