mirror of
https://github.com/moparisthebest/SickRage
synced 2024-12-13 03:22:22 -05:00
82 lines
2.4 KiB
Python
82 lines
2.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
This gives other modules access to the gritty details about characters and the
|
|
encodings that use them.
|
|
"""
|
|
|
|
from __future__ import unicode_literals
|
|
import re
|
|
import zlib
|
|
from pkg_resources import resource_string
|
|
from ftfy.compatibility import unichr
|
|
|
|
# These are the five encodings we will try to fix in ftfy, in the
|
|
# order that they should be tried.
|
|
CHARMAP_ENCODINGS = [
|
|
'latin-1',
|
|
'sloppy-windows-1252',
|
|
'macroman',
|
|
'cp437',
|
|
'sloppy-windows-1251',
|
|
]
|
|
|
|
|
|
def _build_regexes():
|
|
"""
|
|
ENCODING_REGEXES contain reasonably fast ways to detect if we
|
|
could represent a given string in a given encoding. The simplest one is
|
|
the 'ascii' detector, which of course just determines if all characters
|
|
are between U+0000 and U+007F.
|
|
"""
|
|
# Define a regex that matches ASCII text.
|
|
encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
|
|
|
|
for encoding in CHARMAP_ENCODINGS:
|
|
latin1table = ''.join(unichr(i) for i in range(128, 256))
|
|
charlist = latin1table.encode('latin-1').decode(encoding)
|
|
|
|
# Build a regex from the ASCII range, followed by the decodings of
|
|
# bytes 0x80-0xff in this character set. (This uses the fact that all
|
|
# regex special characters are ASCII, and therefore won't appear in the
|
|
# string.)
|
|
regex = '^[\x00-\x7f{0}]*$'.format(charlist)
|
|
encoding_regexes[encoding] = re.compile(regex)
|
|
return encoding_regexes
|
|
ENCODING_REGEXES = _build_regexes()
|
|
|
|
|
|
def possible_encoding(text, encoding):
|
|
"""
|
|
Given text and a single-byte encoding, check whether that text could have
|
|
been decoded from that single-byte encoding.
|
|
|
|
In other words, check whether it can be encoded in that encoding, possibly
|
|
sloppily.
|
|
"""
|
|
return bool(ENCODING_REGEXES[encoding].match(text))
|
|
|
|
|
|
CHAR_CLASS_STRING = zlib.decompress(
|
|
resource_string(__name__, 'char_classes.dat')
|
|
).decode('ascii')
|
|
|
|
def chars_to_classes(string):
|
|
"""
|
|
Convert each Unicode character to a letter indicating which of many
|
|
classes it's in.
|
|
|
|
See build_data.py for where this data comes from and what it means.
|
|
"""
|
|
return string.translate(CHAR_CLASS_STRING)
|
|
|
|
|
|
# A translate mapping that will strip all C0 control characters except
|
|
# those that represent whitespace.
|
|
CONTROL_CHARS = {}
|
|
for i in range(32):
|
|
CONTROL_CHARS[i] = None
|
|
|
|
# Map whitespace control characters to themselves.
|
|
for char in '\t\n\f\r':
|
|
del CONTROL_CHARS[ord(char)]
|