SickRage/lib/ftfy/build_data.py

"""
A script to make the char_classes.dat file.

This never needs to run in normal usage. It needs to be run if the character
classes we care about change, or if a new version of Python supports a new
Unicode standard and we want it to affect our string decoding.

The file that we generate is based on Unicode 6.1, as supported by Python 3.3.
The generated file can certainly be used in earlier versions of Python; basing
it on a single Unicode version simply makes sure that we get consistent results
from running ftfy on different versions of Python.

The file will be written to the current directory.
"""
from __future__ import unicode_literals
import unicodedata
import sys
import zlib
# Python 3 has no `unichr` builtin; alias it to `chr` so the rest of this
# module can call `unichr` on either major version.
if sys.hexversion >= 0x03000000:
    unichr = chr

# L = Latin capital letter (below U+0200 only)
# l = Latin lowercase letter, or any other non-capital Latin letter
#     (below U+0200 only)
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 0 = Math symbol (Sm)
# 1 = Currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
# ' ' (a space) = Whitespace (Z* categories, plus tab, LF, FF and CR)
# o = Other


def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    Every codepoint from U+0000 through U+10FFFF is assigned a one-character
    class code derived from its Unicode general category (see the legend in
    the comments above this function). The codes are joined into a single
    ASCII string, compressed with zlib, and written in binary mode.

    If you run this, run it in Python 3.3 or later. It will run in earlier
    versions, but you won't get the current Unicode standard, leading to
    inconsistent behavior. To protect against this, running this in the
    wrong version of Python will raise an error unless you pass
    `do_it_anyway=True`.

    Parameters:
        do_it_anyway: when true, skip the Python version check.

    Raises:
        RuntimeError: if run on Python older than 3.3 without
            `do_it_anyway=True`.
        ValueError: if a letter has a general category other than
            Lu, Lt, Ll, Lo or Lm.
    """
    if sys.hexversion < 0x03030000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.3 or later."
        )

    # Class codes for letters that are not treated as Latin, by category.
    other_letter_classes = {
        'Lu': 'A',
        'Lt': 'A',
        'Ll': 'a',
        'Lo': 'C',
        'Lm': 'm',
    }
    # Class codes for the non-letter categories that are matched exactly.
    exact_classes = {
        'No': 'N',
        'Sm': '0',
        'Sc': '1',
        'Sk': '2',
        'So': '3',
        'Cn': '_',
        'Cc': 'X',
        'Cs': 'S',
        'Co': 'P',
    }

    cclasses = [None] * 0x110000
    for codepoint in range(0x0, 0x110000):
        char = unichr(codepoint)
        category = unicodedata.category(char)

        if category.startswith('L'):  # letters
            # The name only matters below U+0200, so check the codepoint
            # first. The '' default keeps a letter with no entry in the name
            # table from aborting the whole build with ValueError.
            is_latin = (
                codepoint < 0x200
                and unicodedata.name(char, '').startswith('LATIN')
            )
            if is_latin:
                cclass = 'L' if category == 'Lu' else 'l'
            else:  # non-Latin letter, or close enough
                cclass = other_letter_classes.get(category)
                if cclass is None:
                    raise ValueError('got some weird kind of letter')
        elif category.startswith('M'):  # marks
            cclass = 'M'
        elif category in exact_classes:
            cclass = exact_classes[category]
        elif category.startswith('Z'):  # separators
            cclass = ' '
        else:
            cclass = 'o'
        cclasses[codepoint] = cclass

    # Tab, LF, FF and CR are control characters, but are classed as
    # whitespace rather than 'X'.
    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '

    # Build the payload before opening the file, so a failure here does not
    # leave a truncated 'char_classes.dat' behind.
    data = zlib.compress(''.join(cclasses).encode('ascii'))
    with open('char_classes.dat', 'wb') as out:
        out.write(data)

# Running this module as a script regenerates 'char_classes.dat' in the
# current directory. No arguments are passed, so the Python version check
# in make_char_data_file() applies.
if __name__ == '__main__':
    make_char_data_file()