SickRage/lib/hachoir_core/i18n.py

215 lines
6.1 KiB
Python

# -*- coding: UTF-8 -*-
"""
Functions to manage internationalisation (i18n):
- initLocale(): setup locales and install Unicode compatible stdout and
stderr ;
- getTerminalCharset(): guess terminal charset ;
- gettext(text) translate a string to current language. The function always
returns Unicode string. You can also use the alias: _() ;
- ngettext(singular, plural, count): translate a sentence with singular and
plural form. The function always returns Unicode string.
WARNING: Loading this module indirectly calls initLocale() which sets
locale LC_ALL to ''. This is needed to get user preferred locale
settings.
"""
import lib.hachoir_core.config as config
import lib.hachoir_core
import locale
from os import path
import sys
from codecs import BOM_UTF8, BOM_UTF16_LE, BOM_UTF16_BE
def _getTerminalCharset():
"""
Function used by getTerminalCharset() to get terminal charset.
@see getTerminalCharset()
"""
# (1) Try locale.getpreferredencoding()
try:
charset = locale.getpreferredencoding()
if charset:
return charset
except (locale.Error, AttributeError):
pass
# (2) Try locale.nl_langinfo(CODESET)
try:
charset = locale.nl_langinfo(locale.CODESET)
if charset:
return charset
except (locale.Error, AttributeError):
pass
# (3) Try sys.stdout.encoding
if hasattr(sys.stdout, "encoding") and sys.stdout.encoding:
return sys.stdout.encoding
# (4) Otherwise, returns "ASCII"
return "ASCII"
def getTerminalCharset():
"""
Guess terminal charset using differents tests:
1. Try locale.getpreferredencoding()
2. Try locale.nl_langinfo(CODESET)
3. Try sys.stdout.encoding
4. Otherwise, returns "ASCII"
WARNING: Call initLocale() before calling this function.
"""
try:
return getTerminalCharset.value
except AttributeError:
getTerminalCharset.value = _getTerminalCharset()
return getTerminalCharset.value
class UnicodeStdout(object):
def __init__(self, old_device, charset):
self.device = old_device
self.charset = charset
def flush(self):
self.device.flush()
def write(self, text):
if isinstance(text, unicode):
text = text.encode(self.charset, 'replace')
self.device.write(text)
def writelines(self, lines):
for text in lines:
self.write(text)
def initLocale():
# Only initialize locale once
if initLocale.is_done:
return getTerminalCharset()
initLocale.is_done = True
# Setup locales
try:
locale.setlocale(locale.LC_ALL, "")
except (locale.Error, IOError):
pass
# Get the terminal charset
charset = getTerminalCharset()
# UnicodeStdout conflicts with the readline module
if config.unicode_stdout and ('readline' not in sys.modules):
# Replace stdout and stderr by unicode objet supporting unicode string
sys.stdout = UnicodeStdout(sys.stdout, charset)
sys.stderr = UnicodeStdout(sys.stderr, charset)
return charset
initLocale.is_done = False
def _dummy_gettext(text):
return unicode(text)
def _dummy_ngettext(singular, plural, count):
if 1 < abs(count) or not count:
return unicode(plural)
else:
return unicode(singular)
def _initGettext():
charset = initLocale()
# Try to load gettext module
if config.use_i18n:
try:
import gettext
ok = True
except ImportError:
ok = False
else:
ok = False
# gettext is not available or not needed: use dummy gettext functions
if not ok:
return (_dummy_gettext, _dummy_ngettext)
# Gettext variables
package = lib.hachoir_core.PACKAGE
locale_dir = path.join(path.dirname(__file__), "..", "locale")
# Initialize gettext module
gettext.bindtextdomain(package, locale_dir)
gettext.textdomain(package)
translate = gettext.gettext
ngettext = gettext.ngettext
# TODO: translate_unicode lambda function really sucks!
# => find native function to do that
unicode_gettext = lambda text: \
unicode(translate(text), charset)
unicode_ngettext = lambda singular, plural, count: \
unicode(ngettext(singular, plural, count), charset)
return (unicode_gettext, unicode_ngettext)
UTF_BOMS = (
(BOM_UTF8, "UTF-8"),
(BOM_UTF16_LE, "UTF-16-LE"),
(BOM_UTF16_BE, "UTF-16-BE"),
)
# Set of valid characters for specific charset
CHARSET_CHARACTERS = (
# U+00E0: LATIN SMALL LETTER A WITH GRAVE
(set(u"©®éêè\xE0ç".encode("ISO-8859-1")), "ISO-8859-1"),
(set(u"©®éêè\xE0ç€".encode("ISO-8859-15")), "ISO-8859-15"),
(set(u"©®".encode("MacRoman")), "MacRoman"),
(set(u"εδηιθκμοΡσςυΈί".encode("ISO-8859-7")), "ISO-8859-7"),
)
def guessBytesCharset(bytes, default=None):
r"""
>>> guessBytesCharset("abc")
'ASCII'
>>> guessBytesCharset("\xEF\xBB\xBFabc")
'UTF-8'
>>> guessBytesCharset("abc\xC3\xA9")
'UTF-8'
>>> guessBytesCharset("File written by Adobe Photoshop\xA8 4.0\0")
'MacRoman'
>>> guessBytesCharset("\xE9l\xE9phant")
'ISO-8859-1'
>>> guessBytesCharset("100 \xA4")
'ISO-8859-15'
>>> guessBytesCharset('Word \xb8\xea\xe4\xef\xf3\xe7 - Microsoft Outlook 97 - \xd1\xf5\xe8\xec\xdf\xf3\xe5\xe9\xf2 e-mail')
'ISO-8859-7'
"""
# Check for UTF BOM
for bom_bytes, charset in UTF_BOMS:
if bytes.startswith(bom_bytes):
return charset
# Pure ASCII?
try:
text = unicode(bytes, 'ASCII', 'strict')
return 'ASCII'
except UnicodeDecodeError:
pass
# Valid UTF-8?
try:
text = unicode(bytes, 'UTF-8', 'strict')
return 'UTF-8'
except UnicodeDecodeError:
pass
# Create a set of non-ASCII characters
non_ascii_set = set( byte for byte in bytes if ord(byte) >= 128 )
for characters, charset in CHARSET_CHARACTERS:
if characters.issuperset(non_ascii_set):
return charset
return default
# Initialize _(), gettext() and ngettext() functions
gettext, ngettext = _initGettext()
_ = gettext