Server line text-encoding-related fixes.

- Handle server lines that contain sequences which are invalid in the server encoding. Previously, such a sequence caused the whole line to be interpreted as ISO-8859-1; now only the invalid sequences are replaced, using the Unicode replacement character (U+FFFD).

- Removed prefs.utf8_locale.

- Change default server encoding from system locale to UTF-8.

- Always populate server->encoding with a non-null value, defaulting to UTF-8.

Fixes #1198
This commit is contained in:
Arnavion 2015-01-18 02:10:04 -08:00
parent 5569205d15
commit 5749c53484
7 changed files with 129 additions and 326 deletions

View File

@ -505,29 +505,11 @@ dcc_chat_line (struct DCC *dcc, char *line)
session *sess; session *sess;
char *word[PDIWORDS]; char *word[PDIWORDS];
char *po; char *po;
char *utf;
char *conv;
int ret, i; int ret, i;
gssize len;
gsize utf_len;
char portbuf[32]; char portbuf[32];
message_tags_data no_tags = MESSAGE_TAGS_DATA_INIT; message_tags_data no_tags = MESSAGE_TAGS_DATA_INIT;
len = strlen (line); line = text_invalid_encoding_to_utf8 (line, -1, dcc->serv->encoding, NULL);
if (dcc->serv->encoding == NULL) /* system */
utf = g_locale_to_utf8 (line, len, NULL, &utf_len, NULL);
else
utf = g_convert (line, len, "UTF-8", dcc->serv->encoding, 0, &utf_len, 0);
if (utf)
{
line = utf;
len = utf_len;
}
/* we really need valid UTF-8 now */
conv = text_validate (&line, &len);
sess = find_dialog (dcc->serv, dcc->nick); sess = find_dialog (dcc->serv, dcc->nick);
if (!sess) if (!sess)
@ -548,16 +530,14 @@ dcc_chat_line (struct DCC *dcc, char *line)
/* did the plugin close it? */ /* did the plugin close it? */
if (!g_slist_find (dcc_list, dcc)) if (!g_slist_find (dcc_list, dcc))
{ {
g_free (utf); g_free (line);
g_free (conv);
return 1; return 1;
} }
/* did the plugin eat the event? */ /* did the plugin eat the event? */
if (ret) if (ret)
{ {
g_free (utf); g_free (line);
g_free (conv);
return 0; return 0;
} }
@ -574,8 +554,7 @@ dcc_chat_line (struct DCC *dcc, char *line)
{ {
inbound_privmsg (dcc->serv, dcc->nick, "", line, FALSE, &no_tags); inbound_privmsg (dcc->serv, dcc->nick, "", line, FALSE, &no_tags);
} }
g_free (utf); g_free (line);
g_free (conv);
return 0; return 0;
} }

View File

@ -757,7 +757,6 @@ static void
xchat_init (void) xchat_init (void)
{ {
char buf[3068]; char buf[3068];
const char *cs = NULL;
#ifdef WIN32 #ifdef WIN32
WSADATA wsadata; WSADATA wsadata;
@ -795,9 +794,6 @@ xchat_init (void)
#endif #endif
#endif #endif
if (g_get_charset (&cs))
prefs.utf8_locale = TRUE;
load_text_events (); load_text_events ();
sound_load (); sound_load ();
notify_load (); notify_load ();

View File

@ -317,7 +317,6 @@ struct hexchatprefs
guint32 dcc_ip; guint32 dcc_ip;
unsigned int wait_on_exit; /* wait for logs to be flushed to disk IF we're connected */ unsigned int wait_on_exit; /* wait for logs to be flushed to disk IF we're connected */
unsigned int utf8_locale;
/* Tells us if we need to save, only when they've been edited. /* Tells us if we need to save, only when they've been edited.
This is so that we continue using internal defaults (which can This is so that we continue using internal defaults (which can

View File

@ -970,9 +970,7 @@ hexchat_printf (hexchat_plugin *ph, const char *format, ...)
void void
hexchat_command (hexchat_plugin *ph, const char *command) hexchat_command (hexchat_plugin *ph, const char *command)
{ {
char *command_nonconst; char *command_utf8;
char *conv;
gssize len = -1;
if (!is_session (ph->context)) if (!is_session (ph->context))
{ {
@ -981,11 +979,9 @@ hexchat_command (hexchat_plugin *ph, const char *command)
} }
/* scripts/plugins continue to send non-UTF8... *sigh* */ /* scripts/plugins continue to send non-UTF8... *sigh* */
command_nonconst = g_strdup (command); command_utf8 = text_invalid_encoding_to_utf8 (command, -1, "UTF-8", NULL);
conv = text_validate (&command_nonconst, &len); handle_command (ph->context, command_utf8, FALSE);
handle_command (ph->context, command_nonconst, FALSE); g_free (command_utf8);
g_free (conv);
g_free (command_nonconst);
} }
void void

View File

@ -89,48 +89,18 @@ int
tcp_send_real (void *ssl, int sok, char *encoding, char *buf, int len) tcp_send_real (void *ssl, int sok, char *encoding, char *buf, int len)
{ {
int ret; int ret;
char *locale;
gsize loc_len;
if (encoding == NULL) /* system */ gsize buf_encoded_len;
{ gchar *buf_encoded = text_invalid_utf8_to_encoding (buf, len, encoding, &buf_encoded_len);
locale = NULL; #ifdef USE_OPENSSL
if (!prefs.utf8_locale) if (!ssl)
{ ret = send (sok, buf_encoded, buf_encoded_len, 0);
const gchar *charset;
g_get_charset (&charset);
locale = g_convert_with_fallback (buf, len, charset, "UTF-8", "?", 0, &loc_len, 0);
}
}
else else
{ ret = _SSL_send (ssl, buf_encoded, buf_encoded_len);
locale = g_convert_with_fallback (buf, len, encoding, "UTF-8", "?", 0, &loc_len, 0);
}
if (locale)
{
len = loc_len;
#ifdef USE_OPENSSL
if (!ssl)
ret = send (sok, locale, len, 0);
else
ret = _SSL_send (ssl, locale, len);
#else #else
ret = send (sok, locale, len, 0); ret = send (sok, buf_encoded, buf_encoded_len, 0);
#endif #endif
g_free (locale); g_free (buf_encoded);
} else
{
#ifdef USE_OPENSSL
if (!ssl)
ret = send (sok, buf, len, 0);
else
ret = _SSL_send (ssl, buf, len);
#else
ret = send (sok, buf, len, 0);
#endif
}
return ret; return ret;
} }
@ -287,94 +257,15 @@ close_socket (int sok)
static void static void
server_inline (server *serv, char *line, gssize len) server_inline (server *serv, char *line, gssize len)
{ {
char *utf_line_allocated = NULL; gsize len_utf8;
line = text_invalid_encoding_to_utf8 (line, len, serv->encoding, &len_utf8);
/* Checks whether we're set to use UTF-8 charset */ fe_add_rawlog (serv, line, len_utf8, FALSE);
if ((serv->encoding == NULL && prefs.utf8_locale) /* Using system default - UTF-8 */ ||
g_ascii_strcasecmp (serv->encoding, "UTF8") == 0 ||
g_ascii_strcasecmp (serv->encoding, "UTF-8") == 0
)
{
utf_line_allocated = text_validate (&line, &len);
}
else
{
/* Since the user has an explicit charset set, either
via /charset command or from his non-UTF8 locale,
we don't fallback to ISO-8859-1 and instead try to remove
errnoeous octets till the string is convertable in the
said charset. */
const char *encoding = NULL;
if (serv->encoding != NULL)
encoding = serv->encoding;
else
g_get_charset (&encoding);
if (encoding != NULL)
{
char *conv_line; /* holds a copy of the original string */
gsize conv_len; /* tells g_convert how much of line to convert */
gsize utf_len;
gsize read_len;
GError *err;
gboolean retry;
conv_line = g_malloc (len + 1);
memcpy (conv_line, line, len);
conv_line[len] = 0;
conv_len = len;
/* if CP1255, convert it with the NUL terminator.
Works around SF bug #1122089 */
if (serv->using_cp1255)
conv_len++;
do
{
err = NULL;
retry = FALSE;
utf_line_allocated = g_convert_with_fallback (conv_line, conv_len, "UTF-8", encoding, "?", &read_len, &utf_len, &err);
if (err != NULL)
{
if (err->code == G_CONVERT_ERROR_ILLEGAL_SEQUENCE && conv_len > (read_len + 1))
{
/* Make our best bet by removing the erroneous char.
This will work for casual 8-bit strings with non-standard chars. */
memmove (conv_line + read_len, conv_line + read_len + 1, conv_len - read_len -1);
conv_len--;
retry = TRUE;
}
g_error_free (err);
}
} while (retry);
g_free (conv_line);
/* If any conversion has occured at all. Conversion might fail
due to errors other than invalid sequences, e.g. unknown charset. */
if (utf_line_allocated != NULL)
{
line = utf_line_allocated;
len = utf_len;
if (serv->using_cp1255 && len > 0)
len--;
}
else
{
/* If all fails, treat as UTF-8 with fallback to ISO-8859-1. */
utf_line_allocated = text_validate (&line, &len);
}
}
}
fe_add_rawlog (serv, line, len, FALSE);
/* let proto-irc.c handle it */ /* let proto-irc.c handle it */
serv->p_inline (serv, line, len); serv->p_inline (serv, line, len_utf8);
g_free (utf_line_allocated); g_free (line);
} }
/* read data from socket */ /* read data from socket */
@ -1749,12 +1640,7 @@ server_set_encoding (server *serv, char *new_encoding)
{ {
char *space; char *space;
if (serv->encoding) g_free (serv->encoding);
{
g_free (serv->encoding);
/* can be left as NULL to indicate system encoding */
serv->encoding = NULL;
}
if (new_encoding) if (new_encoding)
{ {
@ -1772,6 +1658,10 @@ server_set_encoding (server *serv, char *new_encoding)
serv->encoding = g_strdup ("UTF-8"); serv->encoding = g_strdup ("UTF-8");
} }
} }
else
{
serv->encoding = g_strdup ("UTF-8");
}
} }
server * server *
@ -1816,6 +1706,8 @@ server_set_defaults (server *serv)
serv->nick_prefixes = g_strdup ("@%+"); serv->nick_prefixes = g_strdup ("@%+");
serv->nick_modes = g_strdup ("ohv"); serv->nick_modes = g_strdup ("ohv");
server_set_encoding (serv, "UTF-8");
serv->nickcount = 1; serv->nickcount = 1;
serv->end_of_motd = FALSE; serv->end_of_motd = FALSE;
serv->is_away = FALSE; serv->is_away = FALSE;

View File

@ -658,33 +658,29 @@ log_open_or_close (session *sess)
int int
get_stamp_str (char *fmt, time_t tim, char **ret) get_stamp_str (char *fmt, time_t tim, char **ret)
{ {
char *loc = NULL;
char dest[128]; char dest[128];
gsize len; gsize len_locale;
gsize len_utf8;
/* strftime wants the format string in LOCALE! */ /* strftime requires the format string to be in locale encoding. */
if (!prefs.utf8_locale) fmt = g_locale_from_utf8 (fmt, -1, NULL, NULL, NULL);
len_locale = strftime_validated (dest, sizeof (dest), fmt, localtime (&tim));
g_free (fmt);
if (len_locale == 0)
{ {
const gchar *charset; return 0;
g_get_charset (&charset);
loc = g_convert_with_fallback (fmt, -1, charset, "UTF-8", "?", 0, 0, 0);
if (loc)
fmt = loc;
} }
len = strftime_validated (dest, sizeof (dest), fmt, localtime (&tim)); *ret = g_locale_to_utf8 (dest, len_locale, NULL, &len_utf8, NULL);
if (len) if (*ret == NULL)
{ {
if (prefs.utf8_locale) return 0;
*ret = g_strdup (dest);
else
*ret = g_locale_to_utf8 (dest, len, 0, &len, 0);
} }
g_free (loc); return len_utf8;
return len;
} }
static void static void
@ -753,154 +749,101 @@ log_write (session *sess, char *text, time_t ts)
g_free (temp); g_free (temp);
} }
/* converts a CP1252/ISO-8859-1(5) hybrid to UTF-8 */ /**
/* Features: 1. It never fails, all 00-FF chars are converted to valid UTF-8 */ * Converts a given string in from_encoding to to_encoding. This is similar to g_convert_with_fallback, except that it is tolerant of sequences in
/* 2. Uses CP1252 in the range 80-9f because ISO doesn't have any- */ * the original input that are invalid even in from_encoding. g_convert_with_fallback fails for such text, whereas this function replaces such a
/* thing useful in this range and it helps us receive from mIRC */ * sequence with the fallback string.
/* 3. The five undefined chars in CP1252 80-9f are replaced with */ *
/* ISO-8859-15 control codes. */ * If len is -1, strlen(text) is used to calculate the length. Do not pass -1 if text is supposed to contain \0 bytes, such as if from_encoding is a
/* 4. Handles 0xa4 as a Euro symbol ala ISO-8859-15. */ * multi-byte encoding like UTF-16.
/* 5. Uses ISO-8859-1 (which matches CP1252) for everything else. */ */
/* 6. This routine measured 3x faster than g_convert :) */ static gchar *
text_convert_invalid (const gchar* text, gssize len, const gchar *to_encoding, const gchar *from_encoding, const gchar *fallback, gsize *len_out)
static unsigned char *
iso_8859_1_to_utf8 (unsigned char *text, int len, gsize *bytes_written)
{ {
unsigned int idx; gchar *result_part;
unsigned char *res, *output; gsize result_part_len;
static const unsigned short lowtable[] = /* 74 byte table for 80-a4 */ const gchar *end;
{ gsize invalid_start_pos;
/* compressed utf-8 table: if the first byte's 0x20 bit is set, it GString *result;
indicates a 2-byte utf-8 sequence, otherwise prepend a 0xe2. */ const gchar *current_start;
0x82ac, /* 80 Euro. CP1252 from here on... */
0xe281, /* 81 NA */
0x809a, /* 82 */
0xe692, /* 83 */
0x809e, /* 84 */
0x80a6, /* 85 */
0x80a0, /* 86 */
0x80a1, /* 87 */
0xeb86, /* 88 */
0x80b0, /* 89 */
0xe5a0, /* 8a */
0x80b9, /* 8b */
0xe592, /* 8c */
0xe28d, /* 8d NA */
0xe5bd, /* 8e */
0xe28f, /* 8f NA */
0xe290, /* 90 NA */
0x8098, /* 91 */
0x8099, /* 92 */
0x809c, /* 93 */
0x809d, /* 94 */
0x80a2, /* 95 */
0x8093, /* 96 */
0x8094, /* 97 */
0xeb9c, /* 98 */
0x84a2, /* 99 */
0xe5a1, /* 9a */
0x80ba, /* 9b */
0xe593, /* 9c */
0xe29d, /* 9d NA */
0xe5be, /* 9e */
0xe5b8, /* 9f */
0xe2a0, /* a0 */
0xe2a1, /* a1 */
0xe2a2, /* a2 */
0xe2a3, /* a3 */
0x82ac /* a4 ISO-8859-15 Euro. */
};
if (len == -1) if (len == -1)
len = strlen (text);
/* worst case scenario: every byte turns into 3 bytes */
res = output = g_malloc ((len * 3) + 1);
while (len)
{ {
if (G_LIKELY (*text < 0x80)) len = strlen (text);
{
*output = *text; /* ascii maps directly */
}
else if (*text <= 0xa4) /* 80-a4 use a lookup table */
{
idx = *text - 0x80;
if (lowtable[idx] & 0x2000)
{
*output++ = (lowtable[idx] >> 8) & 0xdf; /* 2 byte utf-8 */
*output = lowtable[idx] & 0xff;
}
else
{
*output++ = 0xe2; /* 3 byte utf-8 */
*output++ = (lowtable[idx] >> 8) & 0xff;
*output = lowtable[idx] & 0xff;
}
}
else if (*text < 0xc0)
{
*output++ = 0xc2;
*output = *text;
}
else
{
*output++ = 0xc3;
*output = *text - 0x40;
}
output++;
text++;
len--;
} }
*output = 0; /* terminate */
*bytes_written = output - res;
return res; end = text + len;
/* Find the first position of an invalid sequence. */
result_part = g_convert (text, len, to_encoding, from_encoding, &invalid_start_pos, &result_part_len, NULL);
if (result_part != NULL)
{
/* All text converted successfully on the first try. Return it. */
if (len_out != NULL)
{
*len_out = result_part_len;
}
return result_part;
}
/* One or more invalid sequences exist that need to be replaced with the fallback. */
result = g_string_sized_new (len);
current_start = text;
for (;;)
{
g_assert (current_start + invalid_start_pos < end);
/* Convert everything before the position of the invalid sequence. It should be successful. */
result_part = g_convert (current_start, invalid_start_pos, to_encoding, from_encoding, &invalid_start_pos, &result_part_len, NULL);
g_assert (result_part != NULL);
g_string_append_len (result, result_part, result_part_len);
g_free (result_part);
/* Append the fallback */
g_string_append (result, fallback);
/* Now try converting everything after the invalid sequence. */
current_start += invalid_start_pos + 1;
result_part = g_convert (current_start, end - current_start, to_encoding, from_encoding, &invalid_start_pos, &result_part_len, NULL);
if (result_part != NULL)
{
/* The rest of the text converted successfully. Append it and return the whole converted text. */
g_string_append_len (result, result_part, result_part_len);
g_free (result_part);
if (len_out != NULL)
{
*len_out = result->len;
}
return g_string_free (result, FALSE);
}
/* The rest of the text didn't convert successfully. invalid_start_pos has the position of the next invalid sequence. */
}
} }
char * gchar *
text_validate (char **text, gssize *len) text_invalid_utf8_to_encoding (const gchar* text, gssize len, const gchar *to_encoding, gsize *len_out)
{ {
char *utf; return text_convert_invalid (text, len, to_encoding, "UTF-8", "?", len_out);
gsize utf_len; }
/* valid utf8? */ gchar *
if (g_utf8_validate (*text, *len, 0)) text_invalid_encoding_to_utf8 (const gchar* text, gssize len, const gchar *from_encoding, gsize *len_out)
return NULL; {
return text_convert_invalid (text, len, "UTF-8", from_encoding, "\357\277\275", len_out);
#ifdef WIN32
if (GetACP () == 1252) /* our routine is better than iconv's 1252 */
#else
if (prefs.utf8_locale)
#endif
/* fallback to iso-8859-1 */
utf = iso_8859_1_to_utf8 (*text, *len, &utf_len);
else
{
/* fallback to locale */
utf = g_locale_to_utf8 (*text, *len, 0, &utf_len, NULL);
if (!utf)
utf = iso_8859_1_to_utf8 (*text, *len, &utf_len);
}
if (!utf)
{
*text = g_strdup ("%INVALID%");
*len = 9;
} else
{
*text = utf;
*len = utf_len;
}
return utf;
} }
void void
PrintTextTimeStamp (session *sess, char *text, time_t timestamp) PrintTextTimeStamp (session *sess, char *text, time_t timestamp)
{ {
char *conv;
if (!sess) if (!sess)
{ {
if (!sess_list) if (!sess_list)
@ -909,22 +852,19 @@ PrintTextTimeStamp (session *sess, char *text, time_t timestamp)
} }
/* make sure it's valid utf8 */ /* make sure it's valid utf8 */
if (text[0] == 0) if (text[0] == '\0')
{ {
text = "\n"; text = g_strdup ("\n");
conv = NULL;
} }
else else
{ {
gssize len = -1; text = text_invalid_encoding_to_utf8 (text, -1, "UTF-8", NULL);
conv = text_validate ((char **)&text, &len);
} }
log_write (sess, text, timestamp); log_write (sess, text, timestamp);
scrollback_save (sess, text); scrollback_save (sess, text);
fe_print_text (sess, text, timestamp, FALSE); fe_print_text (sess, text, timestamp, FALSE);
g_free (text);
g_free (conv);
} }
void void

View File

@ -57,7 +57,8 @@ void text_emit (int index, session *sess, char *a, char *b, char *c, char *d,
time_t timestamp); time_t timestamp);
int text_emit_by_name (char *name, session *sess, time_t timestamp, int text_emit_by_name (char *name, session *sess, time_t timestamp,
char *a, char *b, char *c, char *d); char *a, char *b, char *c, char *d);
char *text_validate (char **text, gssize *len); gchar *text_invalid_utf8_to_encoding (const gchar* text, gssize len, const gchar *to_encoding, gsize *len_out);
gchar *text_invalid_encoding_to_utf8 (const gchar* text, gssize len, const gchar *from_encoding, gsize *len_out);
int get_stamp_str (char *fmt, time_t tim, char **ret); int get_stamp_str (char *fmt, time_t tim, char **ret);
void format_event (session *sess, int index, char **args, char *o, gsize sizeofo, unsigned int stripcolor_args); void format_event (session *sess, int index, char **args, char *o, gsize sizeofo, unsigned int stripcolor_args);
char *text_find_format_string (char *name); char *text_find_format_string (char *name);