From 4af624627eafdd5db9e0200bfd05c59aa60292b9 Mon Sep 17 00:00:00 2001 From: RichardHitt Date: Wed, 2 Jan 2013 14:50:26 -0800 Subject: [PATCH] overhauling of URL detection, including channel, nick, etc 'words' --- src/common/tree.c | 2 +- src/common/tree.h | 2 +- src/common/url.c | 465 +++++++++++++++++++++++++----------------- src/common/url.h | 3 +- src/common/userlist.c | 2 +- src/common/userlist.h | 2 +- src/fe-gtk/fe-gtk.c | 2 +- src/fe-gtk/maingui.c | 67 +++--- src/fe-gtk/xtext.c | 133 ++++++++---- src/fe-gtk/xtext.h | 4 +- 10 files changed, 402 insertions(+), 280 deletions(-) diff --git a/src/common/tree.c b/src/common/tree.c index 33fe1d41..715b0e56 100644 --- a/src/common/tree.c +++ b/src/common/tree.c @@ -142,7 +142,7 @@ mybsearch (const void *key, void **array, size_t nmemb, } void * -tree_find (tree *t, void *key, tree_cmp_func *cmp, void *data, int *pos) +tree_find (tree *t, const void *key, tree_cmp_func *cmp, void *data, int *pos) { if (!t || !t->array) return NULL; diff --git a/src/common/tree.h b/src/common/tree.h index 4a158052..ced8e425 100644 --- a/src/common/tree.h +++ b/src/common/tree.h @@ -8,7 +8,7 @@ typedef int (tree_traverse_func) (const void *key, void *data); tree *tree_new (tree_cmp_func *cmp, void *data); void tree_destroy (tree *t); -void *tree_find (tree *t, void *key, tree_cmp_func *cmp, void *data, int *pos); +void *tree_find (tree *t, const void *key, tree_cmp_func *cmp, void *data, int *pos); int tree_remove (tree *t, void *key, int *pos); void *tree_remove_at_pos (tree *t, int pos); void tree_foreach (tree *t, tree_traverse_func *func, void *data); diff --git a/src/common/url.c b/src/common/url.c index c5335859..98293635 100644 --- a/src/common/url.c +++ b/src/common/url.c @@ -13,7 +13,7 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include @@ -32,6 +32,13 @@ void *url_tree = NULL; GTree *url_btree = NULL; +static int do_an_re (const char *word, int *start, int *end, int *type); +static GRegex *re_url (void); +static GRegex *re_host (void); +static GRegex *re_email (void); +static GRegex *re_nick (void); +static GRegex *re_channel (void); +static GRegex *re_path (void); static int @@ -177,158 +184,38 @@ url_add (char *urltext, int len) keep it FAST! This new version was found to be almost 3x faster than 2.4.4 release. */ +static int laststart = 0; +static int lastend = 0; +static int lasttype = 0; + int -url_check_word (const char *word, int len) +url_check_word (const char *word) { -#define D(x) (x), ((sizeof (x)) - 1) - static const struct { - const char *s; - int len; - } - prefix[] = { - { D("irc.") }, - { D("ftp.") }, - { D("www.") }, - { D("irc://") }, - { D("ftp://") }, - { D("http://") }, - { D("https://") }, - { D("file://") }, - { D("rtsp://") }, - { D("ut2004://") }, - }, - suffix[] = { - { D(".org") }, - { D(".net") }, - { D(".com") }, - { D(".edu") }, - { D(".html") }, - { D(".info") }, - { D(".name") }, - /* Some extra common suffixes. - foo.blah/baz.php etc should work now, rather than - needing http:// at the beginning. */ - { D(".php") }, - { D(".htm") }, - { D(".aero") }, - { D(".asia") }, - { D(".biz") }, - { D(".cat") }, - { D(".coop") }, - { D(".int") }, - { D(".jobs") }, - { D(".mobi") }, - { D(".museum") }, - { D(".pro") }, - { D(".tel") }, - { D(".travel") }, - { D(".xxx") }, - { D(".asp") }, - { D(".aspx") }, - { D(".shtml") }, - { D(".xml") }, - }; -#undef D - const char *at, *dot; - int i, dots; - - /* this is pretty much the same as in logmask_is_fullpath() except with length checks and .\ for portable mode */ -#ifdef WIN32 - if ((len > 1 && word[0] == '\\') || - (len > 2 && word[0] == '.' && word[1] == '\\') || - (len > 2 && (((word[0] >= 'A' && word[0] <= 'Z') || (word[0] >= 'a' && word[0] <= 'z')) && word[1] == ':'))) -#else - if (len > 1 && word[0] == '/') -#endif + laststart = lastend = lasttype = 0; + if (do_an_re (word, &laststart, &lastend, &lasttype)) { - return WORD_PATH; - } - - if (len > 1 && word[1] == '#' && strchr("@+^%*#", word[0])) - return WORD_CHANNEL; - - if ((word[0] == '#' || word[0] == '&') && word[1] != '#' && word[1] != 0) - return WORD_CHANNEL; - - for (i = 0; i < G_N_ELEMENTS(prefix); i++) - { - int l; - - l = prefix[i].len; - if (len > l) + switch (lasttype) { - int j; - - /* This is pretty much g_ascii_strncasecmp(). */ - for (j = 0; j < l; j++) - { - unsigned char c = word[j]; - if (tolower(c) != prefix[i].s[j]) - break; - } - if (j == l) - return WORD_URL; + case WORD_NICK: + if (!isalnum (word[laststart])) + laststart++; + if (!userlist_find (current_sess, &word[laststart])) + lasttype = 0; + return lasttype; + case WORD_EMAIL: + if (!isalnum (word[laststart])) + laststart++; + /* Fall through */ + case WORD_URL: + case WORD_HOST: + case WORD_CHANNEL: + return lasttype; + default: + return 0; /* Should not occur */ } } - - at = strchr (word, '@'); /* check for email addy */ - dot = strrchr (word, '.'); - if (at && dot) - { - if (at < dot) - { - if (strchr (word, '*')) - return WORD_HOST; - else - return WORD_EMAIL; - } - } - - /* check if it's an IP number */ - dots = 0; - for (i = 0; i < len; i++) - { - if (word[i] == '.' && i > 0) - dots++; /* allow 127.0.0.1:80 */ - else if (!isdigit ((unsigned char) word[i]) && word[i] != ':') - { - dots = 0; - break; - } - } - if (dots == 3) - return WORD_HOST; - - if (len > 5) - { - for (i = 0; i < G_N_ELEMENTS(suffix); i++) - { - int l; - - l = suffix[i].len; - if (len > l) - { - const unsigned char *p = &word[len - l]; - int j; - - /* This is pretty much g_ascii_strncasecmp(). */ - for (j = 0; j < l; j++) - { - if (tolower(p[j]) != suffix[i].s[j]) - break; - } - if (j == l) - return WORD_HOST; - } - } - - if (word[len - 3] == '.' && - isalpha ((unsigned char) word[len - 2]) && - isalpha ((unsigned char) word[len - 1])) - return WORD_HOST; - } - - return 0; + else + return 0; } /* List of IRC commands for which contents (and thus possible URLs) @@ -346,9 +233,10 @@ static char *commands[] = { void url_check_line (char *buf, int len) { + GRegex *re(void); + GMatchInfo *gmi; char *po = buf; - char *start; - int i, wlen; + int i; /* Skip over message prefix */ if (*po == ':') @@ -379,50 +267,243 @@ url_check_line (char *buf, int len) return; po++; - if (buf[0] == ':' && buf[1] != 0) - po++; - - start = po; - - /* check each "word" (space separated) */ - while (1) + g_regex_match(re_url(), po, 0, &gmi); + while (g_match_info_matches(gmi)) { - switch (po[0]) - { - case 0: - case ' ': - case '\r': + int start, end; - wlen = po - start; - if (wlen > 2) - { - /* HACK! :( */ - /* This is to work around not being able to detect URLs that are at - the start of messages. */ - if (start[0] == ':') - { - start++; - wlen--; - } - if (start[0] == '+' || start[0] == '-') - { - start++; - wlen--; - } - - if (wlen > 2 && url_check_word (start, wlen) == WORD_URL) - { - url_add (start, wlen); - } - } - if (po[0] == 0) - return; - po++; - start = po; - break; - - default: - po++; - } + g_match_info_fetch_pos(gmi, 0, &start, &end); + if (po[end - 1] == '\r') + po[--end] = 0; + if (g_strstr_len (po + start, end - start, "://")) + url_add(po + start, end - start); + g_match_info_next(gmi, NULL); } + g_match_info_free(gmi); +} + +int +url_last (int *lstart, int *lend) +{ + *lstart = laststart; + *lend = lastend; + return lasttype; +} + +static int +do_an_re(const char *word,int *start, int *end, int *type) +{ + typedef struct func_s { + GRegex *(*fn)(void); + int type; + } func_t; + func_t funcs[] = + { + { re_email, WORD_EMAIL }, + { re_url, WORD_URL }, + { re_host, WORD_HOST }, + { re_channel, WORD_CHANNEL }, + { re_path, WORD_PATH }, + { re_nick, WORD_NICK } + }; + + GMatchInfo *gmi; + int k; + + for (k = 0; k < sizeof funcs / sizeof (func_t); k++) + { + g_regex_match (funcs[k].fn(), word, 0, &gmi); + if (!g_match_info_matches (gmi)) + { + g_match_info_free (gmi); + continue; + } + while (g_match_info_matches (gmi)) + { + g_match_info_fetch_pos (gmi, 0, start, end); + g_match_info_next (gmi, NULL); + } + g_match_info_free (gmi); + *type = funcs[k].type; + return TRUE; + } + + return FALSE; +} + +/* Miscellaneous description --- */ +#define DOMAIN "[-a-z0-9]+(\\.[-a-z0-9]+)*\\.[a-z]+" +#define IPADDR "[0-9]+(\\.[0-9]+){3}" +#define HOST "(" DOMAIN "|" IPADDR ")" +#define OPT_PORT "(:[1-9][0-9]{0,4})?" + +GRegex * +make_re(char *grist, char *type) +{ + GRegex *ret; + GError *err = NULL; + + ret = g_regex_new (grist, G_REGEX_CASELESS + G_REGEX_OPTIMIZE, 0, &err); + g_free (grist); + return ret; +} + +/* HOST description --- */ +/* (see miscellaneous above) */ +static GRegex * +re_host (void) +{ + static GRegex *host_ret; + char *grist; + grist = g_strdup_printf ( + "(" /* HOST */ + HOST OPT_PORT + ")" + ); + host_ret = make_re (grist, "re_host"); + return host_ret; +} + +/* URL description --- */ +#define SCHEME "(%s)" +#define LPAR "\\(" +#define RPAR "\\)" +#define NOPARENS "[^() \t]*" + +char *prefix[] = { + "irc\\.", + "ftp\\.", + "www\\.", + "irc://", + "ftp://", + "http://", + "https://", + "file://", + "rtsp://", + NULL +}; + +static GRegex * +re_url (void) +{ + static GRegex *url_ret; + char *grist; + char *scheme; + + if (url_ret) return url_ret; + + scheme = g_strjoinv ("|", prefix); + grist = g_strdup_printf ( + "(" /* URL or HOST */ + SCHEME HOST OPT_PORT + "(" /* Optional "/path?query_string#fragment_id" */ + "/" /* Must start with slash */ + "(" + "(" LPAR NOPARENS RPAR ")" + "|" + "(" NOPARENS ")" + ")*" /* Zero or more occurrences of either of these */ + "(?type == SESS_DIALOG) - return WORD_DIALOG; - } + ret = url_check_word (word); + if (ret == 0 && sess->type == SESS_DIALOG) + return WORD_DIALOG; return ret; } @@ -2266,23 +2260,28 @@ static void mg_word_clicked (GtkWidget *xtext, char *word, GdkEventButton *even) { session *sess = current_sess; + int word_type, start, end; + char *tmp; - if (even->button == 1) /* left button */ + if (word == NULL) { - if (word == NULL) - { + if (even->button == 1) /* left button */ mg_focus (sess); - return; - } + return; + } - if ((even->state & 13) == prefs.hex_gui_url_mod) + word_type = mg_word_check (xtext, word); + url_last (&start, &end); + + if (even->button == 1 && (even->state & 13) == prefs.hex_gui_url_mod) + { + switch (word_type) { - switch (mg_word_check (xtext, word, strlen (word))) - { - case WORD_URL: - case WORD_HOST: - fe_open_url (word); - } + case WORD_URL: + case WORD_HOST: + word[end] = 0; + word += start; + fe_open_url (word); } return; } @@ -2296,7 +2295,7 @@ mg_word_clicked (GtkWidget *xtext, char *word, GdkEventButton *even) return; } - switch (mg_word_check (xtext, word, strlen (word))) + switch (word_type) { case 0: case WORD_PATH: @@ -2304,26 +2303,22 @@ mg_word_clicked (GtkWidget *xtext, char *word, GdkEventButton *even) break; case WORD_URL: case WORD_HOST: + word[end] = 0; + word += start; menu_urlmenu (even, word); break; case WORD_NICK: - menu_nickmenu (sess, even, (word[0]=='@' || word[0]=='+' || word[0]=='%') ? - word+1 : word, FALSE); + menu_nickmenu (sess, even, word + (ispunct (*word)? 1: 0), FALSE); break; case WORD_CHANNEL: - if (*word == '@' || *word == '+' || *word=='^' || *word=='%' || *word=='*') - word++; - menu_chanmenu (sess, even, word); + menu_chanmenu (sess, even, word + (ispunct (*word)? 1: 0)); break; case WORD_EMAIL: - { - char *newword = malloc (strlen (word) + 10); - if (*word == '~') - word++; - sprintf (newword, "mailto:%s", word); - menu_urlmenu (even, newword); - free (newword); - } + word[end] = 0; + word += start; + tmp = g_strdup_printf("mailto:%s", word + (ispunct (*word)? 1: 0)); + menu_urlmenu (even, tmp); + g_free (tmp); break; case WORD_DIALOG: menu_nickmenu (sess, even, sess->channel, FALSE); diff --git a/src/fe-gtk/xtext.c b/src/fe-gtk/xtext.c index e8419b5a..4d2a8a69 100644 --- a/src/fe-gtk/xtext.c +++ b/src/fe-gtk/xtext.c @@ -13,7 +13,7 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * ========================================================================= * * xtext, the text widget used by X-Chat. @@ -73,6 +73,7 @@ #include "../common/fe.h" #include "../common/util.h" #include "../common/hexchatc.h" +#include "../common/url.h" #include "fe-gtk.h" #include "xtext.h" #include "fkeys.h" @@ -1901,7 +1902,7 @@ gtk_xtext_selection_update (GtkXText * xtext, GdkEventMotion * event, int p_y, g static char * gtk_xtext_get_word (GtkXText * xtext, int x, int y, textentry ** ret_ent, - int *ret_off, int *ret_len) + int *ret_off, int *ret_len, GSList **slp) { textentry *ent; int offset; @@ -1950,9 +1951,9 @@ gtk_xtext_get_word (GtkXText * xtext, int x, int y, textentry ** ret_ent, if (ret_off) *ret_off = word - ent->str; if (ret_len) - *ret_len = str - word; + *ret_len = len; /* Length before stripping */ - return gtk_xtext_strip_color (word, len, xtext->scratch_buffer, NULL, NULL, NULL, FALSE); + return gtk_xtext_strip_color (word, len, xtext->scratch_buffer, NULL, NULL, slp, FALSE); } #ifdef MOTION_MONITOR @@ -2028,14 +2029,62 @@ gtk_xtext_check_mark_stamp (GtkXText *xtext, GdkModifierType mask) return redraw; } +static int +gtk_xtext_get_word_adjust (GtkXText *xtext, int x, int y, textentry **word_ent, int *offset, int *len) +{ + GSList *slp = NULL; + unsigned char *word; + int word_type = 0; + + word = gtk_xtext_get_word (xtext, x, y, word_ent, offset, len, &slp); + if (word) + { + int laststart, lastend; + + word_type = xtext->urlcheck_function (GTK_WIDGET (xtext), word); + if (word_type > 0) + { + if (url_last (&laststart, &lastend)) + { + int cumlen, startadj = 0, endadj = 0; + offlen_t o; + GSList *sl; + + for (sl = slp, cumlen = 0; sl; sl = g_slist_next (sl)) + { + o.u = GPOINTER_TO_UINT (sl->data); + startadj = o.o.off - cumlen; + cumlen += o.o.len; + if (laststart < cumlen) + break; + } + for (sl = slp, cumlen = 0; sl; sl = g_slist_next (sl)) + { + o.u = GPOINTER_TO_UINT (sl->data); + endadj = o.o.off - cumlen; + cumlen += o.o.len; + if (lastend < cumlen) + break; + } + laststart += startadj; + *offset += laststart; + *len = lastend + endadj - laststart; + } + } + } + g_slist_free (slp); + + return word_type; +} + static gboolean gtk_xtext_motion_notify (GtkWidget * widget, GdkEventMotion * event) { GtkXText *xtext = GTK_XTEXT (widget); GdkModifierType mask; int redraw, tmp, x, y, offset, len, line_x; - unsigned char *word; textentry *word_ent; + int word_type; gdk_window_get_pointer (widget->window, &x, &y, &mask); @@ -2104,43 +2153,40 @@ gtk_xtext_motion_notify (GtkWidget * widget, GdkEventMotion * event) if (xtext->urlcheck_function == NULL) return FALSE; - word = gtk_xtext_get_word (xtext, x, y, &word_ent, &offset, &len); - if (word) + word_type = gtk_xtext_get_word_adjust (xtext, x, y, &word_ent, &offset, &len); + if (word_type > 0) { - if (xtext->urlcheck_function (GTK_WIDGET (xtext), word, len) > 0) + if (!xtext->cursor_hand || + xtext->hilight_ent != word_ent || + xtext->hilight_start != offset || + xtext->hilight_end != offset + len) { - if (!xtext->cursor_hand || - xtext->hilight_ent != word_ent || - xtext->hilight_start != offset || - xtext->hilight_end != offset + len) + if (!xtext->cursor_hand) { - if (!xtext->cursor_hand) - { - gdk_window_set_cursor (GTK_WIDGET (xtext)->window, - xtext->hand_cursor); - xtext->cursor_hand = TRUE; - } - - /* un-render the old hilight */ - if (xtext->hilight_ent) - gtk_xtext_unrender_hilight (xtext); - - xtext->hilight_ent = word_ent; - xtext->hilight_start = offset; - xtext->hilight_end = offset + len; - - xtext->skip_border_fills = TRUE; - xtext->render_hilights_only = TRUE; - xtext->skip_stamp = TRUE; - - gtk_xtext_render_ents (xtext, word_ent, NULL); - - xtext->skip_border_fills = FALSE; - xtext->render_hilights_only = FALSE; - xtext->skip_stamp = FALSE; + gdk_window_set_cursor (GTK_WIDGET (xtext)->window, + xtext->hand_cursor); + xtext->cursor_hand = TRUE; } - return FALSE; + + /* un-render the old hilight */ + if (xtext->hilight_ent) + gtk_xtext_unrender_hilight (xtext); + + xtext->hilight_ent = word_ent; + xtext->hilight_start = offset; + xtext->hilight_end = offset + len; + + xtext->skip_border_fills = TRUE; + xtext->render_hilights_only = TRUE; + xtext->skip_stamp = TRUE; + + gtk_xtext_render_ents (xtext, word_ent, NULL); + + xtext->skip_border_fills = FALSE; + xtext->render_hilights_only = FALSE; + xtext->skip_stamp = FALSE; } + return FALSE; } gtk_xtext_leave_notify (widget, NULL); @@ -2280,7 +2326,7 @@ gtk_xtext_button_release (GtkWidget * widget, GdkEventButton * event) if (!xtext->hilighting) { - word = gtk_xtext_get_word (xtext, event->x, event->y, 0, 0, 0); + word = gtk_xtext_get_word (xtext, event->x, event->y, 0, 0, 0, 0); g_signal_emit (G_OBJECT (xtext), xtext_signals[WORD_CLICK], 0, word ? word : NULL, event); } else { @@ -2288,7 +2334,6 @@ gtk_xtext_button_release (GtkWidget * widget, GdkEventButton * event) } } - return FALSE; } @@ -2305,7 +2350,7 @@ gtk_xtext_button_press (GtkWidget * widget, GdkEventButton * event) if (event->button == 3 || event->button == 2) /* right/middle click */ { - word = gtk_xtext_get_word (xtext, x, y, 0, 0, 0); + word = gtk_xtext_get_word (xtext, x, y, 0, 0, 0, 0); if (word) { g_signal_emit (G_OBJECT (xtext), xtext_signals[WORD_CLICK], 0, @@ -2322,7 +2367,7 @@ gtk_xtext_button_press (GtkWidget * widget, GdkEventButton * event) if (event->type == GDK_2BUTTON_PRESS) /* WORD select */ { gtk_xtext_check_mark_stamp (xtext, mask); - if (gtk_xtext_get_word (xtext, x, y, &ent, &offset, &len)) + if (gtk_xtext_get_word (xtext, x, y, &ent, &offset, &len, 0)) { if (len == 0) return FALSE; @@ -2343,7 +2388,7 @@ gtk_xtext_button_press (GtkWidget * widget, GdkEventButton * event) if (event->type == GDK_3BUTTON_PRESS) /* LINE select */ { gtk_xtext_check_mark_stamp (xtext, mask); - if (gtk_xtext_get_word (xtext, x, y, &ent, 0, 0)) + if (gtk_xtext_get_word (xtext, x, y, &ent, 0, 0, 0)) { gtk_xtext_selection_clear (xtext->buffer); ent->mark_start = 0; @@ -2852,7 +2897,7 @@ gtk_xtext_render_flush (GtkXText * xtext, int x, int y, unsigned char *str, { int str_width, dofill; GdkDrawable *pix = NULL; - int dest_x, dest_y; + int dest_x = 0, dest_y = 0; if (xtext->dont_render || len < 1 || xtext->hidden) return 0; @@ -5904,7 +5949,7 @@ gtk_xtext_set_tint (GtkXText *xtext, int tint_red, int tint_green, int tint_blue } void -gtk_xtext_set_urlcheck_function (GtkXText *xtext, int (*urlcheck_function) (GtkWidget *, char *, int)) +gtk_xtext_set_urlcheck_function (GtkXText *xtext, int (*urlcheck_function) (GtkWidget *, char *)) { xtext->urlcheck_function = urlcheck_function; } diff --git a/src/fe-gtk/xtext.h b/src/fe-gtk/xtext.h index 48c71d0c..cc6bbebb 100644 --- a/src/fe-gtk/xtext.h +++ b/src/fe-gtk/xtext.h @@ -179,7 +179,7 @@ struct _GtkXText unsigned char scratch_buffer[4096]; void (*error_function) (int type); - int (*urlcheck_function) (GtkWidget * xtext, char *word, int len); + int (*urlcheck_function) (GtkWidget * xtext, char *word); int jump_out_offset; /* point at which to stop rendering */ int jump_in_offset; /* "" start rendering */ @@ -274,7 +274,7 @@ void gtk_xtext_set_show_separator (GtkXText *xtext, gboolean show_separator); void gtk_xtext_set_thin_separator (GtkXText *xtext, gboolean thin_separator); void gtk_xtext_set_time_stamp (xtext_buffer *buf, gboolean timestamp); void gtk_xtext_set_tint (GtkXText *xtext, int tint_red, int tint_green, int tint_blue); -void gtk_xtext_set_urlcheck_function (GtkXText *xtext, int (*urlcheck_function) (GtkWidget *, char *, int)); +void gtk_xtext_set_urlcheck_function (GtkXText *xtext, int (*urlcheck_function) (GtkWidget *, char *)); void gtk_xtext_set_wordwrap (GtkXText *xtext, gboolean word_wrap); xtext_buffer *gtk_xtext_buffer_new (GtkXText *xtext);