From 9a2ea3938d09643c6528c3b83b1db4c30f47d981 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Sun, 20 Jul 2008 13:10:02 +0200 Subject: [PATCH] Basic IDN/IRI support --- src/host.c | 4 +- src/html-url.c | 12 +++--- src/http.c | 6 +-- src/iri.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++-- src/iri.h | 20 ++++++++- src/main.c | 17 +++++--- src/recur.c | 35 +++++++++++----- src/retr.c | 47 +++++++++++++-------- src/url.c | 11 +++-- src/url.h | 2 +- 10 files changed, 209 insertions(+), 53 deletions(-) diff --git a/src/host.c b/src/host.c index fb8158e5..1226a274 100644 --- a/src/host.c +++ b/src/host.c @@ -716,7 +716,7 @@ lookup_host (const char *host, int flags) { char *str = NULL, *name; - if (opt.enable_iri && (name = idn_decode (host)) != NULL) + if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL) { int len = strlen (host) + strlen (name) + 4; str = xmalloc (len); @@ -725,7 +725,7 @@ lookup_host (const char *host, int flags) xfree (name); } - logprintf (LOG_VERBOSE, _("Resolving %s... "), + logprintf (LOG_VERBOSE, _("Resolving %s... "), quotearg_style (escape_quoting_style, str ? str : host)); if (str) diff --git a/src/html-url.c b/src/html-url.c index 9b515432..0d580f9a 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -274,6 +274,7 @@ append_url (const char *link_uri, struct urlpos *newel; const char *base = ctx->base ? ctx->base : ctx->parent_base; struct url *url; + bool utf8_encode = false; if (!base) { @@ -292,7 +293,7 @@ append_url (const char *link_uri, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, &utf8_encode); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -311,7 +312,7 @@ append_url (const char *link_uri, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, &utf8_encode); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -549,9 +550,9 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!mcharset) return; - logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset)); + /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ - /* sXXXav: Not used yet */ + set_current_charset (mcharset); xfree (mcharset); } else if (name && 0 == strcasecmp (name, "robots")) @@ -660,6 +661,7 @@ get_urls_file (const char *file) struct file_memory *fm; struct urlpos *head, *tail; const char *text, *text_end; + bool utf8_encode = false; /* Load the file. */ fm = read_file (file); @@ -711,7 +713,7 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, &utf8_encode); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/http.c b/src/http.c index a4571ad7..df9ca2bb 100644 --- a/src/http.c +++ b/src/http.c @@ -1825,7 +1825,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@ -1894,7 +1894,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@ -2055,7 +2055,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); /* Try to get remote encoding if needed */ if (opt.enable_iri && !opt.encoding_remote) - /* xxx = */ parse_charset (tmp2); + set_current_charset (parse_charset (tmp2)); } } hs->newloc = resp_header_strdup (resp, "Location"); diff --git a/src/iri.c b/src/iri.c index 000f6550..32eb7210 100644 --- a/src/iri.c +++ b/src/iri.c @@ -41,6 +41,8 @@ as that of the covered work. */ #include "utils.h" #include "iri.h" +char *remote; +char *current; static iconv_t locale2utf8; @@ -80,7 +82,7 @@ parse_charset (char *str) return NULL; } - logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset)); + /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ return charset; } @@ -196,7 +198,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) (*out)++; outlen--; } - else if (errno == E2BIG) /* Output buffer full */ + else if (errno == E2BIG) /* Output buffer full */ { char *new; @@ -222,15 +224,29 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) /* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL on error. */ -char *idn_encode (char *host) +char * +idn_encode (char *host, bool utf8_encoded) { char *new; int ret; + /* Encode to UTF-8 if not done using current remote */ + if (!utf8_encoded) + { + if (!remote_to_utf8 ((const char *) host, (const char **) &new)) + { + /* Nothing to encode or an error occured */ + return NULL; + } + + host = new; + } + /* toASCII UTF-8 NULL terminated string */ ret = idna_to_ascii_8z (host, &new, 0); if (ret != IDNA_SUCCESS) { + /* sXXXav : free new when needed ! */ logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, quote (idna_strerror (ret))); return NULL; @@ -241,7 +257,8 @@ char *idn_encode (char *host) /* Try to decode an ASCII encoded host. Return the new domain in the locale on success or NULL on error. */ -char *idn_decode (char *host) +char * +idn_decode (char *host) { char *new; int ret; @@ -257,4 +274,87 @@ char *idn_decode (char *host) return new; } +/* Return a new string */ +bool +remote_to_utf8 (const char *str, const char **new) +{ + char *remote; + iconv_t cd; + bool ret = false; + + if (opt.encoding_remote) + remote = opt.encoding_remote; + else if (current) + remote = current; + else + return false; + + cd = iconv_open ("UTF-8", remote); + if (cd == (iconv_t)(-1)) + return false; + + if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) + ret = true; + + iconv_close (cd); + + /* Test if something was converted */ + if (!strcmp (str, *new)) + { + xfree ((char *) *new); + return false; + } + + return ret; +} + +char *get_remote_charset (void) +{ + return remote; +} + +char *get_current_charset (void) +{ + return current; +} + +void set_current_charset (char *charset) +{ + /*printf("[ current = `%s'\n", charset);*/ + + if (current) + xfree (current); + + current = charset ? xstrdup (charset) : NULL; +} + +void set_current_as_locale (void) +{ + /*printf("[ current = locale = `%s'\n", opt.locale);*/ + if (current) + xfree (current); + + /* sXXXav : assert opt.locale NULL ? */ + current = xstrdup (opt.locale); +} + +void +set_remote_charset (char *charset) +{ + /*printf("[ remote = `%s'\n", charset);*/ + if (remote) + xfree (remote); + + remote = charset ? xstrdup (charset) : NULL; +} + +void +set_remote_as_current (void) +{ + /*printf("[ remote = current = `%s'\n", current);*/ + if (remote) + xfree (remote); + + remote = current ? xstrdup (current) : NULL; +} diff --git a/src/iri.h b/src/iri.h index 3992d76d..837dbfdd 100644 --- a/src/iri.h +++ b/src/iri.h @@ -36,8 +36,16 @@ char *parse_charset (char *str); char *find_locale (void); bool check_encoding_name (char *encoding); const char *locale_to_utf8 (const char *str); -char *idn_encode (char *host); +char *idn_encode (char *host, bool utf8_encoded); char *idn_decode (char *host); +char *get_remote_charset (void); +char *get_current_charset (void); +void set_current_charset (char *charset); +void set_current_as_locale (void); +void set_current_charset (char *charset); +void set_remote_charset (char *charset); +void set_remote_as_current (void); +bool remote_to_utf8 (const char *str, const char **new); #else /* ENABLE_IRI */ @@ -45,8 +53,16 @@ char *idn_decode (char *host); #define find_locale() NULL #define check_encoding_name(str) false #define locale_to_utf8(str) (str) -#define idn_encode(str) NULL +#define idn_encode(str,encoded) NULL #define idn_decode(str) NULL +#define get_remote_charset() NULL +#define get_current_charset() NULL +#define set_current_charset(str) +#define set_current_as_locale() +#define set_current_charset(str) +#define set_remote_charset(str) +#define set_remote_as_current() +#define remote_to_utf8(a,b) false #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index 53ea6b91..d0ff1d21 100644 --- a/src/main.c +++ b/src/main.c @@ -1067,16 +1067,16 @@ for details.\n\n")); #ifdef ENABLE_IRI if (opt.enable_iri) { - if (opt.locale && !check_encoding_name(opt.locale)) + if (opt.locale && !check_encoding_name (opt.locale)) opt.locale = NULL; if (!opt.locale) opt.locale = find_locale (); - if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote)) + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) opt.encoding_remote = NULL; - logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale)); + /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/ } #else if (opt.enable_iri || opt.locale || opt.encoding_remote) @@ -1190,21 +1190,26 @@ WARNING: Can't reopen standard output in binary mode;\n\ char *filename = NULL, *redirected_URL = NULL; int dt; + set_current_as_locale (); + if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) + if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (*t); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + { + set_remote_as_current (); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + } if (opt.delete_after && file_exists_p(filename)) { diff --git a/src/recur.c b/src/recur.c index d1d0f18d..e5f2b929 100644 --- a/src/recur.c +++ b/src/recur.c @@ -49,6 +49,7 @@ as that of the covered work. */ #include "res.h" #include "convert.h" #include "spider.h" +#include "iri.h" /* Functions for maintaining the URL queue. */ @@ -58,7 +59,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ - + char *remote_encoding; struct queue_element *next; /* next element in queue */ }; @@ -94,12 +95,18 @@ url_enqueue (struct url_queue *queue, const char *url, const char *referer, int depth, bool html_allowed) { struct queue_element *qel = xnew (struct queue_element); + char *charset = get_current_charset (); qel->url = url; qel->referer = referer; qel->depth = depth; qel->html_allowed = html_allowed; qel->next = NULL; + if (charset) + qel->remote_encoding = xstrdup (charset); + else + qel->remote_encoding = NULL; + ++queue->count; if (queue->count > queue->maxcount) queue->maxcount = queue->count; @@ -107,6 +114,8 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/ + if (queue->tail) queue->tail->next = qel; queue->tail = qel; @@ -132,6 +141,10 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; + set_remote_charset (qel->remote_encoding); + if (qel->remote_encoding) + xfree (qel->remote_encoding); + *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -177,6 +190,7 @@ uerr_t retrieve_tree (const char *start_url) { uerr_t status = RETROK; + bool utf8_encode = false; /* The queue of URLs we need to load. */ struct url_queue *queue; @@ -186,7 +200,7 @@ retrieve_tree (const char *start_url) struct hash_table *blacklist; int up_error_code; - struct url *start_url_parsed = url_parse (start_url, &up_error_code); + struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode); if (!start_url_parsed) { @@ -324,7 +338,7 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); + struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode); char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -360,18 +374,18 @@ retrieve_tree (const char *start_url) } } - if (file - && (opt.delete_after + if (file + && (opt.delete_after || opt.spider /* opt.recursive is implicitely true */ || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - (otherwise unneeded because of --spider or rejected by -R) - HTML file just to harvest its hyperlinks -- in either case, + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - (opt.spider ? "--spider" : + (opt.spider ? "--spider" : "recursive rejection criteria"))); logprintf (LOG_VERBOSE, (opt.delete_after || opt.spider @@ -627,11 +641,12 @@ descend_redirect_p (const char *redirected, const char *original, int depth, struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; + bool utf8_encode = false; - orig_parsed = url_parse (original, NULL); + orig_parsed = url_parse (original, NULL, &utf8_encode); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, &utf8_encode); assert (new_parsed != NULL); upos = xnew0 (struct urlpos); diff --git a/src/retr.c b/src/retr.c index 179430ac..05ffe1d0 100644 --- a/src/retr.c +++ b/src/retr.c @@ -51,6 +51,7 @@ as that of the covered work. */ #include "hash.h" #include "convert.h" #include "ptimer.h" +#include "iri.h" /* Total size of downloaded files. Used to enforce quota. */ SUM_SIZE_INT total_downloaded_bytes; @@ -612,6 +613,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, char *saved_post_data = NULL; char *saved_post_file_name = NULL; + bool utf8_encoded = opt.enable_iri; + /* If dt is NULL, use local storage. */ if (!dt) { @@ -624,7 +627,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - u = url_parse (url, &up_error_code); + second_try: + u = url_parse (url, &up_error_code, &utf8_encoded); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); @@ -632,6 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } + /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/ + if (!refurl) refurl = opt.referer; @@ -645,8 +651,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { + /* sXXXav : support IRI for proxy */ + bool proxy_utf8_encode = false; /* Parse the proxy URL. */ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -721,8 +729,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; + utf8_encoded = opt.enable_iri; + /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), @@ -769,16 +779,21 @@ retrieve_url (const char *origurl, char **file, char **newloc, goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (result != RETROK && utf8_encoded) { - if (*dt & RETROKF) - { - register_download (u->url, local_file); - if (redirection_count && 0 != strcmp (origurl, u->url)) - register_redirection (origurl, u->url); - if (*dt & TEXTHTML) - register_html (u->url, local_file); - } + utf8_encoded = false; + /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ + goto second_try; + } + + if (local_file && *dt & RETROKF) + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); } if (file) @@ -843,9 +858,9 @@ retrieve_from_file (const char *file, bool html, int *count) int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - + status = retrieve_tree (cur_url->url->url); opt.follow_ftp = old_follow_ftp; @@ -1021,8 +1036,8 @@ getproxy (struct url *u) bool url_uses_proxy (const char *url) { - bool ret; - struct url *u = url_parse (url, NULL); + bool ret, utf8_encode = false; + struct url *u = url_parse (url, NULL, &utf8_encode); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/url.c b/src/url.c index 48b23d6c..32de9c75 100644 --- a/src/url.c +++ b/src/url.c @@ -641,7 +641,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, bool *utf8_encode) { struct url *u; const char *p; @@ -671,10 +671,13 @@ url_parse (const char *url, int *error) goto error; } - if (opt.enable_iri) + if (opt.enable_iri && *utf8_encode) { + const char *new; url_unescape ((char *) url); - url = locale_to_utf8(url); + *utf8_encode = remote_to_utf8 (url, &new); + if (*utf8_encode) + url = new; } url_encoded = reencode_escapes (url); @@ -853,7 +856,7 @@ url_parse (const char *url, int *error) if (opt.enable_iri) { - char *new = idn_encode (u->host); + char *new = idn_encode (u->host, *utf8_encode); if (new) { xfree (u->host); diff --git a/src/url.h b/src/url.h index 7c8bcfed..a174568e 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, bool *); const char *url_error (int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *);