diff --git a/ChangeLog b/ChangeLog index 4bd9e3b3..c19c374f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -9,6 +9,14 @@ * AUTHORS: Added Steven Schubiger. +2008-06-26 Xavier Saint + + * configure.ac : IRIs support required libiconv, check it. + +2008-06-14 Xavier Saint + + * configure.ac: Add support for IRIs + 2008-05-29 Micah Cowan * po/*.po: Updated from TP (the 1.11.3 set). diff --git a/configure.ac b/configure.ac index 2ccc703d..fb0c65d1 100644 --- a/configure.ac +++ b/configure.ac @@ -460,6 +460,77 @@ else fi AC_SUBST(COMMENT_IF_NO_POD2MAN) + +dnl +dnl Check for IDN/IRIs +dnl + +AC_ARG_ENABLE(iri, + AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]), + [case "${enable_iri}" in + no) + dnl Disable IRIs checking + AC_MSG_NOTICE([disabling IRIs at user request]) + iri=no + ;; + yes) + dnl IRIs explicitly enabled + iri=yes + force_iri=yes + ;; + auto) + dnl Auto-detect IRI + iri=yes + ;; + *) + AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri']) + ;; + esac + ], [ + dnl If nothing is specified, assume auto-detection + iri=yes + ] +) + +AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]], + [Support IDN/IRIs (needs GNU Libidn)]), + libidn=$withval, libidn="") +if test "X$iri" != "Xno"; then + AM_ICONV + + if test "X$am_cv_func_iconv" != "Xyes"; then + iri=no + if test "X$force_iri" = "Xyes"; then + AC_MSG_ERROR([Libiconv is required for IRIs support]) + else + AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found]) + fi + fi +fi + +if test "X$iri" != "Xno"; then + if test "$libidn" != ""; then + LDFLAGS="${LDFLAGS} -L$libidn/lib" + CPPFLAGS="${CPPFLAGS} -I$libidn/include" + fi + AC_CHECK_HEADER(idna.h, + AC_CHECK_LIB(idn, stringprep_check_version, + [iri=yes LIBS="${LIBS} -lidn"], iri=no), + iri=no) + + if test "X$iri" != "Xno" ; then + AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.]) + AC_MSG_NOTICE([Enabling support for IRI.]) + else + AC_MSG_WARN([Libidn not found]) + fi +fi + + +dnl Needed by src/Makefile.am 
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) + + dnl dnl Create output dnl diff --git a/doc/ChangeLog b/doc/ChangeLog index cea5f7b5..94a06283 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,12 @@ +2008-08-03 Xavier Saint + + * wget.texi : Add option descriptions for the three new + options --iri, --locale and --remote-encoding related to + IRI support. + + * sample.wgetrc : Add commented lines for the three new + command iri, locale and encoding related to IRI support. + 2008-08-03 Micah Cowan * wget.texi: Don't set UPDATED; already set by version.texi. diff --git a/doc/sample.wgetrc b/doc/sample.wgetrc index c69596bf..7ef9ef4a 100644 --- a/doc/sample.wgetrc +++ b/doc/sample.wgetrc @@ -113,3 +113,12 @@ waitretry = 10 # To try ipv6 addresses first: #prefer-family = IPv6 + +# Set default IRI support state +#iri = off + +# Force the default system encoding +#locale = UTF-8 + +# Force the default remote server encoding +#remoteencoding = UTF-8 diff --git a/doc/wget.texi b/doc/wget.texi index 9cb5db99..54e2eb9d 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -674,6 +674,30 @@ Another instance where you'll get a garbled file if you try to use Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http} servers that support the @code{Range} header. +@cindex iri support +@cindex idn support +@item --iri + +Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to +turn it off. IRI support is activated by default. + +You can set the default state of IRI support using the @code{iri} command in +@file{.wgetrc}. That setting may be overridden from the command line. + +@cindex local encoding +@cindex locale +@item --locale=@var{encoding} + +Force Wget to use @var{encoding} as the default system encoding. That affects +how Wget converts URLs specified as arguments from locale to @sc{utf-8} for +IRI support. + +Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET} +environment variable to get the locale. 
If it fails, @sc{ascii} is used. + +You can set the default locale using the @code{locale} command in +@file{.wgetrc}. That setting may be overridden from the command line. + @cindex progress indicator @cindex dot style @item --progress=@var{type} @@ -705,6 +729,21 @@ command line. The exception is that, when the output is not a TTY, the ``dot'' progress will be favored over ``bar''. To force the bar output, use @samp{--progress=bar:force}. +@cindex remote encoding +@item --remote-encoding=@var{encoding} + +Force Wget to use @var{encoding} as the default remote server encoding. That +affects how Wget converts URIs found in files from remote encoding to +@sc{utf-8} during a recursive fetch. This option is only useful for +IRI support, for the interpretation of non-@sc{ascii} characters. + +For HTTP, remote encoding can be found in HTTP @code{Content-Type} +header and in HTML @code{Content-Type http-equiv} meta tag. + +You can set the default encoding using the @code{remoteencoding} +command in @file{.wgetrc}. That setting may be overridden from the +command line. + @item -N @itemx --timestamping Turn on time-stamping. @xref{Time-Stamping}, for details. diff --git a/src/ChangeLog b/src/ChangeLog index 8acab925..5e3a8893 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -32,11 +32,27 @@ * init.c (cleanup): Free the memory associated with the base option (when DEBUG_MALLOC is defined). +2008-07-02 Xavier Saint + + * iri.c, iri.h : New function idn_decode() to decode ASCII + encoded hostname to the locale. + + * host.c : Show hostname to be resolved both in locale and + ASCII encoded. + 2008-06-28 Steven Schubiger * retr.c (retrieve_from_file): Allow for reading the links from an external file (HTTP/FTP). +2008-06-26 Xavier Saint + + * iri.c, iri.h : New functions locale_to_utf8() and + idn_encode() adding basic capabilities of IRI/IDN. 
+ + * url.c : Convert URLs from locale to UTF-8 allowing a basic + support of IRI/IDN + 2008-06-25 Steven Schubiger * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic @@ -61,7 +77,7 @@ * http.c: Make -nv --spider include the file's name when it exists. - + 2008-06-22 Micah Cowan * Makefile.am (version.c): Fixed version string invocation so it @@ -69,12 +85,57 @@ string vars pointers-to-const, and moved line lengths below 80 (in Makefile.am, not in version.c). +2008-06-19 Xavier Saint + + * iri.c, iri.h : New function check_encoding_name() as + a preliminary encoding name check. + + * main.c, iri.c : Make use of check_encoding_name(). + +2008-06-19 Xavier Saint + + * iri.c : Include missing stringprep.h file and add a + cast. + + * init.c : set a default initial value for opt.enable_iri, + opt.locale and opt.encoding_remote. + +2008-06-19 Xavier Saint + + * iri.c, iri.h : Add a new function find_locale() to find + out the local system encoding. + + * main.c : Make use of find_locale(). + +2008-06-19 Xavier Saint + + * html-url.c : Add "content-type" meta tag parsing for + retrieving page encoding. + + * iri.h : Make no-op version of parse_charset() return + NULL. + 2008-06-16 Micah Cowan * http.c (http_loop): When hstat.len is higher than the successfully completed content's length, but it's because we _set_ it that way, don't abort. +2008-06-14 Xavier Saint + + * iri.c, iri.h : New files. + + * Makefile.am : Add files iri.h and conditional iri.c. + + * build_info.c : Add compiled feature "iri". + + * http.c : include iri.h and parse charset from Content-Type + header. + + * init.c, main.c, options.h : if an options isn't supported + at compiled time, don't get rid off it and show a dummy + message instead if they are used. + 2008-06-13 Micah Cowan * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL @@ -118,11 +179,11 @@ default. 2008-05-17 Kenny Parnell - + (cmd_spec_prefer_family): Initialize prefer_family to prefer_none. 
2008-05-17 Micah Cowan - + * main.c (main): Handle Ctrl-D on command-line. 2008-05-15 Steven Schubiger @@ -161,7 +222,7 @@ * options.h: Add an according boolean member to the options struct. - + * sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE out, because they're now defined independently by config.h. diff --git a/src/Makefile.am b/src/Makefile.am index 441cfa1b..ab830ba0 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -30,6 +30,10 @@ # Version: @VERSION@ # +if IRI_IS_ENABLED +IRI_OBJ = iri.c +endif + # The following line is losing on some versions of make! DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@ @@ -40,8 +44,8 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ recur.c res.c retr.c snprintf.c spider.c url.c \ - utils.c \ - css-url.h connect.h convert.h cookies.h \ + utils.c $(IRI_OBJ) \ + css-url.h connect.h convert.h cookies.h \ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ diff --git a/src/build_info.c b/src/build_info.c index ee843ce9..542fed8a 100644 --- a/src/build_info.c +++ b/src/build_info.c @@ -100,6 +100,13 @@ const char* (compiled_features[]) = #else "-gettext", #endif + +#ifdef ENABLE_IRI + "+iri", +#else + "-iri", +#endif + /* sentinel value */ NULL }; diff --git a/src/connect.c b/src/connect.c index 1e8f07e5..41258d26 100644 --- a/src/connect.c +++ b/src/connect.c @@ -266,9 +266,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print) if (print) { const char *txt_addr = print_address (ip); - if (print && 0 != strcmp (print, txt_addr)) - logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... 
"), - escnonprint_uri (print), txt_addr, port); + if (0 != strcmp (print, txt_addr)) + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL) + { + int len = strlen (print) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, print); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), + str ? str : escnonprint_uri (print), txt_addr, port); + + if (str) + xfree (str); + } else logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); } diff --git a/src/convert.c b/src/convert.c index e72a4b0f..54004ad0 100644 --- a/src/convert.c +++ b/src/convert.c @@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set, /* Parse the file... */ urls = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, NULL); + get_urls_html (file, url, NULL, NULL); /* We don't respect meta_disallow_follow here because, even if the file is not followed, we might still want to convert the diff --git a/src/ftp-basic.c b/src/ftp-basic.c index 265a1e25..5f250959 100644 --- a/src/ftp-basic.c +++ b/src/ftp-basic.c @@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line) return FTPRERR; /* Strip trailing CRLF before printing the line, so that - escnonprint doesn't include bogus \012 and \015. */ + quotting doesn't include bogus \012 and \015. */ p = strchr (line, '\0'); if (p > line && p[-1] == '\n') *--p = '\0'; diff --git a/src/host.c b/src/host.c index fdb35b1c..bbf40222 100644 --- a/src/host.c +++ b/src/host.c @@ -712,8 +712,24 @@ lookup_host (const char *host, int flags) /* No luck with the cache; resolve HOST. */ if (!silent && !numeric_address) - logprintf (LOG_VERBOSE, _("Resolving %s... 
"), - quotearg_style (escape_quoting_style, host)); + { + char *str = NULL, *name; + + if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL) + { + int len = strlen (host) + strlen (name) + 4; + str = xmalloc (len); + snprintf (str, len, "%s (%s)", name, host); + str[len-1] = '\0'; + xfree (name); + } + + logprintf (LOG_VERBOSE, _("Resolving %s... "), + quotearg_style (escape_quoting_style, str ? str : host)); + + if (str) + xfree (str); + } #ifdef ENABLE_IPV6 { diff --git a/src/html-url.c b/src/html-url.c index 95df8bf9..c954cb97 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -174,6 +174,10 @@ static const char *additional_attributes[] = { static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - url = url_parse (link_uri, NULL); + url = url_parse (link_uri, NULL, NULL); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL); + url = url_parse (complete_uri, NULL, NULL); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) entry->link_expect_html = 1; } } + else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) + { + /* Handle stuff like: + */ + + char *mcharset; + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + + mcharset = parse_charset (content); + if (!mcharset) + return; + + xfree_null (meta_charset); + meta_charset = mcharset; + } else if (name && 0 == strcasecmp (name, "robots")) { /* Handle stuff 
like: @@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; @@ -726,7 +752,7 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code); + url = url_parse (url_text, &up_error_code, NULL); if (!url) { char *error = url_error (url_text, up_error_code); diff --git a/src/html-url.h b/src/html-url.h index a94f0db6..2e9ec820 100644 --- a/src/html-url.h +++ b/src/html-url.h @@ -44,7 +44,7 @@ struct map_context { }; struct urlpos *get_urls_file (const char *); -struct urlpos *get_urls_html (const char *, const char *, bool *); +struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *append_url (const char *, int, int, struct map_context *); void free_urlpos (struct urlpos *); diff --git a/src/http.c b/src/http.c index 52f65fed..589e18ee 100644 --- a/src/http.c +++ b/src/http.c @@ -1364,7 +1364,8 @@ free_hstat (struct http_stat *hs) If PROXY is non-NULL, the connection will be made to the proxy server, and u->url will be requested. 
*/ static uerr_t -gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) +gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, + struct iri *iri) { struct request *req; @@ -1827,7 +1828,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) hs->local_file = url_file_name (u); } } - + /* TODO: perform this check only once. */ if (!hs->existence_checked && file_exists_p (hs->local_file)) { @@ -1896,7 +1897,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); local_dot_orig_file_exists = true; local_filename = filename_plus_orig_suffix; } - } + } if (!local_dot_orig_file_exists) /* Couldn't stat() .orig, so try to stat() . */ @@ -2048,9 +2049,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); char *tmp = strchr (type, ';'); if (tmp) { + /* sXXXav: only needed if IRI support is enabled */ + char *tmp2 = tmp + 1; + while (tmp > type && c_isspace (tmp[-1])) --tmp; *tmp = '\0'; + + /* Try to get remote encoding if needed */ + if (opt.enable_iri && !opt.encoding_remote) + { + tmp = parse_charset (tmp2); + if (tmp) + set_content_encoding (iri, tmp); + } } } hs->newloc = resp_header_strdup (resp, "Location"); @@ -2325,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); retried, and retried, and retried, and... */ uerr_t http_loop (struct url *u, char **newloc, char **local_file, const char *referer, - int *dt, struct url *proxy) + int *dt, struct url *proxy, struct iri *iri) { int count; bool got_head = false; /* used for time-stamping and filename detection */ @@ -2336,16 +2348,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, uerr_t err, ret = TRYLIMEXC; time_t tmr = -1; /* remote time-stamp */ struct http_stat hstat; /* HTTP status */ - struct_stat st; + struct_stat st; bool send_head_first = true; /* Assert that no value for *LOCAL_FILE was passed. 
*/ assert (local_file == NULL || *local_file == NULL); - + /* Set LOCAL_FILE parameter. */ if (local_file && opt.output_document) *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); - + /* Reset NEWLOC parameter. */ *newloc = NULL; @@ -2382,7 +2394,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), +File %s already there; not retrieving.\n\n"), quote (hstat.local_file)); /* If the file is there, we suppose it's retrieved OK. */ *dt |= RETROKF; @@ -2398,10 +2410,10 @@ File %s already there; not retrieving.\n\n"), /* Reset the counter. */ count = 0; - + /* Reset the document type. */ *dt = 0; - + /* Skip preliminary HEAD request if we're not in spider mode AND * if -O was given or HTTP Content-Disposition support is disabled. */ if (!opt.spider @@ -2410,21 +2422,21 @@ File %s already there; not retrieving.\n\n"), /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ - if (opt.timestamping + if (opt.timestamping && !opt.content_disposition && file_exists_p (url_file_name (u))) send_head_first = true; - + /* THE loop */ do { /* Increment the pass counter. */ ++count; sleep_between_retrievals (count); - + /* Get the current time string. */ tms = datetime_str (time (NULL)); - + if (opt.spider && !got_head) logprintf (LOG_VERBOSE, _("\ Spider mode enabled. Check if remote file exists.\n")); @@ -2433,20 +2445,20 @@ Spider mode enabled. 
Check if remote file exists.\n")); if (opt.verbose) { char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); - - if (count > 1) + + if (count > 1) { char tmp[256]; sprintf (tmp, _("(try:%2d)"), count); logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", tms, tmp, hurl); } - else + else { logprintf (LOG_NOTQUIET, "--%s-- %s\n", tms, hurl); } - + #ifdef WINDOWS ws_changetitle (hurl); #endif @@ -2456,7 +2468,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if (send_head_first && !got_head) + if (send_head_first && !got_head) *dt |= HEAD_ONLY; else *dt &= ~HEAD_ONLY; @@ -2489,11 +2501,11 @@ Spider mode enabled. Check if remote file exists.\n")); *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy); + err = gethttp (u, &hstat, dt, proxy, iri); /* Time? */ tms = datetime_str (time (NULL)); - + /* Get the new location (with or without the redirection). */ if (hstat.newloc) *newloc = xstrdup (hstat.newloc); @@ -2532,7 +2544,7 @@ Spider mode enabled. Check if remote file exists.\n")); hstat.statcode); ret = WRONGCODE; } - else + else { ret = NEWLOCATION; } @@ -2548,7 +2560,7 @@ Spider mode enabled. Check if remote file exists.\n")); /* All possibilities should have been exhausted. */ abort (); } - + if (!(*dt & RETROKF)) { char *hurl = NULL; @@ -2567,11 +2579,13 @@ Spider mode enabled. Check if remote file exists.\n")); continue; } /* Maybe we should always keep track of broken links, not just in - * spider mode. */ - else if (opt.spider) + * spider mode. + * Don't log error if it was UTF-8 encoded because we will try + * once unencoded. */ + else if (opt.spider && !iri->utf8_encode) { /* #### Again: ugly ugly ugly! 
*/ - if (!hurl) + if (!hurl) hurl = url_string (u, URL_AUTH_HIDE_PASSWD); nonexisting_url (hurl); logprintf (LOG_NOTQUIET, _("\ @@ -2580,7 +2594,7 @@ Remote file does not exist -- broken link!!!\n")); else { logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, + tms, hstat.statcode, quotearg_style (escape_quoting_style, hstat.error)); } logputs (LOG_VERBOSE, "\n"); diff --git a/src/http.h b/src/http.h index e0e66cea..4769e9d3 100644 --- a/src/http.h +++ b/src/http.h @@ -33,7 +33,7 @@ as that of the covered work. */ struct url; uerr_t http_loop (struct url *, char **, char **, const char *, int *, - struct url *); + struct url *, struct iri *); void save_cookies (void); void http_cleanup (void); time_t http_atotm (const char *); diff --git a/src/init.c b/src/init.c index a774061b..fd71a362 100644 --- a/src/init.c +++ b/src/init.c @@ -182,9 +182,11 @@ static const struct { { "inet6only", &opt.ipv6_only, cmd_boolean }, #endif { "input", &opt.input_filename, cmd_file }, + { "iri", &opt.enable_iri, cmd_boolean }, { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "limitrate", &opt.limit_rate, cmd_bytes }, { "loadcookies", &opt.cookies_input, cmd_file }, + { "locale", &opt.locale, cmd_string }, { "logfile", &opt.lfilename, cmd_file }, { "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "maxredirect", &opt.max_redirect, cmd_number }, @@ -224,6 +226,7 @@ static const struct { { "referer", &opt.referer, cmd_string }, { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, + { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, @@ -331,6 +334,14 @@ defaults (void) opt.restrict_files_case = restrict_no_case_restriction; opt.max_redirect = 20; + +#ifdef ENABLE_IRI + opt.enable_iri = true; +#else + opt.enable_iri = false; +#endif + 
opt.locale = NULL; + opt.encoding_remote = NULL; } /* Return the user's home directory (strdup-ed), or NULL if none is diff --git a/src/iri.c b/src/iri.c new file mode 100644 index 00000000..e3909d50 --- /dev/null +++ b/src/iri.c @@ -0,0 +1,348 @@ +/* IRI related functions. + Copyright (C) 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at +your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. 
*/ + +#include "wget.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +/* RFC3987 section 3.1 mandates STD3 ASCII RULES */ +#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES + +/* Note: locale encoding is kept in options struct (opt.locale) */ + +static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); + + +/* Given a string containing "charset=XXX", return the encoding if found, + or NULL otherwise */ +char * +parse_charset (char *str) +{ + char *charset; + + if (!str || !*str) + return NULL; + + str = strcasestr (str, "charset="); + if (!str) + return NULL; + + str += 8; + charset = str; + + /* sXXXav: which chars should be banned ??? */ + while (*charset && !c_isspace (*charset)) + charset++; + + /* sXXXav: could strdupdelim return NULL ? */ + charset = strdupdelim (str, charset); + + /* Do a minimum check on the charset value */ + if (!check_encoding_name (charset)) + { + xfree (charset); + return NULL; + } + + /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ + + return charset; +} + +/* Find the locale used, or fall back on a default value */ +char * +find_locale (void) +{ + return (char *) stringprep_locale_charset (); +} + +/* Basic check of an encoding name. */ +bool +check_encoding_name (char *encoding) +{ + char *s = encoding; + + while (*s) + { + if (!c_isascii (*s) || c_isspace (*s)) + { + logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding)); + return false; + } + + s++; + } + + return true; +} + +/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */ +static bool +open_locale_to_utf8 (void) +{ + +} + +/* Try converting string str from locale to UTF-8. Return a new string + on success, or str on error or if conversion isn't needed. 
*/ +const char * +locale_to_utf8 (const char *str) +{ + iconv_t l2u; + char *new; + + /* That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); + opt.locale = find_locale (); + } + + if (!opt.locale || !strcasecmp (opt.locale, "utf-8")) + return str; + + l2u = iconv_open ("UTF-8", opt.locale); + if (l2u != (iconv_t)(-1)) + { + logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", + quote (opt.locale), quote ("UTF-8")); + return str; + } + + if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new)) + return (const char *) new; + + return str; +} + +/* Do the conversion according to the passed conversion descriptor cd. *out + will contain the transcoded string on success. *out content is + unspecified otherwise. */ +static bool +do_conversion (iconv_t cd, char *in, size_t inlen, char **out) +{ + /* sXXXav : hummm hard to guess... */ + size_t len, done, outlen = inlen * 2; + int invalid = 0, tooshort = 0; + char *s; + + s = xmalloc (outlen + 1); + *out = s; + len = outlen; + done = 0; + + for (;;) + { + if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) + { + *out = s; + *(s + len - outlen - done) = '\0'; + return true; + } + + /* Incomplete or invalid multibyte sequence */ + if (errno == EINVAL || errno == EILSEQ) + { + if (!invalid) + logprintf (LOG_VERBOSE, + "Incomplete or invalide multibyte sequence encountered\n"); + + invalid++; + **out = *in; + in++; + inlen--; + (*out)++; + outlen--; + } + else if (errno == E2BIG) /* Output buffer full */ + { + char *new; + + tooshort++; + done = len; + outlen = done + inlen * 2; + new = xmalloc (outlen + 1); + memcpy (new, s, done); + xfree (s); + s = new; + len = outlen; + *out = s + done; + } + else /* Weird, we got an unspecified error */ + { + logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno); + break; + } + } + + return false; +} + +/* Try to "ASCII encode" UTF-8 host. 
Return the new domain on success or NULL
+   on error.  The returned string is the one produced by
+   idna_to_ascii_8z().  */
+char *
+idn_encode (struct iri *i, char *host)
+{
+  char *utf8 = NULL;    /* intermediate UTF-8 copy of host, if one was made */
+  char *ascii;
+  int ret;
+
+  /* Encode to UTF-8 if not done */
+  if (!i->utf8_encode)
+    {
+      if (!remote_to_utf8 (i, (const char *) host, (const char **) &utf8))
+        return NULL;  /* Nothing to encode or an error occurred */
+      host = utf8;
+    }
+
+  /* toASCII UTF-8 NULL terminated string */
+  ret = idna_to_ascii_8z (host, &ascii, IDNA_FLAGS);
+
+  /* The intermediate UTF-8 copy isn't needed past this point */
+  xfree_null (utf8);
+
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return ascii;
+}
+
+/* Try to decode an "ASCII encoded" host. Return the new domain in the locale
+   on success or NULL on error. */
+char *
+idn_decode (char *host)
+{
+  char *new;
+  int ret;
+
+  ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return new;
+}
+
+/* Try to transcode string str from remote encoding to UTF-8. On success, *new
+   contains the transcoded string. *new content is unspecified otherwise.
+   Return false when no remote encoding is known, when the conversion isn't
+   available or fails, or when the converted text equals str.  */
+bool
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
+{
+  iconv_t cd;
+  bool ret;
+
+  if (!i->uri_encoding)
+    return false;
+
+  cd = iconv_open ("UTF-8", i->uri_encoding);
+  if (cd == (iconv_t)(-1))
+    return false;
+
+  ret = do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new);
+
+  iconv_close (cd);
+
+  /* *new is unspecified when the conversion failed: don't inspect or
+     free it.  */
+  if (!ret)
+    return false;
+
+  /* Test if something was converted */
+  if (!strcmp (str, *new))
+    {
+      xfree ((char *) *new);
+      *new = NULL;      /* don't leave a dangling pointer behind */
+      return false;
+    }
+
+  return true;
+}
+
+/* Allocate a new iri structure and return a pointer to it. */
+struct iri *
+iri_new (void)
+{
+  struct iri *i = xmalloc (sizeof (struct iri));
+  i->uri_encoding = opt.encoding_remote ? 
xstrdup (opt.encoding_remote) : NULL; + i->content_encoding = NULL; + i->utf8_encode = opt.enable_iri; + return i; +} + +/* Completely free an iri structure. */ +void +iri_free (struct iri *i) +{ + xfree_null (i->uri_encoding); + xfree_null (i->content_encoding); + xfree (i); +} + +/* Set uri_encoding of struct iri i. If a remote encoding was specified, use + it unless force is true. */ +void +set_uri_encoding (struct iri *i, char *charset, bool force) +{ + DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None")); + if (!force && opt.encoding_remote) + return; + if (i->uri_encoding) + { + if (charset && !strcasecmp (i->uri_encoding, charset)) + return; + xfree (i->uri_encoding); + } + + i->uri_encoding = charset ? xstrdup (charset) : NULL; +} + +/* Set content_encoding of struct iri i. */ +void +set_content_encoding (struct iri *i, char *charset) +{ + DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None")); + if (opt.encoding_remote) + return; + if (i->content_encoding) + { + if (charset && !strcasecmp (i->content_encoding, charset)) + return; + xfree (i->content_encoding); + } + + i->content_encoding = charset ? xstrdup (charset) : NULL; +} + diff --git a/src/iri.h b/src/iri.h new file mode 100644 index 00000000..c024de72 --- /dev/null +++ b/src/iri.h @@ -0,0 +1,70 @@ +/* Internationalization related declarations. + Copyright (C) 2008 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with Wget. If not, see . + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#ifndef IRI_H +#define IRI_H + +struct iri { + char *uri_encoding; /* Encoding of the uri to fetch */ + char *content_encoding; /* Encoding of links inside the fetched file */ + bool utf8_encode; /* Will/Is the current url encoded in utf8 */ +}; + +#ifdef ENABLE_IRI + +char *parse_charset (char *str); +char *find_locale (void); +bool check_encoding_name (char *encoding); +const char *locale_to_utf8 (const char *str); +char *idn_encode (struct iri *i, char *host); +char *idn_decode (char *host); +bool remote_to_utf8 (struct iri *i, const char *str, const char **new); +struct iri *iri_new (void); +void iri_free (struct iri *i); +void set_uri_encoding (struct iri *i, char *charset, bool force); +void set_content_encoding (struct iri *i, char *charset); + +#else /* ENABLE_IRI */ + +struct iri dummy_iri; + +#define parse_charset(str) NULL +#define find_locale() NULL +#define check_encoding_name(str) false +#define locale_to_utf8(str) (str) +#define idn_encode(a,b) NULL +#define idn_decode(str) NULL +#define remote_to_utf8(a,b,c) false +#define iri_new() (&dummy_iri) +#define iri_free(a) +#define set_uri_encoding(a,b,c) +#define set_content_encoding(a,b) + +#endif /* ENABLE_IRI */ +#endif /* IRI_H */ diff --git a/src/log.c b/src/log.c index e84e5c61..b62bf9dd 100644 --- a/src/log.c +++ b/src/log.c @@ -43,7 +43,7 @@ as 
that of the covered work. */ #include "utils.h" #include "log.h" -/* This file impplement support for "logging". Logging means printing +/* This file implement support for "logging". Logging means printing output, plus several additional features: - Cataloguing output by importance. You can specify that a log diff --git a/src/main.c b/src/main.c index 3896afd0..414b62bc 100644 --- a/src/main.c +++ b/src/main.c @@ -201,10 +201,12 @@ static struct cmdline_option option_data[] = { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, #endif { "input-file", 'i', OPT_VALUE, "input", -1 }, + { "iri", 0, OPT_BOOLEAN, "iri", -1 }, { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, + { "locale", 0, OPT_VALUE, "locale", -1 }, { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "no", 'n', OPT__NO, NULL, required_argument }, @@ -238,6 +240,7 @@ static struct cmdline_option option_data[] = { "referer", 0, OPT_VALUE, "referer", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, + { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1}, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, @@ -1062,6 +1065,27 @@ for details.\n\n")); exit (1); } +#ifdef ENABLE_IRI + if (opt.enable_iri) + { + if (opt.locale && !check_encoding_name (opt.locale)) + opt.locale = NULL; + + if (!opt.locale) + opt.locale = find_locale (); + + if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) + opt.encoding_remote = NULL; + } +#else + if (opt.enable_iri || opt.locale || opt.encoding_remote) + { + /* sXXXav : be more specific... 
*/ + printf(_("This version does not have support for IRIs\n")); + exit(1); + } +#endif + if (opt.ask_passwd) { opt.passwd = prompt_for_password (); @@ -1171,15 +1195,21 @@ WARNING: Can't reopen standard output in binary mode;\n\ int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (url_scheme (*t) == SCHEME_FTP) + if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - - status = retrieve_tree (*t); + + status = retrieve_tree (*t, NULL); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + { + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale, true); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, + opt.recursive, i); + iri_free (i); + } if (opt.delete_after && file_exists_p(filename)) { diff --git a/src/options.h b/src/options.h index ba39ec4e..4574ab85 100644 --- a/src/options.h +++ b/src/options.h @@ -239,6 +239,10 @@ struct options bool content_disposition; /* Honor HTTP Content-Disposition header. */ bool auth_without_challenge; /* Issue Basic authentication creds without waiting for a challenge. */ + + bool enable_iri; + char *encoding_remote; + char *locale; }; extern struct options opt; diff --git a/src/recur.c b/src/recur.c index 741ca823..78682458 100644 --- a/src/recur.c +++ b/src/recur.c @@ -51,7 +51,7 @@ as that of the covered work. */ #include "html-url.h" #include "css-url.h" #include "spider.h" - + /* Functions for maintaining the URL queue. */ struct queue_element { @@ -60,6 +60,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ + struct iri *iri; /* sXXXav */ bool css_allowed; /* whether the document is allowed to be treated as CSS. */ struct queue_element *next; /* next element in queue */ @@ -93,11 +94,12 @@ url_queue_delete (struct url_queue *queue) into it. 
*/ static void -url_enqueue (struct url_queue *queue, +url_enqueue (struct url_queue *queue, struct iri *i, const char *url, const char *referer, int depth, bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); + qel->iri = i; qel->url = url; qel->referer = referer; qel->depth = depth; @@ -112,6 +114,10 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); + if (i) + DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url), + i->uri_encoding ? quote_n (1, i->uri_encoding) : "None")); + if (queue->tail) queue->tail->next = qel; queue->tail = qel; @@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue, succeeded, or false if the queue is empty. */ static bool -url_dequeue (struct url_queue *queue, +url_dequeue (struct url_queue *queue, struct iri **i, const char **url, const char **referer, int *depth, bool *html_allowed, bool *css_allowed) { @@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; + *i = qel->iri; *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue, } static bool download_child_p (const struct urlpos *, struct url *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); static bool descend_redirect_p (const char *, const char *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); /* Retrieve a part of the web beginning with START_URL. This used to @@ -180,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int, options, add it to the queue. 
*/ uerr_t -retrieve_tree (const char *start_url) +retrieve_tree (const char *start_url, struct iri *pi) { uerr_t status = RETROK; @@ -192,8 +199,22 @@ retrieve_tree (const char *start_url) struct hash_table *blacklist; int up_error_code; - struct url *start_url_parsed = url_parse (start_url, &up_error_code); + struct url *start_url_parsed; + struct iri *i = iri_new (); +#define COPYSTR(x) (x) ? xstrdup(x) : NULL; + /* Duplicate pi struct if not NULL */ + if (pi) + { + i->uri_encoding = COPYSTR (pi->uri_encoding); + i->content_encoding = COPYSTR (pi->content_encoding); + i->utf8_encode = pi->utf8_encode; + } + else + set_uri_encoding (i, opt.locale, true); +#undef COPYSTR + + start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) { char *error = url_error (start_url, up_error_code); @@ -207,7 +228,8 @@ retrieve_tree (const char *start_url) /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); + url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, + false); string_set_add (blacklist, start_url_parsed->url); while (1) @@ -226,7 +248,7 @@ retrieve_tree (const char *start_url) /* Get the next URL from the queue... 
*/ - if (!url_dequeue (queue, + if (!url_dequeue (queue, (struct iri **) &i, (const char **)&url, (const char **)&referer, &depth, &html_allowed, &css_allowed)) break; @@ -267,7 +289,8 @@ retrieve_tree (const char *start_url) int dt = 0; char *redirected = NULL; - status = retrieve_url (url, &file, &redirected, referer, &dt, false); + status = retrieve_url (url, &file, &redirected, referer, &dt, + false, i); if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) @@ -295,7 +318,7 @@ retrieve_tree (const char *start_url) if (descend) { if (!descend_redirect_p (redirected, url, depth, - start_url_parsed, blacklist)) + start_url_parsed, blacklist, i)) descend = false; else /* Make sure that the old pre-redirect form gets @@ -347,7 +370,7 @@ retrieve_tree (const char *start_url) bool meta_disallow_follow = false; struct urlpos *children = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, &meta_disallow_follow); + get_urls_html (file, url, &meta_disallow_follow, i); if (opt.use_robots && meta_disallow_follow) { @@ -358,7 +381,8 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); + struct url *url_parsed = url_parse (url, NULL, i); + struct iri *ci; char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -375,9 +399,11 @@ retrieve_tree (const char *start_url) if (dash_p_leaf_HTML && !child->link_inline_p) continue; if (download_child_p (child, url_parsed, depth, start_url_parsed, - blacklist)) + blacklist, i)) { - url_enqueue (queue, xstrdup (child->url->url), + ci = iri_new (); + set_uri_encoding (ci, i->content_encoding, false); + url_enqueue (queue, ci, xstrdup (child->url->url), xstrdup (referer_url), depth + 1, child->link_expect_html, child->link_expect_css); @@ -395,18 +421,18 @@ retrieve_tree (const char *start_url) } } - if (file - && (opt.delete_after + if (file + && 
(opt.delete_after || opt.spider /* opt.recursive is implicitely true */ || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - (otherwise unneeded because of --spider or rejected by -R) - HTML file just to harvest its hyperlinks -- in either case, + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - (opt.spider ? "--spider" : + (opt.spider ? "--spider" : "recursive rejection criteria"))); logprintf (LOG_VERBOSE, (opt.delete_after || opt.spider @@ -422,6 +448,7 @@ retrieve_tree (const char *start_url) xfree (url); xfree_null (referer); xfree_null (file); + iri_free (i); } /* If anything is left of the queue due to a premature exit, free it @@ -430,9 +457,11 @@ retrieve_tree (const char *start_url) char *d1, *d2; int d3; bool d4, d5; - while (url_dequeue (queue, + struct iri *d6; + while (url_dequeue (queue, (struct iri **)&d6, (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) { + iri_free (d6); xfree (d1); xfree_null (d2); } @@ -461,7 +490,8 @@ retrieve_tree (const char *start_url) static bool download_child_p (const struct urlpos *upos, struct url *parent, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *u = upos->url; const char *url = u->url; @@ -471,7 +501,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (string_set_contains (blacklist, url)) { - if (opt.spider) + if (opt.spider) { char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url))); @@ -602,7 +632,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (!specs) { char *rfile; - if (res_retrieve_file (url, &rfile)) + if 
(res_retrieve_file (url, &rfile, iri)) { specs = res_parse_from_file (rfile); @@ -657,23 +687,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, static bool descend_redirect_p (const char *redirected, const char *original, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; - orig_parsed = url_parse (original, NULL); + orig_parsed = url_parse (original, NULL, NULL); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, NULL); assert (new_parsed != NULL); upos = xnew0 (struct urlpos); upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, - start_url_parsed, blacklist); + start_url_parsed, blacklist, iri); url_free (orig_parsed); url_free (new_parsed); diff --git a/src/recur.h b/src/recur.h index 5ab26a95..515a382b 100644 --- a/src/recur.h +++ b/src/recur.h @@ -42,6 +42,6 @@ as that of the covered work. */ struct urlpos; void recursive_cleanup (void); -uerr_t retrieve_tree (const char *); +uerr_t retrieve_tree (const char *, struct iri *); #endif /* RECUR_H */ diff --git a/src/res.c b/src/res.c index 8c35f0e1..0320d034 100644 --- a/src/res.c +++ b/src/res.c @@ -532,21 +532,28 @@ res_get_specs (const char *host, int port) Return true if robots were retrieved OK, false otherwise. 
*/ bool -res_retrieve_file (const char *url, char **file) +res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; int saved_sp_val = opt.spider; + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding, false); + i->utf8_encode = false; + logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; - err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); + err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i); opt.timestamping = saved_ts_val; - opt.spider = saved_sp_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) { diff --git a/src/res.h b/src/res.h index 94a57750..5439eaf9 100644 --- a/src/res.h +++ b/src/res.h @@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *); void res_register_specs (const char *, int, struct robot_specs *); struct robot_specs *res_get_specs (const char *, int); -bool res_retrieve_file (const char *, char **); +bool res_retrieve_file (const char *, char **, struct iri *); bool is_robots_txt_url (const char *); diff --git a/src/retr.c b/src/retr.c index 85774297..28a6d874 100644 --- a/src/retr.c +++ b/src/retr.c @@ -597,7 +597,7 @@ static char *getproxy (struct url *); uerr_t retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive) + const char *refurl, int *dt, bool recursive, struct iri *iri) { uerr_t result; char *url; @@ -625,7 +625,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - u = url_parse (url, &up_error_code); + second_try: + u = url_parse (url, &up_error_code, iri); if (!u) { char 
*error = url_error (url, up_error_code); @@ -635,6 +636,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } + DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url), + iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None", + iri->utf8_encode)); + if (!refurl) refurl = opt.referer; @@ -648,8 +653,12 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale, true); + pi->utf8_encode = false; + /* Parse the proxy URL. */ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, NULL); if (!proxy_url) { char *error = url_error (proxy, up_error_code); @@ -676,7 +685,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@ -726,8 +735,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); + /* Now, see if this new location makes sense. 
*/ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri); if (!newloc_parsed) { char *error = url_error (mynewloc, up_error_code); @@ -776,8 +790,21 @@ retrieve_url (const char *origurl, char **file, char **newloc, goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (!(*dt & RETROKF) && iri->utf8_encode) { + iri->utf8_encode = false; + DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url))); + goto second_try; + } + + if (local_file && *dt & RETROKF) + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); if (*dt & RETROKF) { register_download (u->url, local_file); @@ -827,13 +854,18 @@ retrieve_from_file (const char *file, bool html, int *count) { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); char *input_file = NULL; const char *url = file; status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ - + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); + set_content_encoding (iri, opt.locale); + if (url_has_scheme (url)) { int dt; @@ -842,17 +874,21 @@ retrieve_from_file (const char *file, bool html, int *count) if (!opt.base_href) opt.base_href = xstrdup (url); - status = retrieve_url (url, &input_file, NULL, NULL, &dt, false); + status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri); if (status != RETROK) return status; if (dt & TEXTHTML) html = true; + + /* If we have a found a content encoding, use it */ + if (iri->content_encoding) + set_uri_encoding (iri, iri->content_encoding, false); } else input_file = (char *) file; - url_list = (html ? get_urls_html (input_file, NULL, NULL) + url_list = (html ? 
get_urls_html (input_file, NULL, NULL, iri) : get_urls_file (input_file)); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) @@ -868,21 +904,26 @@ retrieve_from_file (const char *file, bool html, int *count) status = QUOTEXC; break; } + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - - status = retrieve_tree (cur_url->url->url); + + status = retrieve_tree (cur_url->url->url, iri); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); + status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, + &dt, opt.recursive, iri); if (filename && opt.delete_after && file_exists_p (filename)) { @@ -901,6 +942,8 @@ Removing file due to --delete-after in retrieve_from_file():\n")); /* Free the linked list of URL-s. 
*/ free_urlpos (url_list); + iri_free (iri); + return status; } @@ -1053,7 +1096,11 @@ bool url_uses_proxy (const char *url) { bool ret; - struct url *u = url_parse (url, NULL); + struct url *u; + struct iri *i = iri_new(); + /* url was given in the command line, so use locale as encoding */ + set_uri_encoding (i, opt.locale, true); + u= url_parse (url, NULL, i); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/retr.h b/src/retr.h index ec55cfda..bb2e66d3 100644 --- a/src/retr.h +++ b/src/retr.h @@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int); char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_line (int); -uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool); +uerr_t retrieve_url (const char *, char **, char **, const char *, int *, + bool, struct iri *); uerr_t retrieve_from_file (const char *, bool, int *); const char *retr_rate (wgint, double); diff --git a/src/url.c b/src/url.c index 3f4b8992..c937d056 100644 --- a/src/url.c +++ b/src/url.c @@ -640,7 +640,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, struct iri *iri) { struct url *u; const char *p; @@ -659,7 +659,7 @@ url_parse (const char *url, int *error) int port; char *user = NULL, *passwd = NULL; - char *url_encoded = NULL; + char *url_encoded = NULL, *new_url = NULL; int error_code; @@ -670,9 +670,20 @@ url_parse (const char *url, int *error) goto error; } - url_encoded = reencode_escapes (url); + if (iri && iri->utf8_encode) + { + url_unescape ((char *) url); + iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url); + if (!iri->utf8_encode) + new_url = NULL; + } + + url_encoded = reencode_escapes (new_url ? 
new_url : url); p = url_encoded; + if (new_url && url_encoded != new_url) + xfree (new_url); + p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); @@ -842,6 +853,18 @@ url_parse (const char *url, int *error) { url_unescape (u->host); host_modified = true; + + /* Apply IDNA regardless of iri->utf8_encode status */ + if (opt.enable_iri && iri) + { + char *new = idn_encode (iri, u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } + } } if (params_b) @@ -851,7 +874,7 @@ url_parse (const char *url, int *error) if (fragment_b) u->fragment = strdupdelim (fragment_b, fragment_e); - if (path_modified || u->fragment || host_modified || path_b == path_e) + if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) { /* If we suspect that a transformation has rendered what url_string might return different from URL_ENCODED, rebuild diff --git a/src/url.h b/src/url.h index ce308f6f..0748e214 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, struct iri *iri); char *url_error (const char *, int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); diff --git a/src/wget.h b/src/wget.h index d87dfcac..b17b6709 100644 --- a/src/wget.h +++ b/src/wget.h @@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT; #include "quote.h" #include "quotearg.h" +/* Likewise for struct iri definition */ +#include "iri.h" + /* Useful macros used across the code: */ /* The number of elements in an array. For example: diff --git a/tests/ChangeLog b/tests/ChangeLog index 36bc35dc..f2179763 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,30 @@ +2008-08-14 Xavier Saint + + * Test-iri-list.px : Fetch files from a remote list. 
+ +2008-08-03 Xavier Saint + + * Test-iri.px : HTTP recursive fetch for testing IRI support and + fallback. + + * Test-iri-disabled.px : Same file structure as Test-iri.px but with + IRI support disabled + + * Test-iri-forced-remote.px : There's a difference between ISO-8859-1 + and ISO-8859-15 for character 0xA4 (respectively currency sign and + euro sign). So with a forced ISO-8859-1 remote encoding, wget should + see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead + of using the ISO-8859-15 given by the server. + + * Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale + and expect wget to fetch the file UTF-8 encoded. + + * Test-ftp-iri-fallback.px : Same as above but wget should fallback on + locale encoding to fetch the file. + + * Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support + disabled. The UTF-8 encoded file should not be retrieved. + 2008-06-22 Micah Cowan * Test-proxied-https-auth.px: Shift exit code so it falls in the diff --git a/tests/Test-ftp-iri-disabled.px b/tests/Test-ftp-iri-disabled.px new file mode 100755 index 00000000..14d849da --- /dev/null +++ b/tests/Test-ftp-iri-disabled.px @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w + +use strict; + +use FTPTest; + + +############################################################################### + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; + +my $francais = < { + content => $francais, + }, + "/fran${ccedilla_l1}ais.txt" => { + content => $francais, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "fran${ccedilla_l1}ais.txt" => { + content => $francais, + }, +); + +############################################################################### + +my $the_test = FTPTest->new (name => "Test-ftp-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-ftp-iri-fallback.px b/tests/Test-ftp-iri-fallback.px new file mode 100755 index 00000000..8902e0f9 --- /dev/null +++ b/tests/Test-ftp-iri-fallback.px @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w + +use strict; + +use FTPTest; + + +############################################################################### + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; + +my $francais = < { + content => $francais, + }, +); + +my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "fran${ccedilla_l1}ais.txt" => { + content => $francais, + }, +); + +############################################################################### + +my $the_test = FTPTest->new (name => "Test-ftp-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px new file mode 100755 index 00000000..d453669c --- /dev/null +++ b/tests/Test-ftp-iri.px @@ -0,0 +1,47 @@ +#!/usr/bin/perl -w + +use strict; + +use FTPTest; + + +############################################################################### + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; + +my $francais = < { + content => $francais, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + "fran${ccedilla_u8}ais.txt" => { + content => $francais, + }, +); + +############################################################################### + +my $the_test = FTPTest->new (name => "Test-ftp-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-iri-disabled.px b/tests/Test-iri-disabled.px new file mode 100755 index 00000000..17e43361 --- /dev/null +++ b/tests/Test-iri-disabled.px @@ -0,0 +1,196 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 + +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html mime + file = iso-8859-1 +# p3_€€€.html meta + file = utf-8, mime = iso-8859-1 +# + +my $ccedilla_l15 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; +my $eurosign_l15 = "\xA4"; +my $eurosign_u8 = "\xE2\x82\xAC"; + +my $pageindex = < + + Main Page + + +

+ Link to page 1 La seule page en français. + Link to page 3 My tailor is rich. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ Link to page 2 Die enkele nerderlangstalige pagina. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + +

+ Één is niet veel maar toch meer dan nul.
+ Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :) +

+ + +EOF + +my $pageeuro = < + + Euro page + + +

+ My tailor isn't rich anymore. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "200", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, + '/p3_%A4%A4%A4.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri=no -nH -r http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $pageindex, + }, + 'robots.txt' => { + content => "", + }, + "p1_fran${ccedilla_l15}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_l1}${eacute_l1}n.html" => { + content => $pageeen, + }, + "p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => { + content => $pageeuro, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-disabled", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-iri-forced-remote.px b/tests/Test-iri-forced-remote.px new file mode 100755 index 00000000..1acd03a7 --- /dev/null +++ b/tests/Test-iri-forced-remote.px @@ -0,0 +1,207 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 + +############################################################################### +# Force remote encoding to ISO-8859-1 +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html mime + file = iso-8859-1 +# p3_€€€.html meta + file = utf-8, mime = iso-8859-1 +# + +my $ccedilla_l15 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; +my $eurosign_l15 = "\xA4"; +my $eurosign_u8 = "\xE2\x82\xAC"; +my $currency_l1 = "\xA4"; +my $currency_u8 = "\xC2\xA4"; + +my $pageindex = < + + Main Page + + +

+ Link to page 1 La seule page en français. + Link to page 3 My tailor is rich. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ Link to page 2 Die enkele nerderlangstalige pagina. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + +

+ Één is niet veel maar toch meer dan nul.
+ Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :) +

+ + +EOF + +my $pageeuro = < + + Euro page + + +

+ My tailor isn't rich anymore. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, + '/p3_%A4%A4%A4.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, + '/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => $pageeuro, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $pageindex, + }, + 'robots.txt' => { + content => "", + }, + "p1_fran${ccedilla_l15}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, + "p3_${currency_u8}${currency_u8}${currency_u8}.html" => { + content => $pageeuro, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-forced-remote", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px new file mode 100755 index 00000000..51bb09fe --- /dev/null +++ b/tests/Test-iri-list.px @@ -0,0 +1,173 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html meta + file = utf-8, mime =iso-8859-1 +# + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; + +my $urllist = < + + Main Page + + +

+ Main page. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ French page. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + + +

+ Dutch page. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/url_list.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $urllist, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri -d -i http://localhost:{{port}}/url_list.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'url_list.txt' => { + content => $urllist, + }, + 'index.html' => { + content => $pageindex, + }, + "p1_fran${ccedilla_l1}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-list", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/Test-iri.px b/tests/Test-iri.px new file mode 100755 index 00000000..d228721c --- /dev/null +++ b/tests/Test-iri.px @@ -0,0 +1,224 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 + +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html meta + file = utf-8, mime =iso-8859-1 +# p3_€€€.html meta + file = utf-8, mime = iso-8859-1 +# p4_méér.html mime + file = utf-8 +# + +my $ccedilla_l15 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; +my $eurosign_l15 = "\xA4"; +my $eurosign_u8 = "\xE2\x82\xAC"; + +my $pageindex = < + + Main Page + + +

+ Link to page 1 La seule page en français. + Link to page 3 My tailor is rich. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ Link to page 2 Die enkele nederlandstalige pagina. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + + +

+ Één is niet veel maar toch meer dan nul.
+ Nederlands is een mooie taal... dit zinstukje spreekt vanzelf, of niet :)
+ Méér +

+ + +EOF + +my $pageeuro = < + + Euro page + + + +

+ My tailor isn't rich anymore. +

+ + +EOF + +my $pagemeer = < + + Bekende supermarkt + + +

+ Ik ben toch niet gek ! +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $pageeuro, + }, + '/p3_%A4%A4%A4.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $pageeuro, + }, + '/p4_m%C3%A9%C3%A9r.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=UTF-8", + }, + content => $pagemeer, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'index.html' => { + content => $pageindex, + }, + 'robots.txt' => { + content => "", + }, + "p1_fran${ccedilla_l15}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, + "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => { + content => $pageeuro, + }, + "p4_m${eacute_u8}${eacute_u8}r.html" => { + content => $pagemeer, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 50db5819..c18c8d85 100755 --- a/tests/run-px +++ b/tests/run-px @@ -17,9 +17,16 @@ my @tests = ( 'Test-E-k-K.px', 'Test-E-k.px', 'Test-ftp.px', + 'Test-ftp-iri.px', + 'Test-ftp-iri-fallback.px', + 'Test-ftp-iri-disabled.px', 'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition.px', + 'Test-iri.px', + 'Test-iri-disabled.px', + 'Test-iri-forced-remote.px', + 'Test-iri-list.px', 'Test-N-current.px', 'Test-N-smaller.px', 'Test-N-no-info.px',