From 59b920874daa565a1323ffa1e756e80493190686 Mon Sep 17 00:00:00 2001 From: Eli Zaretskii Date: Fri, 18 Dec 2015 17:03:26 +0200 Subject: [PATCH] Support non-ASCII URLs * src/url.c [HAVE_ICONV]: Include iconv.h and langinfo.h. (convert_fname): New function. [HAVE_ICONV]: Convert file name from remote encoding to local encoding. (url_file_name): Call convert_fname. (filechr_table): Don't consider bytes in 128..159 as control characters. * tests/Test-ftp-iri.px: Fix the expected file name to match the new file-name recoding. State the remote encoding explicitly on the Wget command line. * NEWS: Mention the URI recoding when built with libiconv. --- NEWS | 7 ++++ src/url.c | 87 ++++++++++++++++++++++++++++++++++++++++++- tests/Test-ftp-iri.px | 4 +- 3 files changed, 94 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index c8cebad3..c63c678f 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,13 @@ Please send GNU Wget bug reports to . * Changes in Wget X.Y.Z +* When Wget is built with libiconv, it now converts non-ASCII URIs to + the locale's codeset when it creates files. The encoding of the + remote files and URIs is taken from --remote-encoding, defaulting to + UTF-8. The result is that non-ASCII URIs and files downloaded via + HTTP/HTTPS and FTP will have names on the local filesystem that + correspond to their remote names. + * Changes in Wget 1.17.1 * Fix compile error when IPv6 is disabled or SSL is not present. diff --git a/src/url.c b/src/url.c index c62867fc..ca7fe29b 100644 --- a/src/url.c +++ b/src/url.c @@ -43,6 +43,11 @@ as that of the covered work. */ #include "host.h" /* for is_valid_ipv6_address */ #include "c-strcase.h" +#if HAVE_ICONV +#include +#include +#endif + #ifdef __VMS #include "vms.h" #endif /* def __VMS */ @@ -1399,8 +1404,8 @@ UVWC, VC, VC, VC, VC, VC, VC, VC, /* NUL SOH STX ETX EOT ENQ ACK BEL */ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */ - C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */ - C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1531,6 +1536,82 @@ append_uri_pathel (const char *b, const char *e, bool escaped, append_null (dest); } +static char * +convert_fname (const char *fname) +{ + char *converted_fname = (char *)fname; +#if HAVE_ICONV + const char *from_encoding = opt.encoding_remote; + const char *to_encoding = opt.locale; + iconv_t cd; + size_t len, done, inlen, outlen; + char *s; + const char *orig_fname = fname;; + + /* Defaults for remote and local encodings. */ + if (!from_encoding) + from_encoding = "UTF-8"; + if (!to_encoding) + to_encoding = nl_langinfo (CODESET); + + cd = iconv_open (to_encoding, from_encoding); + if (cd == (iconv_t)(-1)) + logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"), + quote (from_encoding), quote (to_encoding)); + else + { + inlen = strlen (fname); + len = outlen = inlen * 2; + converted_fname = s = xmalloc (outlen + 1); + done = 0; + + for (;;) + { + if (iconv (cd, &fname, &inlen, &s, &outlen) != (size_t)(-1) + && iconv (cd, NULL, NULL, &s, &outlen) != (size_t)(-1)) + { + *(converted_fname + len - outlen - done) = '\0'; + iconv_close(cd); + DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n", + orig_fname, from_encoding, converted_fname, to_encoding)); + xfree (orig_fname); + return converted_fname; + } + + /* Incomplete or invalid multibyte sequence */ + if (errno == EINVAL || errno == EILSEQ) + { + logprintf (LOG_VERBOSE, + _("Incomplete or invalid multibyte sequence encountered\n")); + xfree (converted_fname); + converted_fname = (char *)orig_fname; + break; + } + else if (errno == E2BIG) /* Output buffer full */ + { + done = len; + len = outlen = done + inlen * 2; + converted_fname = xrealloc (converted_fname, outlen + 1); + s = converted_fname + done; + } + else /* Weird, we got an unspecified error */ + { + logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno); + xfree (converted_fname); + converted_fname = (char *)orig_fname; + break; + } + } + DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n", + orig_fname, from_encoding, to_encoding)); + } + + iconv_close(cd); +#endif + + return converted_fname; +} + /* Append to DEST the directory structure that corresponds the directory part of URL's path. For example, if the URL is http://server/dir1/dir2/file, this appends "/dir1/dir2". @@ -1706,6 +1787,8 @@ url_file_name (const struct url *u, char *replaced_filename) xfree (temp_fnres.base); + fname = convert_fname (fname); + /* Check the cases in which the unique extensions are not used: 1) Clobbering is turned off (-nc). 2) Retrieval with regetting. diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px index e196dbe7..42e0eca5 100755 --- a/tests/Test-ftp-iri.px +++ b/tests/Test-ftp-iri.px @@ -26,12 +26,12 @@ my %urls = ( }, ); -my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; +my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 --remote-encoding=utf-8 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; my $expected_error_code = 0; my %expected_downloaded_files = ( - "fran${ccedilla_u8}ais.txt" => { + "fran${ccedilla_l1}ais.txt" => { content => $francais, }, );