Support non-ASCII URLs

* src/url.c [HAVE_ICONV]: Include iconv.h and langinfo.h.
(convert_fname): New function.
[HAVE_ICONV]: Convert file name from remote encoding to local
encoding.
(url_file_name): Call convert_fname.
(filechr_table): Don't consider bytes in 128..159 as control
characters.

* tests/Test-ftp-iri.px: Fix the expected file name to match the
new file-name recoding.  State the remote encoding explicitly on
the Wget command line.

* NEWS: Mention the URI recoding when built with libiconv.
This commit is contained in:
Eli Zaretskii 2015-12-18 17:03:26 +02:00 committed by Tim Rühsen
parent 9a6e63bee9
commit 59b920874d
3 changed files with 94 additions and 4 deletions

7
NEWS
View File

@ -9,6 +9,13 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
* Changes in Wget X.Y.Z * Changes in Wget X.Y.Z
* When Wget is built with libiconv, it now converts non-ASCII URIs to
the locale's codeset when it creates files. The encoding of the
remote files and URIs is taken from --remote-encoding, defaulting to
UTF-8. The result is that non-ASCII URIs and files downloaded via
HTTP/HTTPS and FTP will have names on the local filesystem that
correspond to their remote names.
* Changes in Wget 1.17.1 * Changes in Wget 1.17.1
* Fix compile error when IPv6 is disabled or SSL is not present. * Fix compile error when IPv6 is disabled or SSL is not present.

View File

@ -43,6 +43,11 @@ as that of the covered work. */
#include "host.h" /* for is_valid_ipv6_address */ #include "host.h" /* for is_valid_ipv6_address */
#include "c-strcase.h" #include "c-strcase.h"
#if HAVE_ICONV
#include <iconv.h>
#include <langinfo.h>
#endif
#ifdef __VMS #ifdef __VMS
#include "vms.h" #include "vms.h"
#endif /* def __VMS */ #endif /* def __VMS */
@ -1399,8 +1404,8 @@ UVWC, VC, VC, VC, VC, VC, VC, VC, /* NUL SOH STX ETX EOT ENQ ACK BEL */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */ 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -1531,6 +1536,82 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
append_null (dest); append_null (dest);
} }
/* Convert the file name FNAME from the remote encoding to the local one.

   The source encoding is opt.encoding_remote, falling back to "UTF-8";
   the target encoding is opt.locale, falling back to
   nl_langinfo (CODESET).

   On success a newly allocated, NUL-terminated string is returned and
   FNAME is freed, so the caller must replace its pointer with the
   return value.  If the conversion is not supported or fails, or if
   HAVE_ICONV is not set, FNAME itself is returned unchanged.  */
static char *
convert_fname (const char *fname)
{
  char *converted_fname = (char *) fname;
#if HAVE_ICONV
  const char *from_encoding = opt.encoding_remote;
  const char *to_encoding = opt.locale;
  const char *orig_fname = fname;
  /* Input cursor handed to iconv, which advances it as it consumes
     bytes.  POSIX declares that parameter as `char **', hence the
     non-const alias of FNAME.  */
  char *in = (char *) fname;
  iconv_t cd;
  size_t inlen, outlen;
  char *s;

  /* Defaults for remote and local encodings.  */
  if (!from_encoding)
    from_encoding = "UTF-8";
  if (!to_encoding)
    to_encoding = nl_langinfo (CODESET);

  cd = iconv_open (to_encoding, from_encoding);
  if (cd == (iconv_t)(-1))
    {
      /* NOTE(review): if quote () hands back one shared buffer, both %s
         arguments will show the same text -- confirm, and consider
         using distinct quoting slots.  */
      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
                 quote (from_encoding), quote (to_encoding));
      /* There is no valid descriptor to close on this path.  */
      return converted_fname;
    }

  inlen = strlen (fname);
  outlen = inlen * 2;
  converted_fname = s = xmalloc (outlen + 1);

  for (;;)
    {
      if (iconv (cd, &in, &inlen, &s, &outlen) != (size_t)(-1)
          && iconv (cd, NULL, NULL, &s, &outlen) != (size_t)(-1))
        {
          /* S always points just past the last byte written, no matter
             how many times the buffer has been enlarged.  */
          *s = '\0';
          iconv_close (cd);
          DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
                   orig_fname, from_encoding, converted_fname, to_encoding));
          xfree (orig_fname);
          return converted_fname;
        }

      if (errno == E2BIG) /* Output buffer full */
        {
          /* Keep what has been produced so far and enlarge the buffer.
             The constant guarantees progress even when INLEN is already
             zero, i.e. when the flush call ran out of room.  */
          size_t used = (size_t) (s - converted_fname);
          size_t grow = inlen * 2 + 16;

          converted_fname = xrealloc (converted_fname,
                                      used + outlen + grow + 1);
          s = converted_fname + used;
          outlen += grow;
          continue;
        }

      if (errno == EINVAL || errno == EILSEQ)
        /* Incomplete or invalid multibyte sequence */
        logprintf (LOG_VERBOSE,
                   _("Incomplete or invalid multibyte sequence encountered\n"));
      else
        /* Weird, we got an unspecified error */
        logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);

      /* Give up and fall back to the unconverted name.  */
      xfree (converted_fname);
      converted_fname = (char *) orig_fname;
      break;
    }

  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
           orig_fname, from_encoding, to_encoding));
  iconv_close (cd);
#endif
  return converted_fname;
}
/* Append to DEST the directory structure that corresponds to the /* Append to DEST the directory structure that corresponds to the
directory part of URL's path. For example, if the URL is directory part of URL's path. For example, if the URL is
http://server/dir1/dir2/file, this appends "/dir1/dir2". http://server/dir1/dir2/file, this appends "/dir1/dir2".
@ -1706,6 +1787,8 @@ url_file_name (const struct url *u, char *replaced_filename)
xfree (temp_fnres.base); xfree (temp_fnres.base);
fname = convert_fname (fname);
/* Check the cases in which the unique extensions are not used: /* Check the cases in which the unique extensions are not used:
1) Clobbering is turned off (-nc). 1) Clobbering is turned off (-nc).
2) Retrieval with regetting. 2) Retrieval with regetting.

View File

@ -26,12 +26,12 @@ my %urls = (
}, },
); );
my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt"; my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 --remote-encoding=utf-8 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
my $expected_error_code = 0; my $expected_error_code = 0;
my %expected_downloaded_files = ( my %expected_downloaded_files = (
"fran${ccedilla_u8}ais.txt" => { "fran${ccedilla_l1}ais.txt" => {
content => $francais, content => $francais,
}, },
); );