mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Support non-ASCII URLs
* src/url.c [HAVE_ICONV]: Include iconv.h and langinfo.h. (convert_fname): New function. [HAVE_ICONV]: Convert file name from remote encoding to local encoding. (url_file_name): Call convert_fname. (filechr_table): Don't consider bytes in 128..159 as control characters. * tests/Test-ftp-iri.px: Fix the expected file name to match the new file-name recoding. State the remote encoding explicitly on the Wget command line. * NEWS: Mention the URI recoding when built with libiconv.
This commit is contained in:
parent
9a6e63bee9
commit
59b920874d
7
NEWS
7
NEWS
@ -9,6 +9,13 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
|
|||||||
|
|
||||||
* Changes in Wget X.Y.Z
|
* Changes in Wget X.Y.Z
|
||||||
|
|
||||||
|
* When Wget is built with libiconv, it now converts non-ASCII URIs to
|
||||||
|
the locale's codeset when it creates files. The encoding of the
|
||||||
|
remote files and URIs is taken from --remote-encoding, defaulting to
|
||||||
|
UTF-8. The result is that non-ASCII URIs and files downloaded via
|
||||||
|
HTTP/HTTPS and FTP will have names on the local filesystem that
|
||||||
|
correspond to their remote names.
|
||||||
|
|
||||||
* Changes in Wget 1.17.1
|
* Changes in Wget 1.17.1
|
||||||
|
|
||||||
* Fix compile error when IPv6 is disabled or SSL is not present.
|
* Fix compile error when IPv6 is disabled or SSL is not present.
|
||||||
|
87
src/url.c
87
src/url.c
@ -43,6 +43,11 @@ as that of the covered work. */
|
|||||||
#include "host.h" /* for is_valid_ipv6_address */
|
#include "host.h" /* for is_valid_ipv6_address */
|
||||||
#include "c-strcase.h"
|
#include "c-strcase.h"
|
||||||
|
|
||||||
|
#if HAVE_ICONV
|
||||||
|
#include <iconv.h>
|
||||||
|
#include <langinfo.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __VMS
|
#ifdef __VMS
|
||||||
#include "vms.h"
|
#include "vms.h"
|
||||||
#endif /* def __VMS */
|
#endif /* def __VMS */
|
||||||
@ -1399,8 +1404,8 @@ UVWC, VC, VC, VC, VC, VC, VC, VC, /* NUL SOH STX ETX EOT ENQ ACK BEL */
|
|||||||
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
|
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
|
||||||
0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
|
0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
|
||||||
|
|
||||||
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
|
||||||
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
|
||||||
@ -1531,6 +1536,82 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
|
|||||||
append_null (dest);
|
append_null (dest);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static char *
|
||||||
|
convert_fname (const char *fname)
|
||||||
|
{
|
||||||
|
char *converted_fname = (char *)fname;
|
||||||
|
#if HAVE_ICONV
|
||||||
|
const char *from_encoding = opt.encoding_remote;
|
||||||
|
const char *to_encoding = opt.locale;
|
||||||
|
iconv_t cd;
|
||||||
|
size_t len, done, inlen, outlen;
|
||||||
|
char *s;
|
||||||
|
const char *orig_fname = fname;;
|
||||||
|
|
||||||
|
/* Defaults for remote and local encodings. */
|
||||||
|
if (!from_encoding)
|
||||||
|
from_encoding = "UTF-8";
|
||||||
|
if (!to_encoding)
|
||||||
|
to_encoding = nl_langinfo (CODESET);
|
||||||
|
|
||||||
|
cd = iconv_open (to_encoding, from_encoding);
|
||||||
|
if (cd == (iconv_t)(-1))
|
||||||
|
logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
|
||||||
|
quote (from_encoding), quote (to_encoding));
|
||||||
|
else
|
||||||
|
{
|
||||||
|
inlen = strlen (fname);
|
||||||
|
len = outlen = inlen * 2;
|
||||||
|
converted_fname = s = xmalloc (outlen + 1);
|
||||||
|
done = 0;
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
if (iconv (cd, &fname, &inlen, &s, &outlen) != (size_t)(-1)
|
||||||
|
&& iconv (cd, NULL, NULL, &s, &outlen) != (size_t)(-1))
|
||||||
|
{
|
||||||
|
*(converted_fname + len - outlen - done) = '\0';
|
||||||
|
iconv_close(cd);
|
||||||
|
DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
|
||||||
|
orig_fname, from_encoding, converted_fname, to_encoding));
|
||||||
|
xfree (orig_fname);
|
||||||
|
return converted_fname;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Incomplete or invalid multibyte sequence */
|
||||||
|
if (errno == EINVAL || errno == EILSEQ)
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE,
|
||||||
|
_("Incomplete or invalid multibyte sequence encountered\n"));
|
||||||
|
xfree (converted_fname);
|
||||||
|
converted_fname = (char *)orig_fname;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (errno == E2BIG) /* Output buffer full */
|
||||||
|
{
|
||||||
|
done = len;
|
||||||
|
len = outlen = done + inlen * 2;
|
||||||
|
converted_fname = xrealloc (converted_fname, outlen + 1);
|
||||||
|
s = converted_fname + done;
|
||||||
|
}
|
||||||
|
else /* Weird, we got an unspecified error */
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
|
||||||
|
xfree (converted_fname);
|
||||||
|
converted_fname = (char *)orig_fname;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
|
||||||
|
orig_fname, from_encoding, to_encoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
iconv_close(cd);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return converted_fname;
|
||||||
|
}
|
||||||
|
|
||||||
/* Append to DEST the directory structure that corresponds the
|
/* Append to DEST the directory structure that corresponds the
|
||||||
directory part of URL's path. For example, if the URL is
|
directory part of URL's path. For example, if the URL is
|
||||||
http://server/dir1/dir2/file, this appends "/dir1/dir2".
|
http://server/dir1/dir2/file, this appends "/dir1/dir2".
|
||||||
@ -1706,6 +1787,8 @@ url_file_name (const struct url *u, char *replaced_filename)
|
|||||||
|
|
||||||
xfree (temp_fnres.base);
|
xfree (temp_fnres.base);
|
||||||
|
|
||||||
|
fname = convert_fname (fname);
|
||||||
|
|
||||||
/* Check the cases in which the unique extensions are not used:
|
/* Check the cases in which the unique extensions are not used:
|
||||||
1) Clobbering is turned off (-nc).
|
1) Clobbering is turned off (-nc).
|
||||||
2) Retrieval with regetting.
|
2) Retrieval with regetting.
|
||||||
|
@ -26,12 +26,12 @@ my %urls = (
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
|
my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 --remote-encoding=utf-8 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
|
||||||
|
|
||||||
my $expected_error_code = 0;
|
my $expected_error_code = 0;
|
||||||
|
|
||||||
my %expected_downloaded_files = (
|
my %expected_downloaded_files = (
|
||||||
"fran${ccedilla_u8}ais.txt" => {
|
"fran${ccedilla_l1}ais.txt" => {
|
||||||
content => $francais,
|
content => $francais,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
Loading…
Reference in New Issue
Block a user