1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

Merge with mainline.

This commit is contained in:
Micah Cowan 2009-06-25 01:14:11 -07:00
commit 4f3dd68173
46 changed files with 2261 additions and 123 deletions

View File

@ -49,6 +49,14 @@
* AUTHORS: Added Steven Schubiger.
2008-06-26 Xavier Saint <wget@sxav.eu>
* configure.ac: IRI support requires libiconv; check for it.
2008-06-14 Xavier Saint <wget@sxav.eu>
* configure.ac: Add support for IRIs
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).

View File

@ -462,6 +462,77 @@ else
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
dnl
dnl Check for IDN/IRIs
dnl
AC_ARG_ENABLE(iri,
AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
[case "${enable_iri}" in
no)
dnl Disable IRIs checking
AC_MSG_NOTICE([disabling IRIs at user request])
iri=no
;;
yes)
dnl IRIs explicitly enabled
iri=yes
force_iri=yes
;;
auto)
dnl Auto-detect IRI
iri=yes
;;
*)
AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
;;
esac
], [
dnl If nothing is specified, assume auto-detection
iri=yes
]
)
AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
[Support IDN/IRIs (needs GNU Libidn)]),
libidn=$withval, libidn="")
if test "X$iri" != "Xno"; then
AM_ICONV
if test "X$am_cv_func_iconv" != "Xyes"; then
iri=no
if test "X$force_iri" = "Xyes"; then
AC_MSG_ERROR([Libiconv is required for IRIs support])
else
AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
fi
fi
fi
if test "X$iri" != "Xno"; then
if test "$libidn" != ""; then
LDFLAGS="${LDFLAGS} -L$libidn/lib"
CPPFLAGS="${CPPFLAGS} -I$libidn/include"
fi
AC_CHECK_HEADER(idna.h,
AC_CHECK_LIB(idn, stringprep_check_version,
[iri=yes LIBS="${LIBS} -lidn"], iri=no),
iri=no)
if test "X$iri" != "Xno" ; then
AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
AC_MSG_NOTICE([Enabling support for IRI.])
else
AC_MSG_WARN([Libidn not found])
fi
fi
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
dnl
dnl Create output
dnl

View File

@ -70,6 +70,15 @@
* wget.texi (Robot Exclusion): Fixed typo "downloads" ->
"download"
2008-08-03 Xavier Saint <wget@sxav.eu>
* wget.texi : Add option descriptions for the three new
options --iri, --locale and --remote-encoding related to
IRI support.
* sample.wgetrc : Add commented lines for the three new
commands iri, locale and encoding related to IRI support.
2008-08-03 Micah Cowan <micah@cowan.name>
* wget.texi: Don't set UPDATED; already set by version.texi.

View File

@ -114,3 +114,12 @@
# To try ipv6 addresses first:
#prefer-family = IPv6
# Set default IRI support state
#iri = off
# Force the default system encoding
#locale = UTF-8
# Force the default remote server encoding
#remoteencoding = UTF-8

View File

@ -675,6 +675,30 @@ Another instance where you'll get a garbled file if you try to use
Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
servers that support the @code{Range} header.
@cindex iri support
@cindex idn support
@item --iri
Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
turn it off. IRI support is activated by default.
You can set the default state of IRI support using @code{iri} command in
@file{.wgetrc}. That setting may be overridden from the command line.
@cindex local encoding
@cindex locale
@item --locale=@var{encoding}
Force Wget to use @var{encoding} as the default system encoding. That affects
how Wget converts URLs specified as arguments from locale to @sc{utf-8} for
IRI support.
Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
environment variable to get the locale. If it fails, @sc{ascii} is used.
You can set the default locale using the @code{locale} command in
@file{.wgetrc}. That setting may be overridden from the command line.
@cindex progress indicator
@cindex dot style
@item --progress=@var{type}
@ -706,6 +730,21 @@ command line. The exception is that, when the output is not a TTY, the
``dot'' progress will be favored over ``bar''. To force the bar output,
use @samp{--progress=bar:force}.
@cindex remote encoding
@item --remote-encoding=@var{encoding}
Force Wget to use @var{encoding} as the default remote server encoding.
That affects how Wget converts URIs found in files from remote encoding to
@sc{utf-8} during a recursive fetch. This option is only useful for
IRI support, for the interpretation of non-@sc{ascii} characters.
For HTTP, remote encoding can be found in HTTP @code{Content-Type}
header and in HTML @code{Content-Type http-equiv} meta tag.
You can set the default encoding using the @code{remoteencoding}
command in @file{.wgetrc}. That setting may be overridden from the
command line.
@item -N
@itemx --timestamping
Turn on time-stamping. @xref{Time-Stamping}, for details.

View File

@ -256,11 +256,27 @@
* init.c (cleanup): Free the memory associated with the base
option (when DEBUG_MALLOC is defined).
2008-07-02 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function idn_decode() to decode ASCII
encoded hostname to the locale.
* host.c : Show hostname to be resolved both in locale and
ASCII encoded.
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP).
2008-06-26 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New functions locale_to_utf8() and
idn_encode() adding basic capabilities of IRI/IDN.
* url.c : Convert URLs from locale to UTF-8 allowing a basic
support of IRI/IDN
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
@ -293,12 +309,57 @@
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function check_encoding_name() as
a preliminary encoding name check.
* main.c, iri.c : Make use of check_encoding_name().
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c : Include missing stringprep.h file and add a
cast.
* init.c : set a default initial value for opt.enable_iri,
opt.locale and opt.encoding_remote.
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : Add a new function find_locale() to find
out the local system encoding.
* main.c : Make use of find_locale().
2008-06-19 Xavier Saint <wget@sxav.eu>
* html-url.c : Add "content-type" meta tag parsing for
retrieving page encoding.
* iri.h : Make no-op version of parse_charset() return
NULL.
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
2008-06-14 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New files.
* Makefile.am : Add files iri.h and conditional iri.c.
* build_info.c : Add compiled feature "iri".
* http.c : include iri.h and parse charset from Content-Type
header.
* init.c, main.c, options.h : if an option isn't supported
at compile time, don't get rid of it; show a dummy
message instead if it is used.
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL

View File

@ -30,6 +30,10 @@
# Version: @VERSION@
#
if IRI_IS_ENABLED
IRI_OBJ = iri.c
endif
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
@ -40,7 +44,7 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
utils.c \
utils.c $(IRI_OBJ) \
css-url.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \

View File

@ -103,6 +103,13 @@ const char* (compiled_features[]) =
#else
"-gettext",
#endif
#ifdef ENABLE_IRI
"+iri",
#else
"-iri",
#endif
/* sentinel value */
NULL
};

View File

@ -271,9 +271,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
if (print)
{
const char *txt_addr = print_address (ip);
if (print && 0 != strcmp (print, txt_addr))
if (0 != strcmp (print, txt_addr))
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
{
int len = strlen (print) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, print);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
escnonprint_uri (print), txt_addr, port);
str ? str : escnonprint_uri (print), txt_addr, port);
if (str)
xfree (str);
}
else
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
}

View File

@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
/* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, NULL);
get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the

View File

@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
return FTPRERR;
/* Strip trailing CRLF before printing the line, so that
escnonprint doesn't include bogus \012 and \015. */
quoting doesn't include bogus \012 and \015. */
p = strchr (line, '\0');
if (p > line && p[-1] == '\n')
*--p = '\0';

View File

@ -718,8 +718,24 @@ lookup_host (const char *host, int flags)
/* No luck with the cache; resolve HOST. */
if (!silent && !numeric_address)
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
{
int len = strlen (host) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, host);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, host));
quotearg_style (escape_quoting_style, str ? str : host));
if (str)
xfree (str);
}
#ifdef ENABLE_IPV6
{

View File

@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
/* Will contain the (last) charset found in 'http-equiv=content-type'
meta tags */
static char *meta_charset;
static void
init_interesting (void)
{
@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL;
}
url = url_parse (link_uri, NULL);
url = url_parse (link_uri, NULL, NULL, false);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
url = url_parse (complete_uri, NULL);
url = url_parse (complete_uri, NULL, NULL, false);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
entry->link_expect_html = 1;
}
}
else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
{
/* Handle stuff like:
<meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
char *mcharset;
char *content = find_attr (tag, "content", NULL);
if (!content)
return;
mcharset = parse_charset (content);
if (!mcharset)
return;
xfree_null (meta_charset);
meta_charset = mcharset;
}
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
<base href=...> and does the right thing. */
struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
/* If meta charset isn't null, override content encoding */
if (iri && meta_charset)
set_content_encoding (iri, meta_charset);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
@ -726,7 +752,7 @@ get_urls_file (const char *file)
url_text = merged;
}
url = url_parse (url_text, &up_error_code);
url = url_parse (url_text, &up_error_code, NULL, false);
if (!url)
{
char *error = url_error (url_text, up_error_code);

View File

@ -44,7 +44,7 @@ struct map_context {
};
struct urlpos *get_urls_file (const char *);
struct urlpos *get_urls_html (const char *, const char *, bool *);
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *);

View File

@ -1366,7 +1366,8 @@ free_hstat (struct http_stat *hs)
If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */
static uerr_t
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
struct iri *iri)
{
struct request *req;
@ -2061,9 +2062,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
char *tmp = strchr (type, ';');
if (tmp)
{
/* sXXXav: only needed if IRI support is enabled */
char *tmp2 = tmp + 1;
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
{
tmp = parse_charset (tmp2);
if (tmp)
set_content_encoding (iri, tmp);
}
}
}
hs->newloc = resp_header_strdup (resp, "Location");
@ -2348,7 +2360,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
retried, and retried, and retried, and... */
uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
int *dt, struct url *proxy)
int *dt, struct url *proxy, struct iri *iri)
{
int count;
bool got_head = false; /* used for time-stamping and filename detection */
@ -2515,7 +2527,7 @@ Spider mode enabled. Check if remote file exists.\n"));
*dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */
err = gethttp (u, &hstat, dt, proxy);
err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */
tms = datetime_str (time (NULL));
@ -2593,8 +2605,10 @@ Spider mode enabled. Check if remote file exists.\n"));
continue;
}
/* Maybe we should always keep track of broken links, not just in
* spider mode. */
else if (opt.spider)
* spider mode.
* Don't log error if it was UTF-8 encoded because we will try
* once unencoded. */
else if (opt.spider && !iri->utf8_encode)
{
/* #### Again: ugly ugly ugly! */
if (!hurl)

View File

@ -33,7 +33,7 @@ as that of the covered work. */
struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
struct url *);
struct url *, struct iri *);
void save_cookies (void);
void http_cleanup (void);
time_t http_atotm (const char *);

View File

@ -177,9 +177,11 @@ static const struct {
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
{ "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
{ "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
@ -219,6 +221,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
@ -328,6 +331,14 @@ defaults (void)
opt.max_redirect = 20;
opt.waitretry = 10;
#ifdef ENABLE_IRI
opt.enable_iri = true;
#else
opt.enable_iri = false;
#endif
opt.locale = NULL;
opt.encoding_remote = NULL;
}
/* Return the user's home directory (strdup-ed), or NULL if none is

350
src/iri.c Normal file
View File

@ -0,0 +1,350 @@
/* IRI related functions.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <iconv.h>
#include <stringprep.h>
#include <idna.h>
#include <errno.h>
#include "utils.h"
/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
/* Note: locale encoding is kept in options struct (opt.locale) */
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
/* Extract the value of a "charset=XXX" parameter from STR.

   The match on "charset=" is case-insensitive; the value runs from just
   after the '=' up to the next whitespace character or the end of the
   string.  Returns a freshly allocated copy of the value when it passes
   check_encoding_name(), or NULL when STR is NULL/empty, holds no
   "charset=" parameter, or the value is rejected.  The caller owns the
   returned string.  */
char *
parse_charset (char *str)
{
  char *begin, *end, *name;

  if (str == NULL || *str == '\0')
    return NULL;

  begin = strcasestr (str, "charset=");
  if (begin == NULL)
    return NULL;
  begin += 8;                   /* skip over "charset=" */

  /* TODO (from the original author): decide which characters besides
     whitespace should terminate or invalidate the value.  */
  for (end = begin; *end != '\0' && !c_isspace (*end); end++)
    ;

  /* TODO (from the original author): can strdupdelim return NULL?  */
  name = strdupdelim (begin, end);

  /* Only hand back values that pass the minimal sanity check.  */
  if (check_encoding_name (name))
    return name;

  xfree (name);
  return NULL;
}
/* Return the name of the character set used by the current locale, as
   reported by libidn's stringprep_locale_charset().  The const qualifier
   of the library's result is cast away to fit this function's return
   type.  */
char *
find_locale (void)
{
  const char *charset = stringprep_locale_charset ();

  return (char *) charset;
}
/* Minimal sanity check of an encoding name: every character must be
   ASCII and must not be whitespace.  Returns true when ENCODING passes
   (an empty string passes trivially); otherwise logs a verbose message
   naming the offending encoding and returns false.  */
bool
check_encoding_name (char *encoding)
{
  char *p;

  for (p = encoding; *p != '\0'; p++)
    {
      if (c_isascii (*p) && !c_isspace (*p))
        continue;

      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
      return false;
    }

  return true;
}
/* Try opening an iconv_t descriptor for conversion from locale to UTF-8.

   This is an unimplemented stub: it opens nothing.  It used to fall off
   the end of a non-void function, which is undefined behaviour as soon
   as a caller looks at the result; it now explicitly reports failure.
   locale_to_utf8() opens its own descriptor and does not call this.  */
static bool
open_locale_to_utf8 (void)
{
  return false;
}
/* Try converting string STR from the locale encoding (opt.locale) to
   UTF-8.

   Returns a newly allocated UTF-8 string on success.  Returns STR itself
   when no conversion is needed (the locale is unknown or already UTF-8),
   when iconv cannot convert from the locale, or when the conversion
   fails.  Callers can therefore tell whether a new string was allocated
   by comparing the result with STR.  */
const char *
locale_to_utf8 (const char *str)
{
  iconv_t l2u;
  char *new;
  bool converted;

  /* That shouldn't happen, just in case */
  if (!opt.locale)
    {
      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
      opt.locale = find_locale ();
    }

  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
    return str;

  l2u = iconv_open ("UTF-8", opt.locale);
  /* iconv_open signals failure with (iconv_t) -1.  The test used to be
     inverted, so every *successful* open was reported as unsupported and
     a failed open went on to call iconv with an invalid descriptor.  */
  if (l2u == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
                 quote (opt.locale), quote ("UTF-8"));
      return str;
    }

  converted = do_conversion (l2u, (char *) str, strlen ((char *) str), &new);

  /* Release the descriptor whatever the outcome; it used to be leaked.  */
  iconv_close (l2u);

  return converted ? (const char *) new : str;
}
/* Convert the INLEN bytes at IN using the conversion descriptor CD.

   On success, returns true and stores in *OUT a newly allocated,
   NUL-terminated string holding the converted text; the caller owns it.

   Input bytes that iconv rejects as an invalid or incomplete multibyte
   sequence are copied through unchanged (a verbose message is logged
   the first time this happens) and the conversion carries on.

   On any other iconv error, returns false.  *OUT then points at the
   NUL-terminated partial output, so that it is at least a valid string.
   NOTE(review): the callers visible in this file ignore *OUT on failure,
   so that buffer is leaked on this (rare) path.

   The output buffer starts at twice the input size (a guess) and is
   grown on demand.  Earlier revisions mis-tracked the free space after
   growing: the space handed to iconv included the bytes already written,
   which allowed writes past the end of the buffer, and the final
   terminator was placed at the wrong offset.  The write cursor P and the
   free-space counter OUTLEN are now always updated together.  */
static bool
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
{
  size_t outlen = inlen * 2;    /* free bytes after P, NUL slot excluded */
  bool warned = false;          /* invalid-sequence message already logged? */
  char *s = xmalloc (outlen + 1);
  char *p = s;                  /* write cursor inside S */

  for (;;)
    {
      int err;
      bool bad_input;

      if (iconv (cd, &in, &inlen, &p, &outlen) != (size_t)(-1))
        {
          *p = '\0';
          *out = s;
          return true;
        }

      /* Save errno: the calls below may overwrite it.  */
      err = errno;
      bad_input = (err == EINVAL || err == EILSEQ);

      if (err == E2BIG || (bad_input && outlen == 0))
        {
          /* Output buffer full (or no room left to copy a rejected byte):
             grow it, keeping what has been produced so far.  */
          size_t used = p - s;
          size_t extra = inlen * 2 + 1;
          char *bigger = xmalloc (used + outlen + extra + 1);

          memcpy (bigger, s, used);
          xfree (s);
          s = bigger;
          p = s + used;
          outlen += extra;
        }
      else if (bad_input && inlen > 0)
        {
          /* Incomplete or invalid multibyte sequence: pass the offending
             byte through as-is and resume after it.  */
          if (!warned)
            {
              logprintf (LOG_VERBOSE,
                         "Incomplete or invalide multibyte sequence encountered\n");
              warned = true;
            }
          *p++ = *in++;
          inlen--;
          outlen--;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
          break;
        }
    }

  /* Failure: leave *OUT pointing at a well-formed (partial) string.  */
  *p = '\0';
  *out = s;
  return false;
}
/* Try to "ASCII encode" (IDNA ToASCII) the host name HOST.

   If I->utf8_encode is false, HOST is first transcoded from the remote
   encoding to UTF-8 with remote_to_utf8(); when that reports nothing to
   convert or an error, NULL is returned.

   Returns the newly allocated ASCII form of the domain on success, or
   NULL on error (the IDNA error is logged at verbose level).  */
char *
idn_encode (struct iri *i, char *host)
{
  char *utf8_host = NULL;       /* intermediate UTF-8 copy, if we made one */
  char *ascii_host;
  int ret;

  /* Encode to UTF-8 if not done */
  if (!i->utf8_encode)
    {
      if (!remote_to_utf8 (i, (const char *) host, (const char **) &utf8_host))
        return NULL;  /* Nothing to encode or an error occurred */
      host = utf8_host;
    }

  /* toASCII UTF-8 NUL-terminated string */
  ret = idna_to_ascii_8z (host, &ascii_host, IDNA_FLAGS);

  /* The intermediate UTF-8 copy is no longer needed, whether or not the
     IDNA step succeeded.  It used to be leaked because the same variable
     was reused for the IDNA output.  */
  xfree_null (utf8_host);

  if (ret != IDNA_SUCCESS)
    {
      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
                 quote (idna_strerror (ret)));
      return NULL;
    }

  return ascii_host;
}
/* Decode an "ASCII encoded" host name with libidn's
   idna_to_unicode_8zlz(), using IDNA_FLAGS.  Returns the decoded name
   produced by the library on success; on failure logs the IDNA error at
   verbose level and returns NULL.  */
char *
idn_decode (char *host)
{
  char *decoded;
  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);

  if (rc == IDNA_SUCCESS)
    return decoded;

  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", rc,
             quote (idna_strerror (rc)));
  return NULL;
}
/* Try to transcode string STR from the remote encoding (I->uri_encoding)
   to UTF-8.

   Returns true and stores a newly allocated string in *NEW only when the
   conversion succeeded AND produced something different from STR.
   Returns false when no remote encoding is known, when iconv cannot
   handle it, when the conversion fails, or when the converted text is
   identical to STR.  *NEW must not be used after a false return.  */
bool
remote_to_utf8 (struct iri *i, const char *str, const char **new)
{
  iconv_t cd;
  bool converted;

  if (!i->uri_encoding)
    return false;

  cd = iconv_open ("UTF-8", i->uri_encoding);
  if (cd == (iconv_t)(-1))
    return false;

  converted = do_conversion (cd, (char *) str, strlen ((char *) str),
                             (char **) new);
  iconv_close (cd);

  /* *NEW is only meaningful after a successful conversion.  The
     comparison below used to run unconditionally, reading (and possibly
     freeing) an unspecified pointer when do_conversion had failed.  */
  if (!converted)
    return false;

  /* Test if something was converted */
  if (!strcmp (str, *new))
    {
      xfree ((char *) *new);
      *new = NULL;              /* don't leave a dangling pointer behind */
      return false;
    }

  return true;
}
/* Allocate and initialise a fresh iri structure.  uri_encoding starts as
   a private copy of opt.encoding_remote (NULL when that option is
   unset), content_encoding and orig_url start as NULL, and utf8_encode
   mirrors opt.enable_iri.  Release the result with iri_free().  */
struct iri *
iri_new (void)
{
  struct iri *fresh = xmalloc (sizeof (struct iri));

  if (opt.encoding_remote)
    fresh->uri_encoding = xstrdup (opt.encoding_remote);
  else
    fresh->uri_encoding = NULL;

  fresh->content_encoding = NULL;
  fresh->orig_url = NULL;
  fresh->utf8_encode = opt.enable_iri;

  return fresh;
}
/* Release an iri structure together with every string it owns
   (orig_url, content_encoding, uri_encoding); NULL members are skipped.
   I itself must not be NULL.  */
void
iri_free (struct iri *i)
{
  xfree_null (i->orig_url);
  xfree_null (i->content_encoding);
  xfree_null (i->uri_encoding);
  xfree (i);
}
/* Record CHARSET as the encoding of the URI tracked by I.

   When the user supplied a remote encoding (opt.encoding_remote) the
   call is ignored unless FORCE is true.  If the stored encoding already
   equals CHARSET (compared case-insensitively) nothing changes.
   Otherwise the old value is freed and replaced by a private copy of
   CHARSET, or by NULL when CHARSET is NULL.  */
void
set_uri_encoding (struct iri *i, char *charset, bool force)
{
  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));

  /* A user-specified remote encoding wins unless explicitly overridden.  */
  if (opt.encoding_remote && !force)
    return;

  if (i->uri_encoding != NULL)
    {
      if (charset != NULL && strcasecmp (i->uri_encoding, charset) == 0)
        return;                 /* already set to this encoding */
      xfree (i->uri_encoding);
    }

  if (charset != NULL)
    i->uri_encoding = xstrdup (charset);
  else
    i->uri_encoding = NULL;
}
/* Record CHARSET as the encoding of the links found inside the document
   tracked by I.

   Ignored entirely when the user supplied a remote encoding
   (opt.encoding_remote).  If the stored encoding already equals CHARSET
   (compared case-insensitively) nothing changes.  Otherwise the old
   value is freed and replaced by a private copy of CHARSET, or by NULL
   when CHARSET is NULL.  */
void
set_content_encoding (struct iri *i, char *charset)
{
  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));

  /* A user-specified remote encoding always wins here.  */
  if (opt.encoding_remote)
    return;

  if (i->content_encoding != NULL)
    {
      if (charset != NULL && strcasecmp (i->content_encoding, charset) == 0)
        return;                 /* already set to this encoding */
      xfree (i->content_encoding);
    }

  if (charset != NULL)
    i->content_encoding = xstrdup (charset);
  else
    i->content_encoding = NULL;
}

71
src/iri.h Normal file
View File

@ -0,0 +1,71 @@
/* Internationalization related declarations.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#ifndef IRI_H
#define IRI_H
/* Per-retrieval IRI state: which encodings apply to the URI being
   fetched and to the links found in the fetched document.  The string
   members are released by iri_free() in iri.c.  */
struct iri {
char *uri_encoding; /* Encoding of the uri to fetch */
char *content_encoding; /* Encoding of links inside the fetched file */
char *orig_url; /* NOTE(review): purpose undocumented; iri.c only initialises it to NULL and frees it -- confirm against the code that sets it */
bool utf8_encode; /* Will/Is the current url encoded in utf8 */
};
/* With ENABLE_IRI defined, the entry points below are real functions
   (declared here).  Without it, each one is replaced by a no-op macro so
   that callers need no #ifdef of their own.  */
#ifdef ENABLE_IRI
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
char *idn_encode (struct iri *i, char *host);
char *idn_decode (char *host);
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
struct iri *iri_new (void);
void iri_free (struct iri *i);
void set_uri_encoding (struct iri *i, char *charset, bool force);
void set_content_encoding (struct iri *i, char *charset);
#else /* ENABLE_IRI */
/* Shared placeholder object handed out by the no-op iri_new() below.
   NOTE(review): this is an object definition (not an extern declaration)
   inside a header, so every translation unit that includes it defines
   dummy_iri.  That links only where common symbols are allowed; consider
   `extern' here plus a single definition in one .c file -- confirm
   against the build flags.  */
struct iri dummy_iri;
/* Lookup/conversion helpers: report "nothing found / not converted".  */
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
/* Identity: the string is passed through untouched.  */
#define locale_to_utf8(str) (str)
#define idn_encode(a,b) NULL
#define idn_decode(str) NULL
#define remote_to_utf8(a,b,c) false
/* Allocation is faked with the shared placeholder; freeing does nothing.  */
#define iri_new() (&dummy_iri)
#define iri_free(a)
/* Setters expand to nothing.  */
#define set_uri_encoding(a,b,c)
#define set_content_encoding(a,b)
#endif /* ENABLE_IRI */
#endif /* IRI_H */

View File

@ -43,7 +43,7 @@ as that of the covered work. */
#include "utils.h"
#include "log.h"
/* This file impplement support for "logging". Logging means printing
/* This file implements support for "logging". Logging means printing
output, plus several additional features:
- Cataloguing output by importance. You can specify that a log

View File

@ -202,10 +202,12 @@ static struct cmdline_option option_data[] =
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
{ "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
@ -239,6 +241,7 @@ static struct cmdline_option option_data[] =
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@ -1077,6 +1080,27 @@ for details.\n\n"));
exit (1);
}
#ifdef ENABLE_IRI
if (opt.enable_iri)
{
if (opt.locale && !check_encoding_name (opt.locale))
opt.locale = NULL;
if (!opt.locale)
opt.locale = find_locale ();
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
opt.encoding_remote = NULL;
}
#else
if (opt.enable_iri || opt.locale || opt.encoding_remote)
{
/* sXXXav : be more specific... */
printf(_("This version does not have support for IRIs\n"));
exit(1);
}
#endif
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
@ -1179,7 +1203,7 @@ WARNING: Can't reopen standard output in binary mode;\n\
{
char *filename = NULL, *redirected_URL = NULL;
int dt, url_err;
struct url *url_parsed = url_parse (*t, &url_err);
struct url *url_parsed = url_parse (*t, &url_err, NULL, false);
if (!url_parsed)
{
@ -1199,12 +1223,18 @@ WARNING: Can't reopen standard output in binary mode;\n\
if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
status = retrieve_tree (url_parsed);
status = retrieve_tree (url_parsed, NULL);
opt.follow_ftp = old_follow_ftp;
}
else
status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
{
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale, true);
status = retrieve_url (url_parsed, *t, &filename, &redirected_URL,
NULL, &dt, opt.recursive, i);
iri_free (i);
}
if (opt.delete_after && file_exists_p(filename))
{

View File

@ -235,6 +235,10 @@ struct options
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
bool enable_iri;
char *encoding_remote;
char *locale;
};
extern struct options opt;

View File

@ -51,7 +51,7 @@ as that of the covered work. */
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
/* Functions for maintaining the URL queue. */
struct queue_element {
@ -60,6 +60,7 @@ struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
@ -93,11 +94,12 @@ url_queue_delete (struct url_queue *queue)
into it. */
static void
url_enqueue (struct url_queue *queue,
url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
@ -112,6 +114,10 @@ url_enqueue (struct url_queue *queue,
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
if (i)
DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue,
succeeded, or false if the queue is empty. */
static bool
url_dequeue (struct url_queue *queue,
url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue,
if (!queue->head)
queue->tail = NULL;
*i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue,
}
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, struct url *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
@ -180,7 +187,7 @@ static bool descend_redirect_p (const char *, struct url *, int,
options, add it to the queue. */
uerr_t
retrieve_tree (struct url *start_url_parsed)
retrieve_tree (struct url *start_url_parsed, struct iri *pi)
{
uerr_t status = RETROK;
@ -191,12 +198,28 @@ retrieve_tree (struct url *start_url_parsed)
the queue, but haven't been downloaded yet. */
struct hash_table *blacklist;
int up_error_code;
struct iri *i = iri_new ();
#define COPYSTR(x) (x) ? xstrdup(x) : NULL;
/* Duplicate pi struct if not NULL */
if (pi)
{
i->uri_encoding = COPYSTR (pi->uri_encoding);
i->content_encoding = COPYSTR (pi->content_encoding);
i->utf8_encode = pi->utf8_encode;
}
else
set_uri_encoding (i, opt.locale, true);
#undef COPYSTR
queue = url_queue_new ();
blacklist = make_string_hash_table (0);
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
@ -215,7 +238,7 @@ retrieve_tree (struct url *start_url_parsed)
/* Get the next URL from the queue... */
if (!url_dequeue (queue,
if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
@ -255,20 +278,10 @@ retrieve_tree (struct url *start_url_parsed)
{
int dt = 0, url_err;
char *redirected = NULL;
struct url *url_parsed = url_parse (url, &url_err);
struct url *url_parsed = url_parse (url, &url_err, i, false);
if (!url_parsed)
{
char *error = url_error (url, url_err);
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
xfree (error);
status = URLERROR;
}
else
{
status = retrieve_url (url_parsed, url, &file, &redirected,
referer, &dt, false);
}
status = retrieve_url (url_parsed, url, &file, &redirected, referer,
&dt, false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
@ -296,7 +309,7 @@ retrieve_tree (struct url *start_url_parsed)
if (descend)
{
if (!descend_redirect_p (redirected, url_parsed, depth,
start_url_parsed, blacklist))
start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
@ -349,7 +362,7 @@ retrieve_tree (struct url *start_url_parsed)
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, &meta_disallow_follow);
get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
@ -360,7 +373,8 @@ retrieve_tree (struct url *start_url_parsed)
if (children)
{
struct urlpos *child = children;
struct url *url_parsed = url_parsed = url_parse (url, NULL);
struct url *url_parsed = url_parse (url, NULL, i, false);
struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
@ -377,9 +391,11 @@ retrieve_tree (struct url *start_url_parsed)
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
blacklist))
blacklist, i))
{
url_enqueue (queue, xstrdup (child->url->url),
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding, false);
url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
@ -424,6 +440,7 @@ retrieve_tree (struct url *start_url_parsed)
xfree (url);
xfree_null (referer);
xfree_null (file);
iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
@ -432,9 +449,11 @@ retrieve_tree (struct url *start_url_parsed)
char *d1, *d2;
int d3;
bool d4, d5;
while (url_dequeue (queue,
struct iri *d6;
while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
iri_free (d6);
xfree (d1);
xfree_null (d2);
}
@ -461,7 +480,8 @@ retrieve_tree (struct url *start_url_parsed)
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
@ -602,7 +622,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!specs)
{
char *rfile;
if (res_retrieve_file (url, &rfile))
if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
@ -657,7 +677,8 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
static bool
descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *new_parsed;
struct urlpos *upos;
@ -665,14 +686,14 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
new_parsed = url_parse (redirected, NULL, NULL, false);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist);
start_url_parsed, blacklist, iri);
url_free (new_parsed);
xfree (upos);

View File

@ -44,6 +44,6 @@ as that of the covered work. */
struct urlpos;
void recursive_cleanup (void);
uerr_t retrieve_tree (struct url *);
uerr_t retrieve_tree (struct url *, struct iri *);
#endif /* RECUR_H */

View File

@ -532,20 +532,26 @@ res_get_specs (const char *host, int port)
Return true if robots were retrieved OK, false otherwise. */
bool
res_retrieve_file (const char *url, char **file)
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider, url_err;
struct url * url_parsed;
/* Copy server URI encoding for a possible IDNA transformation, no need to
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
set_uri_encoding (i, iri->uri_encoding, false);
i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
url_parsed = url_parse (robots_url, &url_err);
url_parsed = url_parse (robots_url, &url_err, iri, true);
if (!url_parsed)
{
char *error = url_error (robots_url, url_err);
@ -556,13 +562,14 @@ res_retrieve_file (const char *url, char **file)
else
{
err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
false);
false, i);
url_free(url_parsed);
}
opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val;
xfree (robots_url);
iri_free (i);
if (err != RETROK && *file != NULL)
{

View File

@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int);
bool res_retrieve_file (const char *, char **);
bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *);

View File

@ -598,7 +598,8 @@ static char *getproxy (struct url *);
uerr_t
retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
char **newloc, const char *refurl, int *dt, bool recursive)
char **newloc, const char *refurl, int *dt, bool recursive,
struct iri *iri)
{
uerr_t result;
char *url;
@ -626,6 +627,11 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
if (file)
*file = NULL;
second_try:
DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
iri->utf8_encode));
if (!refurl)
refurl = opt.referer;
@ -639,8 +645,12 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
proxy = getproxy (u);
if (proxy)
{
struct iri *pi = iri_new ();
set_uri_encoding (pi, opt.locale, true);
pi->utf8_encode = false;
/* Parse the proxy URL. */
proxy_url = url_parse (proxy, &up_error_code);
proxy_url = url_parse (proxy, &up_error_code, NULL, true);
if (!proxy_url)
{
char *error = url_error (proxy, up_error_code);
@ -667,7 +677,7 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
@ -717,8 +727,14 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
xfree (mynewloc);
mynewloc = construced_newloc;
/* Reset UTF-8 encoding state, keep the URI encoding and reset
the content encoding. */
iri->utf8_encode = opt.enable_iri;
set_content_encoding (iri, NULL);
xfree_null (iri->orig_url);
/* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code);
newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
if (!newloc_parsed)
{
char *error = url_error (mynewloc, up_error_code);
@ -776,8 +792,21 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
goto redirected;
}
if (local_file)
/* Try to not encode in UTF-8 if fetching failed */
if (!(*dt & RETROKF) && iri->utf8_encode)
{
iri->utf8_encode = false;
DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
goto second_try;
}
if (local_file && *dt & RETROKF)
{
register_download (u->url, local_file);
if (redirection_count && 0 != strcmp (origurl, u->url))
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
@ -830,6 +859,7 @@ retrieve_from_file (const char *file, bool html, int *count)
{
uerr_t status;
struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
@ -837,11 +867,15 @@ retrieve_from_file (const char *file, bool html, int *count)
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_uri_encoding (iri, opt.locale, true);
set_content_encoding (iri, opt.locale);
if (url_has_scheme (url))
{
int dt,url_err;
uerr_t status;
struct url * url_parsed = url_parse(url, &url_err);
struct url * url_parsed = url_parse(url, &url_err, NULL, true);
if (!url_parsed)
{
@ -854,17 +888,22 @@ retrieve_from_file (const char *file, bool html, int *count)
if (!opt.base_href)
opt.base_href = xstrdup (url);
status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
false, iri);
if (status != RETROK)
return status;
if (dt & TEXTHTML)
html = true;
/* If we have a found a content encoding, use it */
if (iri->content_encoding)
set_uri_encoding (iri, iri->content_encoding, false);
}
else
input_file = (char *) file;
url_list = (html ? get_urls_html (input_file, NULL, NULL)
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@ -880,6 +919,12 @@ retrieve_from_file (const char *file, bool html, int *count)
status = QUOTEXC;
break;
}
/* Reset UTF-8 encode status */
iri->utf8_encode = opt.enable_iri;
xfree_null (iri->orig_url);
iri->orig_url = NULL;
if ((opt.recursive || opt.page_requisites)
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
{
@ -889,15 +934,13 @@ retrieve_from_file (const char *file, bool html, int *count)
if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
status = retrieve_tree (cur_url->url);
status = retrieve_tree (cur_url->url, iri);
opt.follow_ftp = old_follow_ftp;
}
else
{
status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
&new_file, NULL, &dt, opt.recursive);
}
&new_file, NULL, &dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
@ -916,6 +959,8 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
/* Free the linked list of URL-s. */
free_urlpos (url_list);
iri_free (iri);
return status;
}

View File

@ -53,7 +53,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
uerr_t retrieve_url (struct url *, const char *, char **, char **,
const char *, int *, bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);

View File

@ -649,7 +649,7 @@ static const char *parse_errors[] = {
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
url_parse (const char *url, int *error)
url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
struct url *u;
const char *p;
@ -668,7 +668,8 @@ url_parse (const char *url, int *error)
int port;
char *user = NULL, *passwd = NULL;
char *url_encoded = NULL;
const char *url_encoded = NULL;
char *new_url = NULL;
int error_code;
@ -679,9 +680,26 @@ url_parse (const char *url, int *error)
goto error;
}
url_encoded = reencode_escapes (url);
if (iri && iri->utf8_encode)
{
iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
if (!iri->utf8_encode)
new_url = NULL;
else
iri->orig_url = xstrdup (url);
}
/* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/
if (percent_encode)
url_encoded = reencode_escapes (new_url ? new_url : url);
else
url_encoded = new_url ? new_url : url;
p = url_encoded;
if (new_url && url_encoded != new_url)
xfree (new_url);
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
@ -851,6 +869,18 @@ url_parse (const char *url, int *error)
{
url_unescape (u->host);
host_modified = true;
/* Apply IDNA regardless of iri->utf8_encode status */
if (opt.enable_iri && iri)
{
char *new = idn_encode (iri, u->host);
if (new)
{
xfree (u->host);
u->host = new;
host_modified = true;
}
}
}
if (params_b)
@ -860,7 +890,7 @@ url_parse (const char *url, int *error)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
if (path_modified || u->fragment || host_modified || path_b == path_e)
if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild
@ -875,7 +905,7 @@ url_parse (const char *url, int *error)
if (url_encoded == url)
u->url = xstrdup (url);
else
u->url = url_encoded;
u->url = (char *) url_encoded;
}
return u;
@ -883,7 +913,7 @@ url_parse (const char *url, int *error)
error:
/* Cleanup in case of error: */
if (url_encoded && url_encoded != url)
xfree (url_encoded);
xfree ((char *) url_encoded);
/* Transmit the error code to the caller, if the caller wants to
know. */

View File

@ -85,7 +85,7 @@ struct url
char *url_escape (const char *);
char *url_escape_unsafe_and_reserved (const char *);
struct url *url_parse (const char *, int *);
struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode);
char *url_error (const char *, int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);

View File

@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
#include "quote.h"
#include "quotearg.h"
/* Likewise for struct iri definition */
#include "iri.h"
/* Useful macros used across the code: */
/* The number of elements in an array. For example:

View File

@ -1,3 +1,19 @@
2008-12-04 Micah Cowan <micah@cowan.name> (not copyrightable)
* run-px, Test-idn-robots.px: Added test for robots-file
downloads.
* Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
Fix test names.
2008-11-26 Micah Cowan <micah@cowan.name> (not copyrightable)
* Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px,
Test-idn-meta.px, Test-iri-disabled.px,
Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More
module-scope warnings.
2009-06-14 Micah Cowan <micah@cowan.name>
* Makefile.am (EXTRA_DIST): Include all the tests, run-px, and
@ -95,6 +111,51 @@
* run-px: Use strict (thanks Steven Schubiger!).
2008-09-09 Micah Cowan <micah@cowan.name>
* Test-idn-cmd.px: Added.
* run-px: Added Test-idn-cmd.px.
2008-08-28 Micah Cowan <micah@cowan.name>
* HTTPServer.pm (run): Allow distinguishing between hostnames,
when used as a proxy.
* Test-idn-headers.px, Test-idn-meta.px: Added.
* run-px: Added Test-idn-headers.px, Test-idn-meta.px.
* Test-proxy-auth-basic.px: Use the full URL, rather than just the
path (made necessary by the accompanying change to HTTPServer.pm).
2008-08-14 Xavier Saint <wget@sxav.eu>
* Test-iri-list.px : Fetch files from a remote list.
2008-08-03 Xavier Saint <wget@sxav.eu>
* Test-iri.px : HTTP recursive fetch for testing IRI support and
fallback.
* Test-iri-disabled.px : Same file structure as Test-iri.px but with
IRI support disabled
* Test-iri-forced-remote.px : There's a difference between ISO-8859-1
and ISO-8859-15 for character 0xA4 (respectively currency sign and
euro sign). So with a forced ISO-8859-1 remote encoding, wget should
see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead
of using the ISO-8859-15 given by the server.
* Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale
and expect wget to fetch the file UTF-8 encoded.
* Test-ftp-iri-fallback.px : Same as above but wget should fall back on
the locale encoding to fetch the file.
* Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support
disabled. The UTF-8 encoded file should not be retrieved.
2008-06-22 Micah Cowan <micah@cowan.name>
* Test-proxied-https-auth.px: Shift exit code so it falls in the

View File

@ -26,7 +26,8 @@ sub run {
my $con = $self->accept();
print STDERR "Accepted a new connection\n" if $log;
while (my $req = $con->get_request) {
my $url_path = $req->url->path;
#my $url_path = $req->url->path;
my $url_path = $req->url->as_string;
if ($url_path =~ m{/$}) { # append 'index.html'
$url_path .= 'index.html';
}

51
tests/Test-ftp-iri-disabled.px Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/perl

use strict;
use warnings;

use FTPTest;

# Test-ftp-iri-disabled: FTP retrieval with IRI support switched off.
#
# The FTP server offers the same file under two names: one whose c-cedilla
# is the single ISO-8859-1 byte and one whose c-cedilla is the UTF-8 byte
# pair.  wget is run with --iri=no and --locale=iso-8859-1 and is asked for
# the ISO-8859-1 name; only that name is expected on disk afterwards, i.e.
# the UTF-8 encoded file must not be retrieved.

###############################################################################

# "c with cedilla" as an ISO-8859-1 byte and as a UTF-8 byte sequence.
my $ccedilla_l1 = "\xE7";
my $ccedilla_u8 = "\xC3\xA7";

my $francais = <<EOF;
Some text.
EOF

# Give the single-line fixture a CRLF line ending.
$francais =~ s/\n/\r\n/;

# Files offered by the test FTP server: path => { content }.
my %urls = (
    "/fran${ccedilla_u8}ais.txt" => {
        content => $francais,
    },
    "/fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";

my $expected_error_code = 0;

# Only the ISO-8859-1 named file may end up in the download directory.
my %expected_downloaded_files = (
    "fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

###############################################################################

# The test is registered under this script's own name; it previously reused
# "Test-ftp-iri", the name of a sibling test.
my $the_test = FTPTest->new (name => "Test-ftp-iri-disabled",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

47
tests/Test-ftp-iri-fallback.px Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/perl

use strict;
use warnings;

use FTPTest;

# Test-ftp-iri-fallback: FTP retrieval that has to fall back on the locale
# encoding.
#
# The FTP server only offers the file under its ISO-8859-1 name (no UTF-8
# encoded variant exists).  wget is run with --locale=iso-8859-1 and is
# expected to end up with the ISO-8859-1 named file on disk.

###############################################################################

# "c with cedilla" as an ISO-8859-1 byte.
my $ccedilla_l1 = "\xE7";

my $francais = <<EOF;
Some text.
EOF

# Give the single-line fixture a CRLF line ending.
$francais =~ s/\n/\r\n/;

# Files offered by the test FTP server: path => { content }.
my %urls = (
    "/fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";

my $expected_error_code = 0;

my %expected_downloaded_files = (
    "fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

###############################################################################

# The test is registered under this script's own name; it previously reused
# "Test-ftp-iri", the name of a sibling test.
my $the_test = FTPTest->new (name => "Test-ftp-iri-fallback",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

48
tests/Test-ftp-iri.px Executable file
View File

@ -0,0 +1,48 @@
#!/usr/bin/perl

use strict;
use warnings;

use FTPTest;

# Test-ftp-iri: FTP retrieval with a non-ASCII file name.
#
# wget is given the file name in ISO-8859-1 (matching --locale=iso-8859-1)
# while the FTP server only knows the UTF-8 spelling of that name.  The
# download is expected to land on disk under the UTF-8 name.

###############################################################################

# "c with cedilla" in both encodings.
my $cedilla_latin1 = "\xE7";
my $cedilla_utf8   = "\xC3\xA7";

# Name as typed on the command line vs. name as known to the server.
my $requested_name = "fran${cedilla_latin1}ais.txt";
my $served_name    = "fran${cedilla_utf8}ais.txt";

# File body, with its line ending turned into CRLF.
my $payload = "Some text.\n";
$payload =~ s/\n/\r\n/;

# Files offered by the test FTP server: path => { content }.
my %server_files = (
    "/$served_name" => { content => $payload },
);

# Files that must exist in the download directory afterwards.
my %wanted_files = (
    $served_name => { content => $payload },
);

my $wget_command = join q{ },
    $WgetTest::WGETPATH,
    '--locale=iso-8859-1',
    '-S',
    "ftp://localhost:{{port}}/$requested_name";

###############################################################################

my $test = FTPTest->new(
    name    => "Test-ftp-iri",
    input   => \%server_files,
    cmdline => $wget_command,
    errcode => 0,
    output  => \%wanted_files,
);

exit $test->run();

# vim: et ts=4 sw=4

51
tests/Test-idn-cmd.px Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/perl

use strict;
use warnings;

use HTTPTest;

# Test-idn-cmd: an internationalized host name given on the command line.
#
# The host is passed to wget as raw EUC-JP bytes together with
# --locale=EUC-JP.  The local test server is configured as wget's HTTP proxy
# and only knows the punycoded form of the host, which is also the directory
# name expected on disk.

# " Kon'nichiwa <dot> Japan
my $host_eucjp    = "\272\243\306\374\244\317.\306\374\313\334";
my $host_punycode = 'xn--v9ju72g90p.xn--wgv71a';

###############################################################################

my $found_body = "Found me!\n";

# Pages known to the proxy: full URL => { code, msg, headers, content }.
my %proxy_pages = (
    "http://$host_punycode/index.html" => {
        code    => "200",
        msg     => "Yes, please",
        headers => { 'Content-Type' => 'text/plain' },
        content => $found_body,
    },
);

my $wget_command = join q{ },
    $WgetTest::WGETPATH,
    '--debug',
    '--iri',
    '-rH',
    '-e',
    'http_proxy=localhost:{{port}}',
    '--locale=EUC-JP',
    $host_eucjp;

# Files that must exist in the download directory afterwards.
my %wanted_files = (
    "$host_punycode/index.html" => { content => $found_body },
);

###############################################################################

my $test = HTTPTest->new(
    name    => "Test-idn-cmd",
    input   => \%proxy_pages,
    cmdline => $wget_command,
    errcode => 0,
    output  => \%wanted_files,
);

exit $test->run();

# vim: et ts=4 sw=4

66
tests/Test-idn-headers.px Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/perl

use strict;
use warnings;

use HTTPTest;

# Test-idn-headers: an internationalized host name found in a fetched page
# whose charset is announced in the HTTP Content-Type header.
#
# The start page links to a host written in raw EUC-JP bytes and is served
# with "charset=EUC-JP".  The local test server is configured as wget's HTTP
# proxy and only knows the punycoded form of that host, which is also the
# directory name expected on disk.

# " Kon'nichiwa <dot> Japan
my $host_eucjp    = "\272\243\306\374\244\317.\306\374\313\334";
my $host_punycode = 'xn--v9ju72g90p.xn--wgv71a';

###############################################################################

my $start_page = <<END_START;
<a href="http://$host_eucjp/">The link</a>
END_START

my $found_body = <<END_FOUND;
Found me!
END_FOUND

# Build one "200" response record in the shape the harness expects:
# { code, msg, headers, content }.
sub ok_reply {
    my ($reason, $content_type, $body) = @_;
    return {
        code    => "200",
        msg     => $reason,
        headers => { 'Content-Type' => $content_type },
        content => $body,
    };
}

# Pages known to the proxy, keyed by full URL.
my %proxy_pages = (
    'http://start-here.com/start.html' =>
        ok_reply("You want fries with that?", 'text/html; charset=EUC-JP',
                 $start_page),
    "http://$host_punycode/index.html" =>
        ok_reply("Yes, please", 'text/plain', $found_body),
);

my $wget_command = join q{ },
    $WgetTest::WGETPATH,
    '--debug',
    '--iri',
    '-rH',
    '-e',
    'http_proxy=localhost:{{port}}',
    'http://start-here.com/start.html';

# Files that must exist in the download directory afterwards.
my %wanted_files = (
    'start-here.com/start.html'  => { content => $start_page },
    "$host_punycode/index.html"  => { content => $found_body },
);

###############################################################################

my $test = HTTPTest->new(
    name    => "Test-idn-headers",
    input   => \%proxy_pages,
    cmdline => $wget_command,
    errcode => 0,
    output  => \%wanted_files,
);

exit $test->run();

# vim: et ts=4 sw=4

67
tests/Test-idn-meta.px Executable file
View File

@ -0,0 +1,67 @@
#!/usr/bin/perl

use strict;
use warnings;

use HTTPTest;

# Test-idn-meta: an internationalized host name found in a fetched page
# whose charset is announced by an HTML <meta> tag.
#
# The start page is served with "charset=UTF-8" in the HTTP header, but its
# own <meta http-equiv> tag says EUC-JP, and the link it contains is written
# in raw EUC-JP bytes.  The local test server is configured as wget's HTTP
# proxy and only knows the punycoded form of the linked host, which is also
# the directory name expected on disk.

# " Kon'nichiwa <dot> Japan
my $host_eucjp    = "\272\243\306\374\244\317.\306\374\313\334";
my $host_punycode = 'xn--v9ju72g90p.xn--wgv71a';

###############################################################################

my $start_page = <<END_START;
<meta http-equiv="Content-Type" content="text/html; charset=EUC-JP" />
<a href="http://$host_eucjp/">The link</a>
END_START

my $found_body = <<END_FOUND;
Found me!
END_FOUND

# Build one "200" response record in the shape the harness expects:
# { code, msg, headers, content }.
sub ok_reply {
    my ($reason, $content_type, $body) = @_;
    return {
        code    => "200",
        msg     => $reason,
        headers => { 'Content-Type' => $content_type },
        content => $body,
    };
}

# Pages known to the proxy, keyed by full URL.
my %proxy_pages = (
    'http://start-here.com/start.html' =>
        ok_reply("You want fries with that?", 'text/html; charset=UTF-8',
                 $start_page),
    "http://$host_punycode/index.html" =>
        ok_reply("Yes, please", 'text/plain', $found_body),
);

my $wget_command = join q{ },
    $WgetTest::WGETPATH,
    '--debug',
    '--iri',
    '-rH',
    '-e',
    'http_proxy=localhost:{{port}}',
    'http://start-here.com/start.html';

# Files that must exist in the download directory afterwards.
my %wanted_files = (
    'start-here.com/start.html'  => { content => $start_page },
    "$host_punycode/index.html"  => { content => $found_body },
);

###############################################################################

my $test = HTTPTest->new(
    name    => "Test-idn-meta",
    input   => \%proxy_pages,
    cmdline => $wget_command,
    errcode => 0,
    output  => \%wanted_files,
);

exit $test->run();

# vim: et ts=4 sw=4

78
tests/Test-idn-robots.px Executable file
View File

@ -0,0 +1,78 @@
#!/usr/bin/perl

use strict;
use warnings;

use HTTPTest;

# Test-idn-robots: robots.txt download for an internationalized host name.
#
# wget starts from a URL whose host is written in raw EUC-JP bytes
# (--locale=EUC-JP).  The index page, served as EUC-JP, links to foo.txt on
# the same host.  The local test server is configured as wget's HTTP proxy
# and only knows the punycoded form of the host; index.html, foo.txt and an
# empty robots.txt are all expected on disk under that punycoded name.

# " Kon'nichiwa <dot> Japan
my $host_eucjp    = "\272\243\306\374\244\317.\306\374\313\334";
my $host_punycode = 'xn--v9ju72g90p.xn--wgv71a';

###############################################################################

my $index_page = <<END_INDEX;
<a href="http://$host_eucjp/foo.txt">The link</a>
END_INDEX

my $found_body = <<END_FOUND;
Found me!
END_FOUND

# Build one "200" response record in the shape the harness expects:
# { code, msg, headers, content }.
sub ok_reply {
    my ($reason, $content_type, $body) = @_;
    return {
        code    => "200",
        msg     => $reason,
        headers => { 'Content-Type' => $content_type },
        content => $body,
    };
}

# Pages known to the proxy, keyed by full URL.
my %proxy_pages = (
    "http://$host_punycode/index.html" =>
        ok_reply("Yes, please", 'text/html; charset=EUC-JP', $index_page),
    "http://$host_punycode/foo.txt" =>
        ok_reply("Uh-huh", 'text/plain', $found_body),
    "http://$host_punycode/robots.txt" =>
        ok_reply("Uh-huh", 'text/plain', ''),
);

my $wget_command = join q{ },
    $WgetTest::WGETPATH,
    '--debug',
    '--iri',
    '-rH',
    '-e',
    'http_proxy=localhost:{{port}}',
    '--locale=EUC-JP',
    "http://$host_eucjp/";

# Files that must exist in the download directory afterwards.
my %wanted_files = (
    "$host_punycode/index.html" => { content => $index_page },
    "$host_punycode/foo.txt"    => { content => $found_body },
    "$host_punycode/robots.txt" => { content => '' },
);

###############################################################################

my $test = HTTPTest->new(
    name    => "Test-idn-robots",
    input   => \%proxy_pages,
    cmdline => $wget_command,
    errcode => 0,
    output  => \%wanted_files,
);

exit $test->run();

# vim: et ts=4 sw=4

197
tests/Test-iri-disabled.px Executable file
View File

@ -0,0 +1,197 @@
#!/usr/bin/perl
use strict;
use warnings;
use HTTPTest;
# Test-iri-disabled: recursive HTTP fetch (-r -nH) with IRI support switched
# off via --iri=no.  The server knows every non-ASCII page under two paths,
# one percent-encoding the locally-encoded (Latin-1/Latin-9) bytes and one
# percent-encoding the UTF-8 bytes; the expected download set contains only
# the locally-encoded file names.
# cf. http://en.wikipedia.org/wiki/Latin1
# http://en.wikipedia.org/wiki/ISO-8859-15
###############################################################################
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html mime + file = iso-8859-15
# p1_français.html meta + file = iso-8859-1, mime = utf-8
# p2_één.html mime + file = iso-8859-1
# p3_€€€.html meta + file = utf-8, mime = iso-8859-1
# NOTE(review): the p3 line above does not match this script -- $pageeuro has
# no meta tag and both p3 entries below are served as plain "text/plain".
# Presumably the table was copied from the sibling IRI test; confirm.
#
# Byte sequences for the non-ASCII characters used in the file names:
# "_l1"/"_l15" are the single-byte ISO-8859-1/-15 forms, "_u8" the UTF-8 forms.
# NOTE(review): the three "_u8" variables are not referenced anywhere else in
# this script.
my $ccedilla_l15 = "\xE7";
my $ccedilla_u8 = "\xC3\xA7";
my $eacute_l1 = "\xE9";
my $eacute_u8 = "\xC3\xA9";
my $eurosign_l15 = "\xA4";
my $eurosign_u8 = "\xE2\x82\xAC";
# Start page: links to page 1 and page 3 using the single-byte encoded names.
my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF
# Page 1: declares ISO-8859-1 in a meta tag and links to page 2.
my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF
# Page 2: leaf page, no further links.
my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
</p>
</body>
</html>
EOF
# Page 3: leaf page, no further links.
my $pageeuro = <<EOF;
<html>
<head>
<title>Euro page</title>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF
# NOTE(review): $page404 is defined but never used in this script.
my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF
# Pages known to the test HTTP server: path => { code, msg, headers, content }.
# code, msg, headers, content
my %urls = (
'/index.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-15",
},
content => $pageindex,
},
'/robots.txt' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => "",
},
# NOTE(review): status 200 is paired with the reason phrase "File not found"
# here; one of the two looks unintended -- confirm which.
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
code => "200",
msg => "File not found",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pagefrancais,
},
'/p1_fran%E7ais.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pagefrancais,
},
'/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pageeen,
},
'/p2_%E9%E9n.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-1",
},
content => $pageeen,
},
'/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => $pageeuro,
},
'/p3_%A4%A4%A4.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => $pageeuro,
},
);
# Recursive fetch from the server root, no host directory, IRI disabled.
my $cmdline = $WgetTest::WGETPATH . " --iri=no -nH -r http://localhost:{{port}}/";
my $expected_error_code = 0;
# Files that must exist afterwards; the non-ASCII names use the single-byte
# (Latin-1/Latin-9) encodings, never the UTF-8 ones.
my %expected_downloaded_files = (
'index.html' => {
content => $pageindex,
},
'robots.txt' => {
content => "",
},
"p1_fran${ccedilla_l15}ais.html" => {
content => $pagefrancais,
},
"p2_${eacute_l1}${eacute_l1}n.html" => {
content => $pageeen,
},
"p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => {
content => $pageeuro,
},
);
###############################################################################
# Hand the fixture to the HTTP test harness and exit with its result.
my $the_test = HTTPTest->new (name => "Test-iri-disabled",
input => \%urls,
cmdline => $cmdline,
errcode => $expected_error_code,
output => \%expected_downloaded_files);
exit $the_test->run();
# vim: et ts=4 sw=4

208
tests/Test-iri-forced-remote.px Executable file
View File

@ -0,0 +1,208 @@
#!/usr/bin/perl
use strict;
use warnings;
use HTTPTest;
# cf. http://en.wikipedia.org/wiki/Latin1
# http://en.wikipedia.org/wiki/ISO-8859-15
###############################################################################
# Force remote encoding to ISO-8859-1 (wget is run with
# --remote-encoding=iso-8859-1, see $cmdline).
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html mime + file = iso-8859-15
# p1_français.html meta + file = iso-8859-1, mime = utf-8
# p2_één.html no meta; mime = iso-8859-1 for the Latin-1 URL, utf-8 for the UTF-8 URL
# p3_€€€.html no meta; mime = text/plain without any charset
#
# Raw byte sequences for the non-ASCII characters used in URLs and file names.
# Suffixes: _l1 = ISO-8859-1, _l15 = ISO-8859-15, _u8 = UTF-8.
my $ccedilla_l15 = "\xE7"; # c with cedilla, single byte
my $ccedilla_u8 = "\xC3\xA7"; # c with cedilla (U+00E7) in UTF-8
my $eacute_l1 = "\xE9"; # e with acute, single byte
my $eacute_u8 = "\xC3\xA9"; # e with acute (U+00E9) in UTF-8
my $eurosign_l15 = "\xA4"; # byte 0xA4 read as ISO-8859-15: euro sign
my $eurosign_u8 = "\xE2\x82\xAC"; # euro sign (U+20AC) in UTF-8
my $currency_l1 = "\xA4"; # the same byte 0xA4 read as ISO-8859-1: currency sign
my $currency_u8 = "\xC2\xA4"; # currency sign (U+00A4) in UTF-8
# Start page: links to page 1 and page 3, both spelled with single-byte
# (_l15) characters in the URL.
my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF
# Page 1: announces ISO-8859-1 in a meta tag and links to page 2 with
# single-byte (_l1) characters in the URL.
my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF
# Page 2: leaf page, no meta charset and no outgoing links.
my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
</p>
</body>
</html>
EOF
# Page 3: leaf page, no meta charset and no outgoing links.
my $pageeuro = <<EOF;
<html>
<head>
<title>Euro page</title>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF
# Body sent along with the 404 response in %urls.
my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF
# Table of resources served by the test HTTP server, keyed by request path
# (percent-encoded).  Each entry gives: code, msg, headers, content.
# Pages 1-3 appear under several percent-encoded spellings of the same name.
my %urls = (
'/index.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-15",
},
content => $pageindex,
},
# Empty robots.txt.
'/robots.txt' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => "",
},
# The UTF-8 spelling of page 1 answers 404; only the Latin-1 spelling
# below serves the page.
# NOTE(review): presumably this exercises wget's fallback to the original
# spelling after a failed UTF-8 request -- confirm against wget's IRI code.
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
code => "404",
msg => "File not found",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $page404,
},
'/p1_fran%E7ais.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pagefrancais,
},
# Page 2 is served under both spellings, each with a matching charset header.
'/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pageeen,
},
'/p2_%E9%E9n.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-1",
},
content => $pageeen,
},
# Page 3 is served under three spellings of byte 0xA4: UTF-8 euro sign,
# raw single byte, and UTF-8 currency sign.  None declares a charset.
'/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => $pageeuro,
},
'/p3_%A4%A4%A4.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => $pageeuro,
},
# %C2%A4 is U+00A4 in UTF-8, i.e. byte 0xA4 interpreted as ISO-8859-1;
# this is the spelling the expected file name (built from $currency_u8) uses.
'/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => $pageeuro,
},
);
# Command under test: recursive crawl (-r, -nH) with IRI support on and the
# remote encoding forced to ISO-8859-1.
my $wget_options =
    " --iri --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
my $cmdline = $WgetTest::WGETPATH . $wget_options;
my $expected_error_code = 0;

# Expected download set, as file name => page body.
my %body_of = (
    'index.html'                                         => $pageindex,
    'robots.txt'                                         => "",
    "p1_fran${ccedilla_l15}ais.html"                     => $pagefrancais,
    "p2_${eacute_u8}${eacute_u8}n.html"                  => $pageeen,
    "p3_${currency_u8}${currency_u8}${currency_u8}.html" => $pageeuro,
);
my %expected_downloaded_files =
    map { ( $_ => { content => $body_of{$_} } ) } keys %body_of;

###############################################################################
# Hand everything to the HTTPTest harness and exit with its result.
my $the_test = HTTPTest->new(
    name    => "Test-iri-forced-remote",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
exit $the_test->run();
# vim: et ts=4 sw=4

174
tests/Test-iri-list.px Executable file
View File

@ -0,0 +1,174 @@
#!/usr/bin/perl
use strict;
use warnings;
use HTTPTest;
# cf. http://en.wikipedia.org/wiki/Latin1
# http://en.wikipedia.org/wiki/ISO-8859-15
###############################################################################
# IRI handling for URLs read from a list (-i) that is itself fetched over HTTP.
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# url_list.txt mime = iso-8859-1, URLs spelled with single-byte (_l1) characters
# index.html mime + file = iso-8859-15
# p1_français.html meta + file = iso-8859-1, mime = utf-8
# p2_één.html meta + file = utf-8, mime = iso-8859-1
#
# Raw byte sequences for the non-ASCII characters used in URLs and file names.
# Suffixes: _l1 = ISO-8859-1, _u8 = UTF-8.
my $ccedilla_l1 = "\xE7"; # c with cedilla, single byte
my $ccedilla_u8 = "\xC3\xA7"; # c with cedilla (U+00E7) in UTF-8
my $eacute_l1 = "\xE9"; # e with acute, single byte
my $eacute_u8 = "\xC3\xA9"; # e with acute (U+00E9) in UTF-8
# The URL list passed to wget via -i: the site root plus pages 1 and 2,
# the latter two spelled with single-byte (_l1) characters.
my $urllist = <<EOF;
http://localhost:{{port}}/
http://localhost:{{port}}/p1_fran${ccedilla_l1}ais.html
http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html
EOF
# Start page; contains no links.
my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Main page.
</p>
</body>
</html>
EOF
# Page 1: announces ISO-8859-1 in a meta tag; no links.
my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
French page.
</p>
</body>
</html>
EOF
# Page 2: announces UTF-8 in a meta tag; no links.
my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
Dutch page.
</p>
</body>
</html>
EOF
# Body sent along with the 404 response in %urls.
my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF
# Table of resources served by the test HTTP server, keyed by request path
# (percent-encoded).  Each entry gives: code, msg, headers, content.
my %urls = (
'/index.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-15",
},
content => $pageindex,
},
# Empty robots.txt.
'/robots.txt' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => "",
},
# The UTF-8 spelling of page 1 answers 404; only the Latin-1 spelling
# below serves the page.
# NOTE(review): presumably this exercises wget's fallback to the original
# spelling after a failed UTF-8 request -- confirm against wget's IRI code.
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
code => "404",
msg => "File not found",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $page404,
},
'/p1_fran%E7ais.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pagefrancais,
},
# Page 2 is served under both spellings, with the same ISO-8859-1 header.
'/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-1",
},
content => $pageeen,
},
'/p2_%E9%E9n.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-1",
},
content => $pageeen,
},
# The URL list itself, declared as ISO-8859-1 in the MIME header.
'/url_list.txt' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain; charset=ISO-8859-1",
},
content => $urllist,
},
);
# Command under test: IRI support on, debug output (-d), URLs taken from a
# list that wget first downloads from the test server (-i URL).
my $wget_options = " --iri -d -i http://localhost:{{port}}/url_list.txt";
my $cmdline = $WgetTest::WGETPATH . $wget_options;
my $expected_error_code = 0;

# Expected download set, as file name => page body.
my %body_of = (
    'url_list.txt'                      => $urllist,
    'index.html'                        => $pageindex,
    "p1_fran${ccedilla_l1}ais.html"     => $pagefrancais,
    "p2_${eacute_u8}${eacute_u8}n.html" => $pageeen,
);
my %expected_downloaded_files =
    map { ( $_ => { content => $body_of{$_} } ) } keys %body_of;

###############################################################################
# Hand everything to the HTTPTest harness and exit with its result.
my $the_test = HTTPTest->new(
    name    => "Test-iri-list",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
exit $the_test->run();
# vim: et ts=4 sw=4

225
tests/Test-iri.px Executable file
View File

@ -0,0 +1,225 @@
#!/usr/bin/perl
use strict;
use warnings;
use HTTPTest;
# cf. http://en.wikipedia.org/wiki/Latin1
# http://en.wikipedia.org/wiki/ISO-8859-15
###############################################################################
# Recursive crawl with IRI support enabled (see $cmdline).
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html mime + file = iso-8859-15
# p1_français.html meta + file = iso-8859-1, mime = utf-8
# p2_één.html meta + file = utf-8, mime = iso-8859-1
# p3_€€€.html meta + file = utf-8, mime = iso-8859-1
# p4_méér.html mime + file = utf-8
#
# Raw byte sequences for the non-ASCII characters used in URLs and file names.
# Suffixes: _l1 = ISO-8859-1, _l15 = ISO-8859-15, _u8 = UTF-8.
my $ccedilla_l15 = "\xE7"; # c with cedilla, single byte
my $ccedilla_u8 = "\xC3\xA7"; # c with cedilla (U+00E7) in UTF-8
my $eacute_l1 = "\xE9"; # e with acute, single byte
my $eacute_u8 = "\xC3\xA9"; # e with acute (U+00E9) in UTF-8
my $eurosign_l15 = "\xA4"; # byte 0xA4 read as ISO-8859-15: euro sign
my $eurosign_u8 = "\xE2\x82\xAC"; # euro sign (U+20AC) in UTF-8
# Start page: links to page 1 and page 3, both spelled with single-byte
# (_l15) characters in the URL.
my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF
# Page 1: announces ISO-8859-1 in a meta tag and links to page 2 with
# single-byte (_l1) characters in the URL.
my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF
# Page 2: announces UTF-8 in a meta tag and links to page 4 with the URL
# spelled in UTF-8 bytes.  The link text uses two well-formed character
# references (each "&eacute;" terminated by its semicolon).
my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
<a href="http://localhost:{{port}}/p4_m${eacute_u8}${eacute_u8}r.html">M&eacute;&eacute;r</a>
</p>
</body>
</html>
EOF
# Page 3: announces UTF-8 in a meta tag; no links.
my $pageeuro = <<EOF;
<html>
<head>
<title>Euro page</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF
# Page 4: leaf page, no meta charset and no links.
my $pagemeer = <<EOF;
<html>
<head>
<title>Bekende supermarkt</title>
</head>
<body>
<p>
Ik ben toch niet gek !
</p>
</body>
</html>
EOF
# Body sent along with the 404 response in %urls.
my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF
# Table of resources served by the test HTTP server, keyed by request path
# (percent-encoded).  Each entry gives: code, msg, headers, content.
my %urls = (
'/index.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-15",
},
content => $pageindex,
},
# Empty robots.txt.
'/robots.txt' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain",
},
content => "",
},
# The UTF-8 spelling of page 1 answers 404; only the Latin-1 spelling
# below serves the page.
# NOTE(review): presumably this exercises wget's fallback to the original
# spelling after a failed UTF-8 request -- confirm against wget's IRI code.
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
code => "404",
msg => "File not found",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $page404,
},
'/p1_fran%E7ais.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=UTF-8",
},
content => $pagefrancais,
},
# Page 2 is served under both spellings, with the same ISO-8859-1 header.
'/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-1",
},
content => $pageeen,
},
'/p2_%E9%E9n.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/html; charset=ISO-8859-1",
},
content => $pageeen,
},
# Page 3 is served under the UTF-8 euro-sign spelling and the raw
# single-byte spelling, both with an ISO-8859-1 header.
'/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain; charset=ISO-8859-1",
},
content => $pageeuro,
},
'/p3_%A4%A4%A4.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain; charset=ISO-8859-1",
},
content => $pageeuro,
},
# Page 4 exists only under its UTF-8 spelling.
'/p4_m%C3%A9%C3%A9r.html' => {
code => "200",
msg => "Ok",
headers => {
"Content-type" => "text/plain; charset=UTF-8",
},
content => $pagemeer,
},
);
# Command under test: recursive crawl (-r, -nH) with IRI support on and
# --restrict-file-names=nocontrol.
my $wget_options =
    " --iri --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/";
my $cmdline = $WgetTest::WGETPATH . $wget_options;
my $expected_error_code = 0;

# Expected download set, as file name => page body.
my %body_of = (
    'index.html'                                         => $pageindex,
    'robots.txt'                                         => "",
    "p1_fran${ccedilla_l15}ais.html"                     => $pagefrancais,
    "p2_${eacute_u8}${eacute_u8}n.html"                  => $pageeen,
    "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => $pageeuro,
    "p4_m${eacute_u8}${eacute_u8}r.html"                 => $pagemeer,
);
my %expected_downloaded_files =
    map { ( $_ => { content => $body_of{$_} } ) } keys %body_of;

###############################################################################
# Hand everything to the HTTPTest harness and exit with its result.
my $the_test = HTTPTest->new(
    name    => "Test-iri",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
exit $the_test->run();
# vim: et ts=4 sw=4

View File

@ -12,7 +12,7 @@ my $wholefile = "You're all authenticated.\n";
# code, msg, headers, content
my %urls = (
'/needs-auth.txt' => {
'http://no.such.domain/needs-auth.txt' => {
auth_method => 'Basic',
user => 'fiddle-dee-dee',
passwd => 'Dodgson',

View File

@ -25,9 +25,20 @@ my @tests = (
'Test-E-k-K.px',
'Test-E-k.px',
'Test-ftp.px',
'Test-ftp-iri.px',
'Test-ftp-iri-fallback.px',
'Test-ftp-iri-disabled.px',
'Test-HTTP-Content-Disposition-1.px',
'Test-HTTP-Content-Disposition-2.px',
'Test-HTTP-Content-Disposition.px',
'Test-idn-headers.px',
'Test-idn-meta.px',
'Test-idn-cmd.px',
'Test-idn-robots.px',
'Test-iri.px',
'Test-iri-disabled.px',
'Test-iri-forced-remote.px',
'Test-iri-list.px',
'Test-N-current.px',
'Test-N-smaller.px',
'Test-N-no-info.px',