Automated merge.

This commit is contained in:
Micah Cowan 2008-08-28 01:21:59 -07:00
commit 090f1596ae
40 changed files with 1936 additions and 104 deletions

View File

@ -9,6 +9,14 @@
* AUTHORS: Added Steven Schubiger.
2008-06-26 Xavier Saint <wget@sxav.eu>
* configure.ac: IRI support requires libiconv; check for it.
2008-06-14 Xavier Saint <wget@sxav.eu>
* configure.ac: Add support for IRIs
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).

View File

@ -460,6 +460,77 @@ else
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
dnl
dnl Check for IDN/IRIs
dnl
AC_ARG_ENABLE(iri,
AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
[case "${enable_iri}" in
no)
dnl Disable IRIs checking
AC_MSG_NOTICE([disabling IRIs at user request])
iri=no
;;
yes)
dnl IRIs explicitly enabled
iri=yes
force_iri=yes
;;
auto)
dnl Auto-detect IRI
iri=yes
;;
*)
AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
;;
esac
], [
dnl If nothing is specified, assume auto-detection
iri=yes
]
)
AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
[Support IDN/IRIs (needs GNU Libidn)]),
libidn=$withval, libidn="")
if test "X$iri" != "Xno"; then
AM_ICONV
if test "X$am_cv_func_iconv" != "Xyes"; then
iri=no
if test "X$force_iri" = "Xyes"; then
AC_MSG_ERROR([Libiconv is required for IRIs support])
else
AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
fi
fi
fi
if test "X$iri" != "Xno"; then
if test "$libidn" != ""; then
LDFLAGS="${LDFLAGS} -L$libidn/lib"
CPPFLAGS="${CPPFLAGS} -I$libidn/include"
fi
AC_CHECK_HEADER(idna.h,
AC_CHECK_LIB(idn, stringprep_check_version,
[iri=yes LIBS="${LIBS} -lidn"], iri=no),
iri=no)
if test "X$iri" != "Xno" ; then
AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
AC_MSG_NOTICE([Enabling support for IRI.])
else
AC_MSG_WARN([Libidn not found])
fi
fi
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
dnl
dnl Create output
dnl

View File

@ -1,3 +1,12 @@
2008-08-03 Xavier Saint <wget@sxav.eu>
* wget.texi : Add option descriptions for the three new
options --iri, --locale and --remote-encoding related to
IRI support.
* sample.wgetrc : Add commented lines for the three new
commands iri, locale and encoding related to IRI support.
2008-08-03 Micah Cowan <micah@cowan.name>
* wget.texi: Don't set UPDATED; already set by version.texi.

View File

@ -113,3 +113,12 @@ waitretry = 10
# To try ipv6 addresses first:
#prefer-family = IPv6
# Set default IRI support state
#iri = off
# Force the default system encoding
#locale = UTF-8
# Force the default remote server encoding
#remoteencoding = UTF-8

View File

@ -674,6 +674,30 @@ Another instance where you'll get a garbled file if you try to use
Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
servers that support the @code{Range} header.
@cindex iri support
@cindex idn support
@item --iri
Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
turn it off. IRI support is activated by default.
You can set the default state of IRI support using the @code{iri} command in
@file{.wgetrc}. That setting may be overridden from the command line.
@cindex local encoding
@cindex locale
@item --locale=@var{encoding}
Force Wget to use @var{encoding} as the default system encoding. That affects
how Wget converts URLs specified as arguments from locale to @sc{utf-8} for
IRI support.
Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
environment variable to get the locale. If it fails, @sc{ascii} is used.
You can set the default locale using the @code{locale} command in
@file{.wgetrc}. That setting may be overridden from the command line.
@cindex progress indicator
@cindex dot style
@item --progress=@var{type}
@ -705,6 +729,21 @@ command line. The exception is that, when the output is not a TTY, the
``dot'' progress will be favored over ``bar''. To force the bar output,
use @samp{--progress=bar:force}.
@cindex remote encoding
@item --remote-encoding=@var{encoding}
Force Wget to use @var{encoding} as the default remote server encoding. That
affects how Wget converts URIs found in files from remote encoding to
@sc{utf-8} during a recursive fetch. This option is only useful for
IRI support, for the interpretation of non-@sc{ascii} characters.
For HTTP, remote encoding can be found in HTTP @code{Content-Type}
header and in HTML @code{Content-Type http-equiv} meta tag.
You can set the default encoding using the @code{remoteencoding}
command in @file{.wgetrc}. That setting may be overridden from the
command line.
@item -N
@itemx --timestamping
Turn on time-stamping. @xref{Time-Stamping}, for details.

View File

@ -32,11 +32,27 @@
* init.c (cleanup): Free the memory associated with the base
option (when DEBUG_MALLOC is defined).
2008-07-02 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function idn_decode() to decode ASCII
encoded hostname to the locale.
* host.c : Show hostname to be resolved both in locale and
ASCII encoded.
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP).
2008-06-26 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New functions locale_to_utf8() and
idn_encode() adding basic capabilities of IRI/IDN.
* url.c : Convert URLs from locale to UTF-8 allowing a basic
support of IRI/IDN
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
@ -61,7 +77,7 @@
* http.c: Make -nv --spider include the file's name when it
exists.
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
@ -69,12 +85,57 @@
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function check_encoding_name() as
a preliminary encoding name check.
* main.c, iri.c : Make use of check_encoding_name().
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c : Include missing stringprep.h file and add a
cast.
* init.c : set a default initial value for opt.enable_iri,
opt.locale and opt.encoding_remote.
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : Add a new function find_locale() to find
out the local system encoding.
* main.c : Make use of find_locale().
2008-06-19 Xavier Saint <wget@sxav.eu>
* html-url.c : Add "content-type" meta tag parsing for
retrieving page encoding.
* iri.h : Make no-op version of parse_charset() return
NULL.
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
2008-06-14 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New files.
* Makefile.am : Add files iri.h and conditional iri.c.
* build_info.c : Add compiled feature "iri".
* http.c : include iri.h and parse charset from Content-Type
header.
* init.c, main.c, options.h : if an option isn't supported
at compile time, don't get rid of it and show a dummy
message instead if it is used.
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
@ -118,11 +179,11 @@
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
@ -161,7 +222,7 @@
* options.h: Add an according boolean member to the options
struct.
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.

View File

@ -30,6 +30,10 @@
# Version: @VERSION@
#
if IRI_IS_ENABLED
IRI_OBJ = iri.c
endif
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
@ -40,8 +44,8 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
utils.c \
css-url.h connect.h convert.h cookies.h \
utils.c $(IRI_OBJ) \
css-url.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \

View File

@ -100,6 +100,13 @@ const char* (compiled_features[]) =
#else
"-gettext",
#endif
#ifdef ENABLE_IRI
"+iri",
#else
"-iri",
#endif
/* sentinel value */
NULL
};

View File

@ -266,9 +266,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
if (print)
{
const char *txt_addr = print_address (ip);
if (print && 0 != strcmp (print, txt_addr))
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
escnonprint_uri (print), txt_addr, port);
if (0 != strcmp (print, txt_addr))
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
{
int len = strlen (print) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, print);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
str ? str : escnonprint_uri (print), txt_addr, port);
if (str)
xfree (str);
}
else
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
}

View File

@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
/* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, NULL);
get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the

View File

@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
return FTPRERR;
/* Strip trailing CRLF before printing the line, so that
escnonprint doesn't include bogus \012 and \015. */
quoting doesn't include bogus \012 and \015. */
p = strchr (line, '\0');
if (p > line && p[-1] == '\n')
*--p = '\0';

View File

@ -712,8 +712,24 @@ lookup_host (const char *host, int flags)
/* No luck with the cache; resolve HOST. */
if (!silent && !numeric_address)
logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, host));
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
{
int len = strlen (host) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, host);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, str ? str : host));
if (str)
xfree (str);
}
#ifdef ENABLE_IPV6
{

View File

@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
/* Will contain the (last) charset found in 'http-equiv=content-type'
meta tags */
static char *meta_charset;
static void
init_interesting (void)
{
@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL;
}
url = url_parse (link_uri, NULL);
url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
url = url_parse (complete_uri, NULL);
url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
entry->link_expect_html = 1;
}
}
else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
{
/* Handle stuff like:
<meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
char *mcharset;
char *content = find_attr (tag, "content", NULL);
if (!content)
return;
mcharset = parse_charset (content);
if (!mcharset)
return;
xfree_null (meta_charset);
meta_charset = mcharset;
}
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
<base href=...> and does the right thing. */
struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
/* If meta charset isn't null, override content encoding */
if (iri && meta_charset)
set_content_encoding (iri, meta_charset);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
@ -726,7 +752,7 @@ get_urls_file (const char *file)
url_text = merged;
}
url = url_parse (url_text, &up_error_code);
url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
char *error = url_error (url_text, up_error_code);

View File

@ -44,7 +44,7 @@ struct map_context {
};
struct urlpos *get_urls_file (const char *);
struct urlpos *get_urls_html (const char *, const char *, bool *);
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *);

View File

@ -1364,7 +1364,8 @@ free_hstat (struct http_stat *hs)
If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */
static uerr_t
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
struct iri *iri)
{
struct request *req;
@ -1827,7 +1828,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
hs->local_file = url_file_name (u);
}
}
/* TODO: perform this check only once. */
if (!hs->existence_checked && file_exists_p (hs->local_file))
{
@ -1896,7 +1897,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
}
}
if (!local_dot_orig_file_exists)
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
@ -2048,9 +2049,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
char *tmp = strchr (type, ';');
if (tmp)
{
/* sXXXav: only needed if IRI support is enabled */
char *tmp2 = tmp + 1;
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
{
tmp = parse_charset (tmp2);
if (tmp)
set_content_encoding (iri, tmp);
}
}
}
hs->newloc = resp_header_strdup (resp, "Location");
@ -2325,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
retried, and retried, and retried, and... */
uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
int *dt, struct url *proxy)
int *dt, struct url *proxy, struct iri *iri)
{
int count;
bool got_head = false; /* used for time-stamping and filename detection */
@ -2336,16 +2348,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
uerr_t err, ret = TRYLIMEXC;
time_t tmr = -1; /* remote time-stamp */
struct http_stat hstat; /* HTTP status */
struct_stat st;
struct_stat st;
bool send_head_first = true;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
/* Set LOCAL_FILE parameter. */
if (local_file && opt.output_document)
*local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
/* Reset NEWLOC parameter. */
*newloc = NULL;
@ -2382,7 +2394,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
retrieve the file. But if the output_document was given, then this
test was already done and the file didn't exist. Hence the !opt.output_document */
logprintf (LOG_VERBOSE, _("\
File %s already there; not retrieving.\n\n"),
File %s already there; not retrieving.\n\n"),
quote (hstat.local_file));
/* If the file is there, we suppose it's retrieved OK. */
*dt |= RETROKF;
@ -2398,10 +2410,10 @@ File %s already there; not retrieving.\n\n"),
/* Reset the counter. */
count = 0;
/* Reset the document type. */
*dt = 0;
/* Skip preliminary HEAD request if we're not in spider mode AND
* if -O was given or HTTP Content-Disposition support is disabled. */
if (!opt.spider
@ -2410,21 +2422,21 @@ File %s already there; not retrieving.\n\n"),
/* Send preliminary HEAD request if -N is given and we have an existing
* destination file. */
if (opt.timestamping
if (opt.timestamping
&& !opt.content_disposition
&& file_exists_p (url_file_name (u)))
send_head_first = true;
/* THE loop */
do
{
/* Increment the pass counter. */
++count;
sleep_between_retrievals (count);
/* Get the current time string. */
tms = datetime_str (time (NULL));
if (opt.spider && !got_head)
logprintf (LOG_VERBOSE, _("\
Spider mode enabled. Check if remote file exists.\n"));
@ -2433,20 +2445,20 @@ Spider mode enabled. Check if remote file exists.\n"));
if (opt.verbose)
{
char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
if (count > 1)
if (count > 1)
{
char tmp[256];
sprintf (tmp, _("(try:%2d)"), count);
logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
tms, tmp, hurl);
}
else
else
{
logprintf (LOG_NOTQUIET, "--%s-- %s\n",
tms, hurl);
}
#ifdef WINDOWS
ws_changetitle (hurl);
#endif
@ -2456,7 +2468,7 @@ Spider mode enabled. Check if remote file exists.\n"));
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
if (send_head_first && !got_head)
if (send_head_first && !got_head)
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
@ -2489,11 +2501,11 @@ Spider mode enabled. Check if remote file exists.\n"));
*dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */
err = gethttp (u, &hstat, dt, proxy);
err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */
tms = datetime_str (time (NULL));
/* Get the new location (with or without the redirection). */
if (hstat.newloc)
*newloc = xstrdup (hstat.newloc);
@ -2532,7 +2544,7 @@ Spider mode enabled. Check if remote file exists.\n"));
hstat.statcode);
ret = WRONGCODE;
}
else
else
{
ret = NEWLOCATION;
}
@ -2548,7 +2560,7 @@ Spider mode enabled. Check if remote file exists.\n"));
/* All possibilities should have been exhausted. */
abort ();
}
if (!(*dt & RETROKF))
{
char *hurl = NULL;
@ -2567,11 +2579,13 @@ Spider mode enabled. Check if remote file exists.\n"));
continue;
}
/* Maybe we should always keep track of broken links, not just in
* spider mode. */
else if (opt.spider)
* spider mode.
* Don't log error if it was UTF-8 encoded because we will try
* once unencoded. */
else if (opt.spider && !iri->utf8_encode)
{
/* #### Again: ugly ugly ugly! */
if (!hurl)
if (!hurl)
hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
nonexisting_url (hurl);
logprintf (LOG_NOTQUIET, _("\
@ -2580,7 +2594,7 @@ Remote file does not exist -- broken link!!!\n"));
else
{
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
tms, hstat.statcode,
tms, hstat.statcode,
quotearg_style (escape_quoting_style, hstat.error));
}
logputs (LOG_VERBOSE, "\n");

View File

@ -33,7 +33,7 @@ as that of the covered work. */
struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
struct url *);
struct url *, struct iri *);
void save_cookies (void);
void http_cleanup (void);
time_t http_atotm (const char *);

View File

@ -182,9 +182,11 @@ static const struct {
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
{ "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
{ "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
@ -224,6 +226,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
@ -331,6 +334,14 @@ defaults (void)
opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20;
#ifdef ENABLE_IRI
opt.enable_iri = true;
#else
opt.enable_iri = false;
#endif
opt.locale = NULL;
opt.encoding_remote = NULL;
}
/* Return the user's home directory (strdup-ed), or NULL if none is

348
src/iri.c Normal file
View File

@ -0,0 +1,348 @@
/* IRI related functions.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <iconv.h>
#include <stringprep.h>
#include <idna.h>
#include <errno.h>
#include "utils.h"
/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
/* Note: locale encoding is kept in options struct (opt.locale) */
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
/* Look for a case-insensitive "charset=" marker in STR and return a freshly
   allocated copy of the token that follows it, up to (not including) the
   first whitespace character or the end of the string.

   Returns NULL when STR is NULL or empty, when no marker is present, or when
   the extracted token is rejected by check_encoding_name ().  The caller
   owns the returned string.  */
char *
parse_charset (char *str)
{
  static const char marker[] = "charset=";
  char *begin, *end, *name;

  if (str == NULL || *str == '\0')
    return NULL;

  begin = strcasestr (str, marker);
  if (begin == NULL)
    return NULL;

  /* Step over the marker itself.  */
  begin += sizeof marker - 1;

  /* sXXXav: which chars should be banned ??? */
  for (end = begin; *end != '\0' && !c_isspace (*end); end++)
    ;

  /* sXXXav: could strdupdelim return NULL ? */
  name = strdupdelim (begin, end);

  /* Do a minimum check on the charset value */
  if (check_encoding_name (name))
    return name;

  xfree (name);
  return NULL;
}
/* Return the character set of the current locale as reported by libidn's
   stringprep_locale_charset ().  The result is handed back as-is (only the
   const qualifier is cast away); NOTE(review): presumably it is owned by
   libidn and must not be freed -- confirm against the libidn manual.  */
char *
find_locale (void)
{
  const char *charset = stringprep_locale_charset ();

  return (char *) charset;
}
/* Sanity-check the encoding name ENCODING: every character must be ASCII
   and must not be whitespace.  On the first offending character a verbose
   log line is emitted and false is returned; an empty string is accepted.  */
bool
check_encoding_name (char *encoding)
{
  char *p;

  for (p = encoding; *p != '\0'; p++)
    {
      if (c_isascii (*p) && !c_isspace (*p))
        continue;

      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
      return false;
    }

  return true;
}
/* Try opening an iconv_t descriptor for conversion from locale to UTF-8.

   This is currently an unimplemented placeholder: locale_to_utf8 () opens
   its own descriptor.  It reports failure so that a future caller never
   consumes an indeterminate value from a function that used to fall off
   its end without returning anything.  */
static bool
open_locale_to_utf8 (void)
{
  return false;
}
/* Try converting string STR from the locale encoding (opt.locale) to UTF-8.

   Returns a newly allocated UTF-8 string on success.  Returns STR itself
   when no conversion is needed (locale unknown or already UTF-8), when
   iconv does not support the conversion, or when the conversion fails.
   Callers can therefore detect "nothing was allocated" by comparing the
   result with STR.  */
const char *
locale_to_utf8 (const char *str)
{
  iconv_t l2u;
  char *new;
  bool converted;

  /* That shouldn't happen, just in case */
  if (!opt.locale)
    {
      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
      opt.locale = find_locale ();
    }

  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
    return str;

  l2u = iconv_open ("UTF-8", opt.locale);
  /* iconv_open signals failure with (iconv_t) -1; the test used to be
     inverted, which skipped every conversion that *was* supported and
     attempted the unsupported ones with an invalid descriptor.  */
  if (l2u == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
                 quote (opt.locale), quote ("UTF-8"));
      return str;
    }

  converted = do_conversion (l2u, (char *) str, strlen ((char *) str), &new);

  /* The descriptor is only needed for this one string; release it on
     every path so it does not leak.  */
  iconv_close (l2u);

  return converted ? (const char *) new : str;
}
/* Convert the INLEN bytes at IN using the already opened conversion
   descriptor CD.

   On success, returns true and stores in *OUT a newly allocated,
   NUL-terminated string holding the transcoded text.

   Input bytes that iconv rejects (EILSEQ) or that form an incomplete
   sequence (EINVAL) are copied to the output unchanged and conversion
   resumes after them.  The output buffer is grown on demand.

   On any other iconv error, returns false.  Callers must not rely on *OUT
   in that case; it is nevertheless left pointing at the start of the
   NUL-terminated partial buffer, so that a caller which inspects it anyway
   reads a well-formed string rather than unterminated memory.  */
static bool
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
{
  /* sXXXav : hummm hard to guess... */
  size_t len = inlen * 2;       /* usable capacity of S, NUL excluded */
  size_t outlen = len;          /* bytes of S that are still free */
  bool invalid = false;         /* has the warning been logged already? */
  char *s = xmalloc (len + 1);

  *out = s;

  for (;;)
    {
      int err;

      if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
        {
          /* LEN - OUTLEN is exactly the number of bytes written so far.  */
          s[len - outlen] = '\0';
          *out = s;
          return true;
        }

      /* Save errno now: the logging calls below may clobber it.  */
      err = errno;

      if ((err == EINVAL || err == EILSEQ) && inlen > 0 && outlen > 0)
        {
          /* Incomplete or invalid multibyte sequence */
          if (!invalid)
            logprintf (LOG_VERBOSE,
                       "Incomplete or invalide multibyte sequence encountered\n");
          invalid = true;

          /* Pass the offending byte through untouched and go on.  */
          **out = *in;
          in++;
          inlen--;
          (*out)++;
          outlen--;
        }
      else if (inlen > 0
               && (err == E2BIG || err == EINVAL || err == EILSEQ))
        {
          /* Output buffer full (or no room left to copy an invalid byte):
             grow it, keeping what has been written.  iconv may stop with
             E2BIG while a few bytes are still free, so the amount used is
             computed rather than assumed to be the whole buffer.  */
          size_t used = len - outlen;
          size_t extra = inlen * 2;
          char *bigger = xmalloc (len + extra + 1);

          memcpy (bigger, s, used);
          xfree (s);
          s = bigger;
          len += extra;
          outlen += extra;
          *out = s + used;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
          break;
        }
    }

  s[len - outlen] = '\0';
  *out = s;
  return false;
}
/* Try to "ASCII encode" (IDNA ToASCII) the host name HOST.

   If I->utf8_encode is false, HOST is first transcoded from the remote
   encoding to UTF-8 with remote_to_utf8 (); when that reports that nothing
   was converted (or fails), NULL is returned.  Otherwise HOST is taken to
   be UTF-8 already.

   Returns the newly allocated ASCII domain on success, or NULL on error
   (the libidn error is logged at verbose level).  */
char *
idn_encode (struct iri *i, char *host)
{
  const char *utf8 = NULL;      /* transcoded copy of HOST, if we made one */
  char *ascii = NULL;
  int ret;

  /* Encode to UTF-8 if not done */
  if (!i->utf8_encode)
    {
      if (!remote_to_utf8 (i, (const char *) host, &utf8))
        return NULL;  /* Nothing to encode or an error occured */
      host = (char *) utf8;
    }

  /* toASCII UTF-8 NULL terminated string */
  ret = idna_to_ascii_8z (host, &ascii, IDNA_FLAGS);

  /* The transcoded copy was only an intermediate input; release it on both
     the success and the failure path so it is not leaked.  */
  if (utf8)
    xfree ((char *) utf8);

  if (ret != IDNA_SUCCESS)
    {
      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
                 quote (idna_strerror (ret)));
      return NULL;
    }

  return ascii;
}
/* Decode the "ASCII encoded" host name HOST with libidn's
   idna_to_unicode_8zlz ().  Returns the newly allocated decoded name on
   success; on failure the libidn error is logged at verbose level and NULL
   is returned.  */
char *
idn_decode (char *host)
{
  char *decoded;
  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);

  if (rc == IDNA_SUCCESS)
    return decoded;

  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", rc,
             quote (idna_strerror (rc)));
  return NULL;
}
/* Try to transcode string STR from the remote encoding (I->uri_encoding)
   to UTF-8.

   Returns true and stores a newly allocated string in *NEW only when the
   conversion succeeded AND actually changed the text.  Returns false when
   no remote encoding is known, when iconv cannot perform the conversion,
   when the conversion fails, or when the result is identical to STR; *NEW
   must not be used by the caller in any of those cases.  */
bool
remote_to_utf8 (struct iri *i, const char *str, const char **new)
{
  iconv_t cd;
  bool converted;

  if (!i->uri_encoding)
    return false;

  cd = iconv_open ("UTF-8", i->uri_encoding);
  if (cd == (iconv_t)(-1))
    return false;

  converted = do_conversion (cd, (char *) str, strlen ((char *) str),
                             (char **) new);
  iconv_close (cd);

  /* *NEW is only meaningful after a successful conversion, so it must not
     be compared (or freed) otherwise.  */
  if (!converted)
    return false;

  /* Test if something was converted */
  if (!strcmp (str, *new))
    {
      xfree ((char *) *new);
      *new = NULL;
      return false;
    }

  return true;
}
/* Allocate and initialize a struct iri.  The URI encoding starts out as a
   private copy of opt.encoding_remote (or NULL if that is unset), the
   content encoding is unset, and UTF-8 encoding follows opt.enable_iri.
   Release the result with iri_free ().  */
struct iri *
iri_new (void)
{
  struct iri *fresh = xmalloc (sizeof (struct iri));

  if (opt.encoding_remote)
    fresh->uri_encoding = xstrdup (opt.encoding_remote);
  else
    fresh->uri_encoding = NULL;

  fresh->content_encoding = NULL;
  fresh->utf8_encode = opt.enable_iri;

  return fresh;
}
/* Release the iri structure I together with the two encoding strings it
   owns (either of which may be NULL).  */
void
iri_free (struct iri *i)
{
  if (i->uri_encoding)
    xfree (i->uri_encoding);
  if (i->content_encoding)
    xfree (i->content_encoding);

  xfree (i);
}
/* Replace I->uri_encoding with a private copy of CHARSET (or with NULL when
   CHARSET is NULL).  When the user supplied a remote encoding
   (opt.encoding_remote), the call is a no-op unless FORCE is true.  Nothing
   is reallocated if CHARSET already matches the stored value, compared
   case-insensitively.  */
void
set_uri_encoding (struct iri *i, char *charset, bool force)
{
  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));

  if (opt.encoding_remote && !force)
    return;

  if (i->uri_encoding != NULL)
    {
      /* Already set to the same thing: keep the existing copy.  */
      if (charset != NULL && strcasecmp (i->uri_encoding, charset) == 0)
        return;
      xfree (i->uri_encoding);
    }

  if (charset != NULL)
    i->uri_encoding = xstrdup (charset);
  else
    i->uri_encoding = NULL;
}
/* Replace I->content_encoding with a private copy of CHARSET (or with NULL
   when CHARSET is NULL).  The call is a no-op when the user supplied a
   remote encoding (opt.encoding_remote), and nothing is reallocated if
   CHARSET already matches the stored value, compared case-insensitively.  */
void
set_content_encoding (struct iri *i, char *charset)
{
  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));

  if (opt.encoding_remote)
    return;

  if (i->content_encoding != NULL)
    {
      /* Already set to the same thing: keep the existing copy.  */
      if (charset != NULL && strcasecmp (i->content_encoding, charset) == 0)
        return;
      xfree (i->content_encoding);
    }

  if (charset != NULL)
    i->content_encoding = xstrdup (charset);
  else
    i->content_encoding = NULL;
}

70
src/iri.h Normal file
View File

@ -0,0 +1,70 @@
/* Internationalization related declarations.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#ifndef IRI_H
#define IRI_H
/* IRI state handed to the retrieval functions.  Instances are created by
   iri_new () and released by iri_free (); the two encoding strings are
   private copies owned by the structure (set through set_uri_encoding ()
   and set_content_encoding ()) and may be NULL when unknown.  */
struct iri {
char *uri_encoding; /* Encoding of the uri to fetch */
char *content_encoding; /* Encoding of links inside the fetched file */
bool utf8_encode; /* Will/Is the current url encoded in utf8 */
};
/* With ENABLE_IRI defined, the functions below are declared here for
   their implementation in iri.c.  Without it, the same names are provided
   as no-op macros so that callers need no #ifdef of their own.  */
#ifdef ENABLE_IRI
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
char *idn_encode (struct iri *i, char *host);
char *idn_decode (char *host);
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
struct iri *iri_new (void);
void iri_free (struct iri *i);
void set_uri_encoding (struct iri *i, char *charset, bool force);
void set_content_encoding (struct iri *i, char *charset);
#else /* ENABLE_IRI */
/* Shared placeholder whose address iri_new () yields in this configuration.
   NOTE(review): this is an object definition inside a header, so every
   file including iri.h gets its own (tentative) definition; that looks
   like it could cause duplicate-symbol link errors on toolchains that do
   not merge common symbols -- confirm.  */
struct iri dummy_iri;
/* Lookups yield NULL/false, locale_to_utf8 passes its argument through,
   and iri_free and the setters expand to nothing.  */
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
#define locale_to_utf8(str) (str)
#define idn_encode(a,b) NULL
#define idn_decode(str) NULL
#define remote_to_utf8(a,b,c) false
#define iri_new() (&dummy_iri)
#define iri_free(a)
#define set_uri_encoding(a,b,c)
#define set_content_encoding(a,b)
#endif /* ENABLE_IRI */
#endif /* IRI_H */

View File

@ -43,7 +43,7 @@ as that of the covered work. */
#include "utils.h"
#include "log.h"
/* This file impplement support for "logging". Logging means printing
/* This file implements support for "logging". Logging means printing
output, plus several additional features:
- Cataloguing output by importance. You can specify that a log

View File

@ -201,10 +201,12 @@ static struct cmdline_option option_data[] =
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
{ "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
@ -238,6 +240,7 @@ static struct cmdline_option option_data[] =
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@ -1062,6 +1065,27 @@ for details.\n\n"));
exit (1);
}
#ifdef ENABLE_IRI
if (opt.enable_iri)
{
if (opt.locale && !check_encoding_name (opt.locale))
opt.locale = NULL;
if (!opt.locale)
opt.locale = find_locale ();
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
opt.encoding_remote = NULL;
}
#else
if (opt.enable_iri || opt.locale || opt.encoding_remote)
{
/* sXXXav : be more specific... */
printf(_("This version does not have support for IRIs\n"));
exit(1);
}
#endif
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
@ -1171,15 +1195,21 @@ WARNING: Can't reopen standard output in binary mode;\n\
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
if (url_scheme (*t) == SCHEME_FTP)
if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
status = retrieve_tree (*t);
status = retrieve_tree (*t, NULL);
opt.follow_ftp = old_follow_ftp;
}
else
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
{
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale, true);
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
opt.recursive, i);
iri_free (i);
}
if (opt.delete_after && file_exists_p(filename))
{

View File

@ -239,6 +239,10 @@ struct options
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
bool enable_iri;
char *encoding_remote;
char *locale;
};
extern struct options opt;

View File

@ -51,7 +51,7 @@ as that of the covered work. */
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
/* Functions for maintaining the URL queue. */
struct queue_element {
@ -60,6 +60,7 @@ struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
@ -93,11 +94,12 @@ url_queue_delete (struct url_queue *queue)
into it. */
static void
url_enqueue (struct url_queue *queue,
url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
@ -112,6 +114,10 @@ url_enqueue (struct url_queue *queue,
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
if (i)
DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue,
succeeded, or false if the queue is empty. */
static bool
url_dequeue (struct url_queue *queue,
url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue,
if (!queue->head)
queue->tail = NULL;
*i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue,
}
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
@ -180,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int,
options, add it to the queue. */
uerr_t
retrieve_tree (const char *start_url)
retrieve_tree (const char *start_url, struct iri *pi)
{
uerr_t status = RETROK;
@ -192,8 +199,22 @@ retrieve_tree (const char *start_url)
struct hash_table *blacklist;
int up_error_code;
struct url *start_url_parsed = url_parse (start_url, &up_error_code);
struct url *start_url_parsed;
struct iri *i = iri_new ();
#define COPYSTR(x) (x) ? xstrdup(x) : NULL;
/* Duplicate pi struct if not NULL */
if (pi)
{
i->uri_encoding = COPYSTR (pi->uri_encoding);
i->content_encoding = COPYSTR (pi->content_encoding);
i->utf8_encode = pi->utf8_encode;
}
else
set_uri_encoding (i, opt.locale, true);
#undef COPYSTR
start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed)
{
char *error = url_error (start_url, up_error_code);
@ -207,7 +228,8 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
@ -226,7 +248,7 @@ retrieve_tree (const char *start_url)
/* Get the next URL from the queue... */
if (!url_dequeue (queue,
if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
@ -267,7 +289,8 @@ retrieve_tree (const char *start_url)
int dt = 0;
char *redirected = NULL;
status = retrieve_url (url, &file, &redirected, referer, &dt, false);
status = retrieve_url (url, &file, &redirected, referer, &dt,
false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
@ -295,7 +318,7 @@ retrieve_tree (const char *start_url)
if (descend)
{
if (!descend_redirect_p (redirected, url, depth,
start_url_parsed, blacklist))
start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
@ -347,7 +370,7 @@ retrieve_tree (const char *start_url)
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, &meta_disallow_follow);
get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
@ -358,7 +381,8 @@ retrieve_tree (const char *start_url)
if (children)
{
struct urlpos *child = children;
struct url *url_parsed = url_parsed = url_parse (url, NULL);
struct url *url_parsed = url_parse (url, NULL, i);
struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
@ -375,9 +399,11 @@ retrieve_tree (const char *start_url)
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
blacklist))
blacklist, i))
{
url_enqueue (queue, xstrdup (child->url->url),
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding, false);
url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
@ -395,18 +421,18 @@ retrieve_tree (const char *start_url)
}
}
if (file
&& (opt.delete_after
if (file
&& (opt.delete_after
|| opt.spider /* opt.recursive is implicitely true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
(otherwise unneeded because of --spider or rejected by -R)
HTML file just to harvest its hyperlinks -- in either case,
(otherwise unneeded because of --spider or rejected by -R)
HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
(opt.spider ? "--spider" :
(opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
@ -422,6 +448,7 @@ retrieve_tree (const char *start_url)
xfree (url);
xfree_null (referer);
xfree_null (file);
iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
@ -430,9 +457,11 @@ retrieve_tree (const char *start_url)
char *d1, *d2;
int d3;
bool d4, d5;
while (url_dequeue (queue,
struct iri *d6;
while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
iri_free (d6);
xfree (d1);
xfree_null (d2);
}
@ -461,7 +490,8 @@ retrieve_tree (const char *start_url)
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
@ -471,7 +501,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (string_set_contains (blacklist, url))
{
if (opt.spider)
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
@ -602,7 +632,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!specs)
{
char *rfile;
if (res_retrieve_file (url, &rfile))
if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
@ -657,23 +687,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
orig_parsed = url_parse (original, NULL);
orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist);
start_url_parsed, blacklist, iri);
url_free (orig_parsed);
url_free (new_parsed);

View File

@ -42,6 +42,6 @@ as that of the covered work. */
struct urlpos;
void recursive_cleanup (void);
uerr_t retrieve_tree (const char *);
uerr_t retrieve_tree (const char *, struct iri *);
#endif /* RECUR_H */

View File

@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
Return true if robots were retrieved OK, false otherwise. */
bool
res_retrieve_file (const char *url, char **file)
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider;
/* Copy server URI encoding for a possible IDNA transformation, no need to
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
set_uri_encoding (i, iri->uri_encoding, false);
i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val;
opt.spider = saved_sp_val;
xfree (robots_url);
iri_free (i);
if (err != RETROK && *file != NULL)
{

View File

@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int);
bool res_retrieve_file (const char *, char **);
bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *);

View File

@ -597,7 +597,7 @@ static char *getproxy (struct url *);
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
const char *refurl, int *dt, bool recursive)
const char *refurl, int *dt, bool recursive, struct iri *iri)
{
uerr_t result;
char *url;
@ -625,7 +625,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file)
*file = NULL;
u = url_parse (url, &up_error_code);
second_try:
u = url_parse (url, &up_error_code, iri);
if (!u)
{
char *error = url_error (url, up_error_code);
@ -635,6 +636,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
return URLERROR;
}
DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
iri->utf8_encode));
if (!refurl)
refurl = opt.referer;
@ -648,8 +653,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
proxy = getproxy (u);
if (proxy)
{
struct iri *pi = iri_new ();
set_uri_encoding (pi, opt.locale, true);
pi->utf8_encode = false;
/* Parse the proxy URL. */
proxy_url = url_parse (proxy, &up_error_code);
proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url)
{
char *error = url_error (proxy, up_error_code);
@ -676,7 +685,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
@ -726,8 +735,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
xfree (mynewloc);
mynewloc = construced_newloc;
/* Reset UTF-8 encoding state, keep the URI encoding and reset
the content encoding. */
iri->utf8_encode = opt.enable_iri;
set_content_encoding (iri, NULL);
/* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code);
newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed)
{
char *error = url_error (mynewloc, up_error_code);
@ -776,8 +790,21 @@ retrieve_url (const char *origurl, char **file, char **newloc,
goto redirected;
}
if (local_file)
/* Try to not encode in UTF-8 if fetching failed */
if (!(*dt & RETROKF) && iri->utf8_encode)
{
iri->utf8_encode = false;
DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
goto second_try;
}
if (local_file && *dt & RETROKF)
{
register_download (u->url, local_file);
if (redirection_count && 0 != strcmp (origurl, u->url))
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
@ -827,13 +854,18 @@ retrieve_from_file (const char *file, bool html, int *count)
{
uerr_t status;
struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_uri_encoding (iri, opt.locale, true);
set_content_encoding (iri, opt.locale);
if (url_has_scheme (url))
{
int dt;
@ -842,17 +874,21 @@ retrieve_from_file (const char *file, bool html, int *count)
if (!opt.base_href)
opt.base_href = xstrdup (url);
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
if (status != RETROK)
return status;
if (dt & TEXTHTML)
html = true;
/* If we have a found a content encoding, use it */
if (iri->content_encoding)
set_uri_encoding (iri, iri->content_encoding, false);
}
else
input_file = (char *) file;
url_list = (html ? get_urls_html (input_file, NULL, NULL)
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@ -868,21 +904,26 @@ retrieve_from_file (const char *file, bool html, int *count)
status = QUOTEXC;
break;
}
/* Reset UTF-8 encode status */
iri->utf8_encode = opt.enable_iri;
if ((opt.recursive || opt.page_requisites)
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
if (cur_url->url->scheme == SCHEME_FTP)
if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
status = retrieve_tree (cur_url->url->url);
status = retrieve_tree (cur_url->url->url, iri);
opt.follow_ftp = old_follow_ftp;
}
else
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
&dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
@ -901,6 +942,8 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
/* Free the linked list of URL-s. */
free_urlpos (url_list);
iri_free (iri);
return status;
}
@ -1053,7 +1096,11 @@ bool
url_uses_proxy (const char *url)
{
bool ret;
struct url *u = url_parse (url, NULL);
struct url *u;
struct iri *i = iri_new();
/* url was given in the command line, so use locale as encoding */
set_uri_encoding (i, opt.locale, true);
u= url_parse (url, NULL, i);
if (!u)
return false;
ret = getproxy (u) != NULL;

View File

@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);

View File

@ -640,7 +640,7 @@ static const char *parse_errors[] = {
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
url_parse (const char *url, int *error)
url_parse (const char *url, int *error, struct iri *iri)
{
struct url *u;
const char *p;
@ -659,7 +659,7 @@ url_parse (const char *url, int *error)
int port;
char *user = NULL, *passwd = NULL;
char *url_encoded = NULL;
char *url_encoded = NULL, *new_url = NULL;
int error_code;
@ -670,9 +670,20 @@ url_parse (const char *url, int *error)
goto error;
}
url_encoded = reencode_escapes (url);
if (iri && iri->utf8_encode)
{
url_unescape ((char *) url);
iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
if (!iri->utf8_encode)
new_url = NULL;
}
url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded;
if (new_url && url_encoded != new_url)
xfree (new_url);
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
@ -842,6 +853,18 @@ url_parse (const char *url, int *error)
{
url_unescape (u->host);
host_modified = true;
/* Apply IDNA regardless of iri->utf8_encode status */
if (opt.enable_iri && iri)
{
char *new = idn_encode (iri, u->host);
if (new)
{
xfree (u->host);
u->host = new;
host_modified = true;
}
}
}
if (params_b)
@ -851,7 +874,7 @@ url_parse (const char *url, int *error)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
if (path_modified || u->fragment || host_modified || path_b == path_e)
if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild

View File

@ -84,7 +84,7 @@ struct url
char *url_escape (const char *);
struct url *url_parse (const char *, int *);
struct url *url_parse (const char *, int *, struct iri *iri);
char *url_error (const char *, int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);

View File

@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
#include "quote.h"
#include "quotearg.h"
/* Likewise for struct iri definition */
#include "iri.h"
/* Useful macros used across the code: */
/* The number of elements in an array. For example:

View File

@ -1,3 +1,30 @@
2008-08-14 Xavier Saint <wget@sxav.eu>
* Test-iri-list.px : Fetch files from a remote list.
2008-08-03 Xavier Saint <wget@sxav.eu>
* Test-iri.px : HTTP recursive fetch for testing IRI support and
fallback.
* Test-iri-disabled.px : Same file structure as Test-iri.px but with
IRI support disabled
* Test-iri-forced-remote.px : There's a difference between ISO-8859-1
and ISO-8859-15 for character 0xA4 (respectively currency sign and
euro sign). So with a forced ISO-8859-1 remote encoding, wget should
see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead
of using the ISO-8859-15 given by the server.
* Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale
and expect wget to fetch the file UTF-8 encoded.
* Test-ftp-iri-fallback.px : Same as above but wget should fall back on
the locale encoding to fetch the file.
* Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support
disabled. The UTF-8 encoded file should not be retrieved.
2008-06-22 Micah Cowan <micah@cowan.name>
* Test-proxied-https-auth.px: Shift exit code so it falls in the

50
tests/Test-ftp-iri-disabled.px Executable file
View File

@ -0,0 +1,50 @@
#!/usr/bin/perl -w

# FTP retrieval with IRI support switched off (--iri=no).
#
# The server offers the same file under two names: one spelled with a
# Latin-1 c-cedilla and one spelled with its UTF-8 encoding.  The URL on the
# command line uses the Latin-1 byte, and because IRI handling is disabled
# the only file expected on disk is the Latin-1 named one.

use strict;

use FTPTest;


###############################################################################

# "c with cedilla" as a single ISO-8859-1 byte and as a UTF-8 sequence.
my $ccedilla_l1 = "\xE7";
my $ccedilla_u8 = "\xC3\xA7";

my $francais = <<EOF;
Some text.
EOF

# Use CRLF line endings for the served/expected body.  /g keeps this
# correct should the body ever grow beyond a single line.
$francais =~ s/\n/\r\n/g;

# Files offered by the test FTP server, keyed by path.
my %urls = (
    "/fran${ccedilla_u8}ais.txt" => {
        content => $francais,
    },
    "/fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";

my $expected_error_code = 0;

# Only the Latin-1 named file is expected after the run.
my %expected_downloaded_files = (
    "fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

###############################################################################

# The name matches this script (it used to be the copy-pasted "Test-ftp-iri").
my $the_test = FTPTest->new (name => "Test-ftp-iri-disabled",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

46
tests/Test-ftp-iri-fallback.px Executable file
View File

@ -0,0 +1,46 @@
#!/usr/bin/perl -w

# FTP retrieval where only the locale-encoded (Latin-1) file name exists on
# the server.  No UTF-8 named variant is offered, and the file expected on
# disk afterwards carries the Latin-1 name given on the command line.

use strict;

use FTPTest;


###############################################################################

# "c with cedilla" as a single ISO-8859-1 byte and as a UTF-8 sequence.
my $ccedilla_l1 = "\xE7";
my $ccedilla_u8 = "\xC3\xA7";

my $francais = <<EOF;
Some text.
EOF

# Use CRLF line endings for the served/expected body.  /g keeps this
# correct should the body ever grow beyond a single line.
$francais =~ s/\n/\r\n/g;

# Files offered by the test FTP server, keyed by path.
my %urls = (
    "/fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";

my $expected_error_code = 0;

my %expected_downloaded_files = (
    "fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

###############################################################################

# The name matches this script (it used to be the copy-pasted "Test-ftp-iri").
my $the_test = FTPTest->new (name => "Test-ftp-iri-fallback",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

47
tests/Test-ftp-iri.px Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/perl -w

# FTP retrieval with a locale-encoded URL: the command line spells the file
# name with a Latin-1 c-cedilla, the server only offers the UTF-8 spelling,
# and the UTF-8 named file is what must end up on disk.

use strict;

use FTPTest;


###############################################################################

# The same character in both encodings.
my $cedilla_latin1 = "\xE7";
my $cedilla_utf8   = "\xC3\xA7";

# File body, terminated by CRLF.
my $body = "Some text.\r\n";

# Name of the file as stored on the server / expected locally.
my $utf8_name = "fran${cedilla_utf8}ais.txt";

# Files offered by the test FTP server, keyed by path.
my %urls = (
    "/$utf8_name" => { content => $body },
);

my $cmdline = $WgetTest::WGETPATH
            . " --locale=iso-8859-1 -S"
            . " ftp://localhost:{{port}}/fran${cedilla_latin1}ais.txt";

my $expected_error_code = 0;

my %expected_downloaded_files = (
    $utf8_name => { content => $body },
);

###############################################################################

my $the_test = FTPTest->new (name => "Test-ftp-iri",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

196
tests/Test-iri-disabled.px Executable file
View File

@ -0,0 +1,196 @@
#!/usr/bin/perl -w

# Recursive HTTP fetch with IRI support switched off (--iri=no).
#
# The server publishes every page under both its raw-byte (Latin-1 /
# Latin-9) path and its UTF-8 path.  The links inside the pages use the raw
# bytes, and the files expected on disk carry those same raw-byte names.

use strict;

use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html                  mime + file = iso-8859-15
# p1_français.html            meta + file = iso-8859-1, mime = utf-8
# p2_één.html                 mime + file = iso-8859-1
# p3_€€€.html                 meta + file = utf-8, mime = iso-8859-1
#

# The test characters, each as a legacy single byte and as UTF-8.
my $ccedilla_l15 = "\xE7";
my $ccedilla_u8 = "\xC3\xA7";
my $eacute_l1 = "\xE9";
my $eacute_u8 = "\xC3\xA9";
my $eurosign_l15 = "\xA4";
my $eurosign_u8 = "\xE2\x82\xAC";

my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF

my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF

my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
</p>
</body>
</html>
EOF

my $pageeuro = <<EOF;
<html>
<head>
<title>Euro page</title>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF

my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF

# code, msg, headers, content
my %urls = (
    '/index.html' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=ISO-8859-15",
        },
        content => $pageindex,
    },
    '/robots.txt' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => "",
    },
    '/p1_fran%C3%A7ais.html' => {   # UTF-8 encoded
        code => "200",
        # Reason phrase now agrees with the 200 status; it previously read
        # "File not found", left over from the 404 variant of this entry.
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=UTF-8",
        },
        content => $pagefrancais,
    },
    '/p1_fran%E7ais.html' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=UTF-8",
        },
        content => $pagefrancais,
    },
    '/p2_%C3%A9%C3%A9n.html' => {   # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=UTF-8",
        },
        content => $pageeen,
    },
    '/p2_%E9%E9n.html' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/html; charset=ISO-8859-1",
        },
        content => $pageeen,
    },
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => $pageeuro,
    },
    '/p3_%A4%A4%A4.html' => {
        code => "200",
        msg => "Ok",
        headers => {
            "Content-type" => "text/plain",
        },
        content => $pageeuro,
    },
);

my $cmdline = $WgetTest::WGETPATH . " --iri=no -nH -r http://localhost:{{port}}/";

my $expected_error_code = 0;

# With IRI disabled the local names keep the raw bytes found in the links.
my %expected_downloaded_files = (
    'index.html' => {
        content => $pageindex,
    },
    'robots.txt' => {
        content => "",
    },
    "p1_fran${ccedilla_l15}ais.html" => {
        content => $pagefrancais,
    },
    "p2_${eacute_l1}${eacute_l1}n.html" => {
        content => $pageeen,
    },
    "p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => {
        content => $pageeuro,
    },
);

###############################################################################

my $the_test = HTTPTest->new (name => "Test-iri-disabled",
                              input => \%urls,
                              cmdline => $cmdline,
                              errcode => $expected_error_code,
                              output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

207
tests/Test-iri-forced-remote.px Executable file
View File

@ -0,0 +1,207 @@
#!/usr/bin/perl -w

# Recursive HTTP fetch with the remote encoding forced to ISO-8859-1 via
# --remote-encoding, overriding the ISO-8859-15 charset the server announces
# for the index page.  Byte 0xA4 is the discriminator: it is the currency
# sign in ISO-8859-1 and the euro sign in ISO-8859-15, so the expected name
# of page 3 uses the UTF-8 encoding of the currency sign.

use strict;

use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
# Force remote encoding to ISO-8859-1
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html                  mime + file = iso-8859-15
# p1_français.html            meta + file = iso-8859-1, mime = utf-8
# p2_één.html                 mime + file = iso-8859-1
# p3_€€€.html                 meta + file = utf-8, mime = iso-8859-1
#

# The test characters, each as a legacy single byte and as UTF-8.
my $ccedilla_l15 = "\xE7";
my $ccedilla_u8  = "\xC3\xA7";
my $eacute_l1    = "\xE9";
my $eacute_u8    = "\xC3\xA9";
my $eurosign_l15 = "\xA4";
my $eurosign_u8  = "\xE2\x82\xAC";
my $currency_l1  = "\xA4";
my $currency_u8  = "\xC2\xA4";

# Build one server response record (code, msg, headers, content).
sub http_reply {
    my ($code, $msg, $content_type, $content) = @_;
    return {
        code    => $code,
        msg     => $msg,
        headers => { "Content-type" => $content_type },
        content => $content,
    };
}

my $index_html = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF

my $french_html = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF

my $dutch_html = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
</p>
</body>
</html>
EOF

my $euro_html = <<EOF;
<html>
<head>
<title>Euro page</title>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF

my $not_found_html = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF

# Everything the test HTTP server answers, keyed by request path.
my %urls = (
    '/index.html' =>
        http_reply ("200", "Ok", "text/html; charset=ISO-8859-15", $index_html),
    '/robots.txt' =>
        http_reply ("200", "Ok", "text/plain", ""),

    # Page 1: the UTF-8 encoded path is missing, the Latin-1 one exists.
    '/p1_fran%C3%A7ais.html' =>
        http_reply ("404", "File not found", "text/html; charset=UTF-8", $not_found_html),
    '/p1_fran%E7ais.html' =>
        http_reply ("200", "Ok", "text/html; charset=UTF-8", $french_html),

    # Page 2: reachable under both the UTF-8 and the Latin-1 path.
    '/p2_%C3%A9%C3%A9n.html' =>
        http_reply ("200", "Ok", "text/html; charset=UTF-8", $dutch_html),
    '/p2_%E9%E9n.html' =>
        http_reply ("200", "Ok", "text/html; charset=ISO-8859-1", $dutch_html),

    # Page 3: UTF-8 euro sign, raw 0xA4 byte, and UTF-8 currency sign.
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' =>
        http_reply ("200", "Ok", "text/plain", $euro_html),
    '/p3_%A4%A4%A4.html' =>
        http_reply ("200", "Ok", "text/plain", $euro_html),
    '/p3_%C2%A4%C2%A4%C2%A4.html' =>
        http_reply ("200", "Ok", "text/plain", $euro_html),
);

my $cmdline = join (" ",
                    $WgetTest::WGETPATH,
                    "--iri",
                    "--remote-encoding=iso-8859-1",
                    "-nH",
                    "-r",
                    "http://localhost:{{port}}/");

my $expected_error_code = 0;

# Local file names expected after the run, with the content of each.
my %expected_downloaded_files = (
    'index.html'                                         => { content => $index_html },
    'robots.txt'                                         => { content => "" },
    "p1_fran${ccedilla_l15}ais.html"                     => { content => $french_html },
    "p2_${eacute_u8}${eacute_u8}n.html"                  => { content => $dutch_html },
    "p3_${currency_u8}${currency_u8}${currency_u8}.html" => { content => $euro_html },
);

###############################################################################

my $the_test = HTTPTest->new (name => "Test-iri-forced-remote",
                              input => \%urls,
                              cmdline => $cmdline,
                              errcode => $expected_error_code,
                              output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

173
tests/Test-iri-list.px Executable file
View File

@ -0,0 +1,173 @@
#!/usr/bin/perl -w
use strict;
use HTTPTest;
# cf. http://en.wikipedia.org/wiki/Latin1
# http://en.wikipedia.org/wiki/ISO-8859-15
###############################################################################
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html mime + file = iso-8859-15
# p1_français.html meta + file = iso-8859-1, mime = utf-8
# p2_één.html meta + file = utf-8, mime = iso-8859-1
#
# Raw bytes for the accented characters used in URLs and file names:
#   *_l1 : single-byte (Latin-1) form
#   *_u8 : UTF-8 byte sequence
my $ccedilla_l1 = chr 0xE7;
my $ccedilla_u8 = pack 'C*', 0xC3, 0xA7;
my $eacute_l1   = chr 0xE9;
my $eacute_u8   = pack 'C*', 0xC3, 0xA9;

# Body of the URL list: one URL per line, each terminated by a newline.
# The non-ASCII path characters are written with the single-byte constants.
my $list_base = 'http://localhost:{{port}}/';
my $urllist = join q{}, map { $list_base . $_ . "\n" } (
    q{},
    'p1_fran' . $ccedilla_l1 . 'ais.html',
    'p2_' . ($eacute_l1 x 2) . 'n.html',
);
# HTML bodies used by this test. Each heredoc is used twice: as the content
# the test server returns (see %urls) and as the content the downloaded file
# must match (see %expected_downloaded_files).

# Body for /index.html: a plain page with no links and no meta charset.
my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Main page.
</p>
</body>
</html>
EOF
# Body for the p1 page: its meta tag declares ISO-8859-1.
my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
French page.
</p>
</body>
</html>
EOF
# Body for the p2 page: its meta tag declares UTF-8.
my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
Dutch page.
</p>
</body>
</html>
EOF
# Body returned with the 404 response (the UTF-8 encoded p1 URL in %urls).
my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF
# Build one server response description: code, msg, headers, content.
# The only header set is Content-type.
sub _http_response {
    my ($code, $msg, $content_type, $content) = @_;
    return {
        code    => $code,
        msg     => $msg,
        headers => {
            "Content-type" => $content_type,
        },
        content => $content,
    };
}

# Responses of the test server, keyed by the (percent-encoded) request path.
my %urls = (
    '/index.html' =>
        _http_response("200", "Ok", "text/html; charset=ISO-8859-15", $pageindex),
    '/robots.txt' =>
        _http_response("200", "Ok", "text/plain", ""),
    # UTF-8 encoded
    '/p1_fran%C3%A7ais.html' =>
        _http_response("404", "File not found", "text/html; charset=UTF-8", $page404),
    '/p1_fran%E7ais.html' =>
        _http_response("200", "Ok", "text/html; charset=UTF-8", $pagefrancais),
    # UTF-8 encoded
    '/p2_%C3%A9%C3%A9n.html' =>
        _http_response("200", "Ok", "text/html; charset=ISO-8859-1", $pageeen),
    '/p2_%E9%E9n.html' =>
        _http_response("200", "Ok", "text/html; charset=ISO-8859-1", $pageeen),
    # The list of URLs handed to wget via -i.
    '/url_list.txt' =>
        _http_response("200", "Ok", "text/plain; charset=ISO-8859-1", $urllist),
);
# Command under test: wget with IRI support reads its URLs from the list
# served by the test server. {{port}} is left as a literal placeholder.
# NOTE(review): -d also turns on wget's debug output; confirm it is intended.
my $cmdline = join ' ',
    $WgetTest::WGETPATH,
    '--iri',
    '-d',
    '-i',
    'http://localhost:{{port}}/url_list.txt';

# wget is expected to exit successfully.
my $expected_error_code = 0;

# Files expected after the run, keyed by file name: p1 keeps the single-byte
# c-cedilla, p2 is named with the UTF-8 form of e-acute.
my %expected_downloaded_files = (
    'url_list.txt' => { content => $urllist },
    'index.html'   => { content => $pageindex },
    'p1_fran' . $ccedilla_l1 . 'ais.html' => { content => $pagefrancais },
    'p2_' . ($eacute_u8 x 2) . 'n.html'   => { content => $pageeen },
);

###############################################################################

# Arguments are kept in an ordered list so they reach the constructor in the
# same order as before.
my @test_args = (
    name    => "Test-iri-list",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
my $the_test = HTTPTest->new(@test_args);

# The script's exit status is whatever run() returns.
exit $the_test->run();
# vim: et ts=4 sw=4

224
tests/Test-iri.px Executable file
View File

@ -0,0 +1,224 @@
#!/usr/bin/perl -w
use strict;
use HTTPTest;
# cf. http://en.wikipedia.org/wiki/Latin1
# http://en.wikipedia.org/wiki/ISO-8859-15
###############################################################################
#
# mime : charset found in Content-Type HTTP MIME header
# meta : charset found in Content-Type meta tag
#
# index.html mime + file = iso-8859-15
# p1_français.html meta + file = iso-8859-1, mime = utf-8
# p2_één.html meta + file = utf-8, mime = iso-8859-1
# p3_€€€.html meta + file = utf-8, mime = iso-8859-1
# p4_méér.html mime + file = utf-8
#
# Raw bytes for the non-ASCII characters used in URLs and file names:
#   *_l1 / *_l15 : single-byte form
#   *_u8         : UTF-8 byte sequence
my $ccedilla_l15 = chr 0xE7;
my $ccedilla_u8  = pack 'C*', 0xC3, 0xA7;
my $eacute_l1    = chr 0xE9;
my $eacute_u8    = pack 'C*', 0xC3, 0xA9;
my $eurosign_l15 = chr 0xA4;
my $eurosign_u8  = pack 'C*', 0xE2, 0x82, 0xAC;
# Body for /index.html. It has no meta charset. Its two links embed the
# single-byte constants ($ccedilla_l15, $eurosign_l15) directly in the URL
# path, pointing at page 1 and page 3.
my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF
# Body for page 1. Its meta tag declares ISO-8859-1, and its link to page 2
# embeds the single-byte $eacute_l1 in the URL path.
my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF
# Body for page 2. Its meta tag declares UTF-8, and its link to page 4 embeds
# the UTF-8 byte sequence $eacute_u8 in the URL path.
# The same text is served by the test server and compared against the
# downloaded file, so the markup only has to be self-consistent. The link
# text spells both character references with their terminating semicolon.
my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
<a href="http://localhost:{{port}}/p4_m${eacute_u8}${eacute_u8}r.html">M&eacute;&eacute;r</a>
</p>
</body>
</html>
EOF
# Body for page 3. Its meta tag declares UTF-8; it contains no links.
my $pageeuro = <<EOF;
<html>
<head>
<title>Euro page</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF
# Body for page 4. No meta charset and no links.
my $pagemeer = <<EOF;
<html>
<head>
<title>Bekende supermarkt</title>
</head>
<body>
<p>
Ik ben toch niet gek !
</p>
</body>
</html>
EOF
# Body returned with the 404 response (the UTF-8 encoded p1 URL in %urls).
my $page404 = <<EOF;
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
EOF
# Build one server response description: code, msg, headers, content.
# The only header set is Content-type.
sub _http_response {
    my ($code, $msg, $content_type, $content) = @_;
    return {
        code    => $code,
        msg     => $msg,
        headers => {
            "Content-type" => $content_type,
        },
        content => $content,
    };
}

# Responses of the test server, keyed by the (percent-encoded) request path.
my %urls = (
    '/index.html' =>
        _http_response("200", "Ok", "text/html; charset=ISO-8859-15", $pageindex),
    '/robots.txt' =>
        _http_response("200", "Ok", "text/plain", ""),
    # UTF-8 encoded
    '/p1_fran%C3%A7ais.html' =>
        _http_response("404", "File not found", "text/html; charset=UTF-8", $page404),
    '/p1_fran%E7ais.html' =>
        _http_response("200", "Ok", "text/html; charset=UTF-8", $pagefrancais),
    # UTF-8 encoded
    '/p2_%C3%A9%C3%A9n.html' =>
        _http_response("200", "Ok", "text/html; charset=ISO-8859-1", $pageeen),
    '/p2_%E9%E9n.html' =>
        _http_response("200", "Ok", "text/html; charset=ISO-8859-1", $pageeen),
    # UTF-8 encoded
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' =>
        _http_response("200", "Ok", "text/plain; charset=ISO-8859-1", $pageeuro),
    '/p3_%A4%A4%A4.html' =>
        _http_response("200", "Ok", "text/plain; charset=ISO-8859-1", $pageeuro),
    '/p4_m%C3%A9%C3%A9r.html' =>
        _http_response("200", "Ok", "text/plain; charset=UTF-8", $pagemeer),
);
# Command under test: recursive (-r) fetch of the local test server with IRI
# support switched on and --restrict-file-names=nocontrol.
# {{port}} is left as a literal placeholder in the string.
my $cmdline = join ' ',
    $WgetTest::WGETPATH,
    '--iri',
    '--restrict-file-names=nocontrol',
    '-nH',
    '-r',
    'http://localhost:{{port}}/';

# wget is expected to exit successfully.
my $expected_error_code = 0;

# Files expected after the run, keyed by file name: p1 keeps the single-byte
# c-cedilla, while p2, p3 and p4 are named with the UTF-8 byte sequences.
my %expected_downloaded_files = (
    'index.html' => { content => $pageindex },
    'robots.txt' => { content => q{} },
    'p1_fran' . $ccedilla_l15 . 'ais.html' => { content => $pagefrancais },
    'p2_' . ($eacute_u8 x 2) . 'n.html'    => { content => $pageeen },
    'p3_' . ($eurosign_u8 x 3) . '.html'   => { content => $pageeuro },
    'p4_m' . ($eacute_u8 x 2) . 'r.html'   => { content => $pagemeer },
);

###############################################################################

# Arguments are kept in an ordered list so they reach the constructor in the
# same order as before.
my @test_args = (
    name    => "Test-iri",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
my $the_test = HTTPTest->new(@test_args);

# The script's exit status is whatever run() returns.
exit $the_test->run();
# vim: et ts=4 sw=4

View File

@ -17,9 +17,16 @@ my @tests = (
'Test-E-k-K.px',
'Test-E-k.px',
'Test-ftp.px',
'Test-ftp-iri.px',
'Test-ftp-iri-fallback.px',
'Test-ftp-iri-disabled.px',
'Test-HTTP-Content-Disposition-1.px',
'Test-HTTP-Content-Disposition-2.px',
'Test-HTTP-Content-Disposition.px',
'Test-iri.px',
'Test-iri-disabled.px',
'Test-iri-forced-remote.px',
'Test-iri-list.px',
'Test-N-current.px',
'Test-N-smaller.px',
'Test-N-no-info.px',