1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

Automated merge.

This commit is contained in:
Micah Cowan 2008-08-28 01:21:59 -07:00
commit 090f1596ae
40 changed files with 1936 additions and 104 deletions

View File

@ -9,6 +9,14 @@
* AUTHORS: Added Steven Schubiger. * AUTHORS: Added Steven Schubiger.
2008-06-26 Xavier Saint <wget@sxav.eu>
* configure.ac : IRIs support required libiconv, check it.
2008-06-14 Xavier Saint <wget@sxav.eu>
* configure.ac: Add support for IRIs
2008-05-29 Micah Cowan <micah@cowan.name> 2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set). * po/*.po: Updated from TP (the 1.11.3 set).

View File

@ -460,6 +460,77 @@ else
fi fi
AC_SUBST(COMMENT_IF_NO_POD2MAN) AC_SUBST(COMMENT_IF_NO_POD2MAN)
dnl
dnl Check for IDN/IRIs
dnl
dnl Overview of the shell variables used below:
dnl   iri        yes or no - whether IRI support is still a candidate
dnl   force_iri  yes only when the user passed an explicit --enable-iri
dnl   libidn     optional installation prefix given with --with-libidn
dnl
AC_ARG_ENABLE(iri,
AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
[case "${enable_iri}" in
no)
dnl Disable IRIs checking
AC_MSG_NOTICE([disabling IRIs at user request])
iri=no
;;
yes)
dnl IRIs explicitly enabled
iri=yes
force_iri=yes
;;
auto)
dnl Auto-detect IRI
iri=yes
;;
*)
AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
;;
esac
], [
dnl If nothing is specified, assume auto-detection
iri=yes
]
)
dnl Optional prefix under which the libidn headers and library live.
AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
[Support IDN/IRIs (needs GNU Libidn)]),
libidn=$withval, libidn="")
dnl Step 1: iconv is a prerequisite. A missing iconv is fatal only when
dnl IRI support was explicitly requested - otherwise IRI is switched off.
if test "X$iri" != "Xno"; then
AM_ICONV
if test "X$am_cv_func_iconv" != "Xyes"; then
iri=no
if test "X$force_iri" = "Xyes"; then
AC_MSG_ERROR([Libiconv is required for IRIs support])
else
AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
fi
fi
fi
dnl Step 2: look for libidn - header idna.h plus stringprep_check_version
dnl in -lidn - adding the --with-libidn prefix to the search paths first.
if test "X$iri" != "Xno"; then
if test "$libidn" != ""; then
LDFLAGS="${LDFLAGS} -L$libidn/lib"
CPPFLAGS="${CPPFLAGS} -I$libidn/include"
fi
AC_CHECK_HEADER(idna.h,
AC_CHECK_LIB(idn, stringprep_check_version,
[iri=yes LIBS="${LIBS} -lidn"], iri=no),
iri=no)
dnl NOTE - review: force_iri is not consulted in this branch. A missing
dnl libidn only produces a warning even after an explicit --enable-iri -
dnl unlike the libiconv check above. Confirm this is intended.
if test "X$iri" != "Xno" ; then
AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
AC_MSG_NOTICE([Enabling support for IRI.])
else
AC_MSG_WARN([Libidn not found])
fi
fi
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
dnl dnl
dnl Create output dnl Create output
dnl dnl

View File

@ -1,3 +1,12 @@
2008-08-03 Xavier Saint <wget@sxav.eu>
* wget.texi : Add option descriptions for the three new
options --iri, --locale and --remote-encoding related to
IRI support.
* sample.wgetrc : Add commented lines for the three new
command iri, locale and encoding related to IRI support.
2008-08-03 Micah Cowan <micah@cowan.name> 2008-08-03 Micah Cowan <micah@cowan.name>
* wget.texi: Don't set UPDATED; already set by version.texi. * wget.texi: Don't set UPDATED; already set by version.texi.

View File

@ -113,3 +113,12 @@ waitretry = 10
# To try ipv6 addresses first: # To try ipv6 addresses first:
#prefer-family = IPv6 #prefer-family = IPv6
# Set default IRI support state
#iri = off
# Force the default system encoding
#locale = UTF-8
# Force the default remote server encoding
#remoteencoding = UTF-8

View File

@ -674,6 +674,30 @@ Another instance where you'll get a garbled file if you try to use
Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http} Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
servers that support the @code{Range} header. servers that support the @code{Range} header.
@cindex iri support
@cindex idn support
@item --iri
Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
turn it off. IRI support is activated by default.
You can set the default state of IRI support using the @code{iri} command in
@file{.wgetrc}. That setting may be overridden from the command line.
@cindex local encoding
@cindex locale
@item --locale=@var{encoding}
Force Wget to use @var{encoding} as the default system encoding. That affects
how Wget converts URLs specified as arguments from locale to @sc{utf-8} for
IRI support.
Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
environment variable to get the locale. If it fails, @sc{ascii} is used.
You can set the default locale using the @code{locale} command in
@file{.wgetrc}. That setting may be overridden from the command line.
@cindex progress indicator @cindex progress indicator
@cindex dot style @cindex dot style
@item --progress=@var{type} @item --progress=@var{type}
@ -705,6 +729,21 @@ command line. The exception is that, when the output is not a TTY, the
``dot'' progress will be favored over ``bar''. To force the bar output, ``dot'' progress will be favored over ``bar''. To force the bar output,
use @samp{--progress=bar:force}. use @samp{--progress=bar:force}.
@cindex remote encoding
@item --remote-encoding=@var{encoding}
Force Wget to use @var{encoding} as the default remote server encoding. That
affects how Wget converts URIs found in files from remote encoding to
@sc{utf-8} during a recursive fetch. This option is only useful for
IRI support, for the interpretation of non-@sc{ascii} characters.
For HTTP, remote encoding can be found in HTTP @code{Content-Type}
header and in HTML @code{Content-Type http-equiv} meta tag.
You can set the default encoding using the @code{remoteencoding}
command in @file{.wgetrc}. That setting may be overridden from the
command line.
@item -N @item -N
@itemx --timestamping @itemx --timestamping
Turn on time-stamping. @xref{Time-Stamping}, for details. Turn on time-stamping. @xref{Time-Stamping}, for details.

View File

@ -32,11 +32,27 @@
* init.c (cleanup): Free the memory associated with the base * init.c (cleanup): Free the memory associated with the base
option (when DEBUG_MALLOC is defined). option (when DEBUG_MALLOC is defined).
2008-07-02 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function idn_decode() to decode ASCII
encoded hostname to the locale.
* host.c : Show hostname to be resolved both in locale and
ASCII encoded.
2008-06-28 Steven Schubiger <stsc@members.fsf.org> 2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from * retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP). an external file (HTTP/FTP).
2008-06-26 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New functions locale_to_utf8() and
idn_encode() adding basic capabilities of IRI/IDN.
* url.c : Convert URLs from locale to UTF-8 allowing a basic
support of IRI/IDN
2008-06-25 Steven Schubiger <stsc@members.fsf.org> 2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic * ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
@ -69,12 +85,57 @@
string vars pointers-to-const, and moved line lengths string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c). below 80 (in Makefile.am, not in version.c).
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function check_encoding_name() as
a preliminary encoding name check.
* main.c, iri.c : Make use of check_encoding_name().
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c : Include missing stringprep.h file and add a
cast.
* init.c : set a default initial value for opt.enable_iri,
opt.locale and opt.encoding_remote.
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : Add a new function find_locale() to find
out the local system encoding.
* main.c : Make use of find_locale().
2008-06-19 Xavier Saint <wget@sxav.eu>
* html-url.c : Add "content-type" meta tag parsing for
retrieving page encoding.
* iri.h : Make no-op version of parse_charset() return
NULL.
2008-06-16 Micah Cowan <micah@cowan.name> 2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the * http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we successfully completed content's length, but it's because we
_set_ it that way, don't abort. _set_ it that way, don't abort.
2008-06-14 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New files.
* Makefile.am : Add files iri.h and conditional iri.c.
* build_info.c : Add compiled feature "iri".
* http.c : include iri.h and parse charset from Content-Type
header.
* init.c, main.c, options.h : if an option isn't supported
at compile time, don't get rid of it and show a dummy
message instead if it is used.
2008-06-13 Micah Cowan <micah@cowan.name> 2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL * build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL

View File

@ -30,6 +30,10 @@
# Version: @VERSION@ # Version: @VERSION@
# #
if IRI_IS_ENABLED
IRI_OBJ = iri.c
endif
# The following line is losing on some versions of make! # The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@ LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
@ -40,7 +44,7 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \ recur.c res.c retr.c snprintf.c spider.c url.c \
utils.c \ utils.c $(IRI_OBJ) \
css-url.h connect.h convert.h cookies.h \ css-url.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \

View File

@ -100,6 +100,13 @@ const char* (compiled_features[]) =
#else #else
"-gettext", "-gettext",
#endif #endif
#ifdef ENABLE_IRI
"+iri",
#else
"-iri",
#endif
/* sentinel value */ /* sentinel value */
NULL NULL
}; };

View File

@ -266,9 +266,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
if (print) if (print)
{ {
const char *txt_addr = print_address (ip); const char *txt_addr = print_address (ip);
if (print && 0 != strcmp (print, txt_addr)) if (0 != strcmp (print, txt_addr))
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
{
int len = strlen (print) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, print);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "), logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
escnonprint_uri (print), txt_addr, port); str ? str : escnonprint_uri (print), txt_addr, port);
if (str)
xfree (str);
}
else else
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
} }

View File

@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
/* Parse the file... */ /* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) : urls = is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, NULL); get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if /* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the the file is not followed, we might still want to convert the

View File

@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
return FTPRERR; return FTPRERR;
/* Strip trailing CRLF before printing the line, so that /* Strip trailing CRLF before printing the line, so that
escnonprint doesn't include bogus \012 and \015. */ quotting doesn't include bogus \012 and \015. */
p = strchr (line, '\0'); p = strchr (line, '\0');
if (p > line && p[-1] == '\n') if (p > line && p[-1] == '\n')
*--p = '\0'; *--p = '\0';

View File

@ -712,8 +712,24 @@ lookup_host (const char *host, int flags)
/* No luck with the cache; resolve HOST. */ /* No luck with the cache; resolve HOST. */
if (!silent && !numeric_address) if (!silent && !numeric_address)
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
{
int len = strlen (host) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, host);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Resolving %s... "), logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, host)); quotearg_style (escape_quoting_style, str ? str : host));
if (str)
xfree (str);
}
#ifdef ENABLE_IPV6 #ifdef ENABLE_IPV6
{ {

View File

@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags; static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes; static struct hash_table *interesting_attributes;
/* Will contain the (last) charset found in 'http-equiv=content-type'
meta tags */
static char *meta_charset;
static void static void
init_interesting (void) init_interesting (void)
{ {
@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL; return NULL;
} }
url = url_parse (link_uri, NULL); url = url_parse (link_uri, NULL, NULL);
if (!url) if (!url)
{ {
DEBUGP (("%s: link \"%s\" doesn't parse.\n", DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri)); ctx->document_file, base, link_uri, complete_uri));
url = url_parse (complete_uri, NULL); url = url_parse (complete_uri, NULL, NULL);
if (!url) if (!url)
{ {
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
entry->link_expect_html = 1; entry->link_expect_html = 1;
} }
} }
else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
{
/* Handle stuff like:
<meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
char *mcharset;
char *content = find_attr (tag, "content", NULL);
if (!content)
return;
mcharset = parse_charset (content);
if (!mcharset)
return;
xfree_null (meta_charset);
meta_charset = mcharset;
}
else if (name && 0 == strcasecmp (name, "robots")) else if (name && 0 == strcasecmp (name, "robots"))
{ {
/* Handle stuff like: /* Handle stuff like:
@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
<base href=...> and does the right thing. */ <base href=...> and does the right thing. */
struct urlpos * struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
struct iri *iri)
{ {
struct file_memory *fm; struct file_memory *fm;
struct map_context ctx; struct map_context ctx;
@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes); NULL, interesting_attributes);
/* If meta charset isn't null, override content encoding */
if (iri && meta_charset)
set_content_encoding (iri, meta_charset);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow) if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow; *meta_disallow_follow = ctx.nofollow;
@ -726,7 +752,7 @@ get_urls_file (const char *file)
url_text = merged; url_text = merged;
} }
url = url_parse (url_text, &up_error_code); url = url_parse (url_text, &up_error_code, NULL);
if (!url) if (!url)
{ {
char *error = url_error (url_text, up_error_code); char *error = url_error (url_text, up_error_code);

View File

@ -44,7 +44,7 @@ struct map_context {
}; };
struct urlpos *get_urls_file (const char *); struct urlpos *get_urls_file (const char *);
struct urlpos *get_urls_html (const char *, const char *, bool *); struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *); struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *); void free_urlpos (struct urlpos *);

View File

@ -1364,7 +1364,8 @@ free_hstat (struct http_stat *hs)
If PROXY is non-NULL, the connection will be made to the proxy If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */ server, and u->url will be requested. */
static uerr_t static uerr_t
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
struct iri *iri)
{ {
struct request *req; struct request *req;
@ -2048,9 +2049,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
char *tmp = strchr (type, ';'); char *tmp = strchr (type, ';');
if (tmp) if (tmp)
{ {
/* sXXXav: only needed if IRI support is enabled */
char *tmp2 = tmp + 1;
while (tmp > type && c_isspace (tmp[-1])) while (tmp > type && c_isspace (tmp[-1]))
--tmp; --tmp;
*tmp = '\0'; *tmp = '\0';
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
{
tmp = parse_charset (tmp2);
if (tmp)
set_content_encoding (iri, tmp);
}
} }
} }
hs->newloc = resp_header_strdup (resp, "Location"); hs->newloc = resp_header_strdup (resp, "Location");
@ -2325,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
retried, and retried, and retried, and... */ retried, and retried, and retried, and... */
uerr_t uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer, http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
int *dt, struct url *proxy) int *dt, struct url *proxy, struct iri *iri)
{ {
int count; int count;
bool got_head = false; /* used for time-stamping and filename detection */ bool got_head = false; /* used for time-stamping and filename detection */
@ -2489,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n"));
*dt &= ~SEND_NOCACHE; *dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */ /* Try fetching the document, or at least its head. */
err = gethttp (u, &hstat, dt, proxy); err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */ /* Time? */
tms = datetime_str (time (NULL)); tms = datetime_str (time (NULL));
@ -2567,8 +2579,10 @@ Spider mode enabled. Check if remote file exists.\n"));
continue; continue;
} }
/* Maybe we should always keep track of broken links, not just in /* Maybe we should always keep track of broken links, not just in
* spider mode. */ * spider mode.
else if (opt.spider) * Don't log error if it was UTF-8 encoded because we will try
* once unencoded. */
else if (opt.spider && !iri->utf8_encode)
{ {
/* #### Again: ugly ugly ugly! */ /* #### Again: ugly ugly ugly! */
if (!hurl) if (!hurl)

View File

@ -33,7 +33,7 @@ as that of the covered work. */
struct url; struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *, uerr_t http_loop (struct url *, char **, char **, const char *, int *,
struct url *); struct url *, struct iri *);
void save_cookies (void); void save_cookies (void);
void http_cleanup (void); void http_cleanup (void);
time_t http_atotm (const char *); time_t http_atotm (const char *);

View File

@ -182,9 +182,11 @@ static const struct {
{ "inet6only", &opt.ipv6_only, cmd_boolean }, { "inet6only", &opt.ipv6_only, cmd_boolean },
#endif #endif
{ "input", &opt.input_filename, cmd_file }, { "input", &opt.input_filename, cmd_file },
{ "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean }, { "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes }, { "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file }, { "loadcookies", &opt.cookies_input, cmd_file },
{ "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file }, { "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/ { "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number }, { "maxredirect", &opt.max_redirect, cmd_number },
@ -224,6 +226,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string }, { "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector }, { "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean }, { "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean }, { "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
@ -331,6 +334,14 @@ defaults (void)
opt.restrict_files_case = restrict_no_case_restriction; opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20; opt.max_redirect = 20;
#ifdef ENABLE_IRI
opt.enable_iri = true;
#else
opt.enable_iri = false;
#endif
opt.locale = NULL;
opt.encoding_remote = NULL;
} }
/* Return the user's home directory (strdup-ed), or NULL if none is /* Return the user's home directory (strdup-ed), or NULL if none is

348
src/iri.c Normal file
View File

@ -0,0 +1,348 @@
/* IRI related functions.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <iconv.h>
#include <stringprep.h>
#include <idna.h>
#include <errno.h>
#include "utils.h"
/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
/* Note: locale encoding is kept in options struct (opt.locale) */
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
/* Given a string containing "charset=XXX", return a newly allocated copy
   of the encoding name XXX, or NULL if no usable name is found.

   STR is typically the value of an HTTP Content-Type header or of the
   "content" attribute of an HTML <meta http-equiv="Content-Type"> tag.
   The caller owns the returned string and must release it with xfree.

   The value ends at the first whitespace, ';' or quote character, and
   an opening quote is skipped, so both `charset=utf-8; foo=bar' and
   `charset="utf-8"' yield "utf-8".  An empty value is rejected.  */
char *
parse_charset (char *str)
{
  char *start, *end, *charset;

  if (!str || !*str)
    return NULL;

  start = strcasestr (str, "charset=");
  if (!start)
    return NULL;

  start += 8;                   /* Length of "charset=".  */

  /* The parameter value may be written as a quoted string.  */
  if (*start == '"' || *start == '\'')
    start++;

  end = start;
  while (*end && !c_isspace (*end)
         && *end != ';' && *end != '"' && *end != '\'')
    end++;

  /* "charset=" with nothing behind it is not an encoding name.  */
  if (end == start)
    return NULL;

  charset = strdupdelim (start, end);

  /* Do a minimum check on the charset value */
  if (!check_encoding_name (charset))
    {
      xfree (charset);
      return NULL;
    }

  return charset;
}
/* Return the name of the character set of the current locale, as
   reported by libidn's stringprep_locale_charset ().  The const
   qualifier of the library's result is cast away.  */
char *
find_locale (void)
{
  const char *charset = stringprep_locale_charset ();

  return (char *) charset;
}
/* Minimal sanity check of an encoding name: every character must be
   ASCII and must not be whitespace.  Return true if ENCODING passes;
   otherwise log a verbose message and return false.  An empty string
   passes.  */
bool
check_encoding_name (char *encoding)
{
  char *p;

  for (p = encoding; *p != '\0'; p++)
    {
      if (c_isascii (*p) && !c_isspace (*p))
        continue;

      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
      return false;
    }

  return true;
}
/* Try opening an iconv_t descriptor for conversion from locale to UTF-8.

   This is an unimplemented placeholder: locale_to_utf8 opens its own
   descriptor.  It reports failure explicitly so that the function no
   longer falls off its end without returning a value.  */
static bool
open_locale_to_utf8 (void)
{
  return false;
}
/* Try converting string STR from the locale encoding to UTF-8.

   Return a newly allocated string on success; the caller owns it.
   Return STR itself when no conversion is needed (the locale is
   unknown or already UTF-8), when the conversion is not supported by
   iconv, or when it fails.  Callers can tell the cases apart by
   comparing the result with STR.

   If opt.locale is unset it is initialised from find_locale ().  */
const char *
locale_to_utf8 (const char *str)
{
  iconv_t l2u;
  char *new;
  bool converted;

  /* That shouldn't happen, just in case */
  if (!opt.locale)
    {
      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
      opt.locale = find_locale ();
    }

  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
    return str;

  /* iconv_open returns (iconv_t) -1 when the conversion is unavailable;
     only then is the "isn't supported" diagnostic appropriate.  */
  l2u = iconv_open ("UTF-8", opt.locale);
  if (l2u == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
                 quote (opt.locale), quote ("UTF-8"));
      return str;
    }

  converted = do_conversion (l2u, (char *) str, strlen ((char *) str), &new);

  /* The descriptor is only needed for this one call.  */
  iconv_close (l2u);

  return converted ? (const char *) new : str;
}
/* Convert the INLEN bytes at IN using the iconv conversion descriptor CD.

   On success return true and store in *OUT a newly allocated,
   NUL-terminated string holding the converted text; the caller owns it.
   On failure return false, release everything that was allocated and
   set *OUT to NULL.

   Input bytes that iconv rejects (EILSEQ) or reports as an incomplete
   sequence (EINVAL) are copied to the output unchanged, one byte at a
   time, and the conversion resumes behind them; a single verbose
   message is logged for the first such byte.

   The output buffer starts at twice the input length and is enlarged
   whenever iconv reports E2BIG.  The number of bytes written so far is
   recomputed from what iconv reports after every call, so the space
   offered to iconv never exceeds what is really allocated.  */
static bool
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
{
  size_t size = inlen * 2;      /* Usable bytes; one more is kept for NUL.  */
  size_t used = 0;              /* Bytes of BUF already filled.  */
  int invalid = 0;
  char *buf = xmalloc (size + 1);

  for (;;)
    {
      char *dst = buf + used;
      size_t room = size - used;
      size_t rc = iconv (cd, &in, &inlen, &dst, &room);
      int err = errno;          /* Save before any call can clobber it.  */

      used = size - room;

      if (rc != (size_t)(-1))
        {
          buf[used] = '\0';
          *out = buf;
          return true;
        }

      /* Only a full output buffer, or a bad sequence with input left to
         skip, can be recovered from.  */
      if (err != E2BIG
          && !((err == EINVAL || err == EILSEQ) && inlen > 0))
        {
          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
          break;
        }

      /* Enlarge the buffer when iconv ran out of space, or when there is
         no room left for the byte that is about to be passed through.  */
      if (err == E2BIG || used == size)
        {
          size_t extra = inlen * 2;
          char *bigger;

          if (extra < 16)
            extra = 16;         /* Always make visible progress.  */
          size += extra;
          bigger = xmalloc (size + 1);
          memcpy (bigger, buf, used);
          xfree (buf);
          buf = bigger;
        }

      /* Incomplete or invalid multibyte sequence: pass one byte through.  */
      if (err != E2BIG)
        {
          if (!invalid)
            logprintf (LOG_VERBOSE,
                       "Incomplete or invalide multibyte sequence encountered\n");
          invalid++;
          buf[used++] = *in++;
          inlen--;
        }
    }

  xfree (buf);
  *out = NULL;
  return false;
}
/* Try to "ASCII encode" (IDNA ToASCII) the host name HOST.

   If I->utf8_encode is false, HOST is first transcoded from the remote
   encoding to UTF-8 with remote_to_utf8; when that yields nothing (no
   encoding known, nothing to convert, or an error) NULL is returned.

   Return the newly allocated ASCII host name on success, or NULL on
   error.  The temporary UTF-8 copy of HOST, if one was made, is always
   released before returning.  */
char *
idn_encode (struct iri *i, char *host)
{
  char *utf8_host = NULL;
  char *ascii_host = NULL;
  int ret;

  /* Encode to UTF-8 if not done */
  if (!i->utf8_encode)
    {
      if (!remote_to_utf8 (i, (const char *) host, (const char **) &utf8_host))
        return NULL;  /* Nothing to encode or an error occured */
      host = utf8_host;
    }

  /* toASCII UTF-8 NULL terminated string */
  ret = idna_to_ascii_8z (host, &ascii_host, IDNA_FLAGS);

  /* The result is a separate allocation, so the intermediate UTF-8
     string is no longer needed whatever the outcome.  */
  xfree_null (utf8_host);

  if (ret != IDNA_SUCCESS)
    {
      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
                 quote (idna_strerror (ret)));
      return NULL;
    }

  return ascii_host;
}
/* Decode the "ASCII encoded" host name HOST with libidn's
   idna_to_unicode_8zlz.  Return the newly allocated decoded name on
   success; on error log a verbose message and return NULL.  */
char *
idn_decode (char *host)
{
  char *decoded;
  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);

  if (rc == IDNA_SUCCESS)
    return decoded;

  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", rc,
             quote (idna_strerror (rc)));
  return NULL;
}
/* Try to transcode string STR from the remote encoding recorded in
   I->uri_encoding to UTF-8.

   Return true and store the newly allocated result in *NEW only when
   the conversion succeeded and actually changed the string; the caller
   then owns *NEW.  Return false, leaving *NEW untouched, when no remote
   encoding is known, when iconv does not support the conversion, when
   the conversion fails, or when the converted text equals STR.  */
bool
remote_to_utf8 (struct iri *i, const char *str, const char **new)
{
  iconv_t cd;
  bool converted;
  char *result = NULL;

  if (!i->uri_encoding)
    return false;

  cd = iconv_open ("UTF-8", i->uri_encoding);
  if (cd == (iconv_t)(-1))
    return false;

  converted = do_conversion (cd, (char *) str, strlen ((char *) str), &result);

  iconv_close (cd);

  /* The output of a failed conversion is unspecified: do not look at it.  */
  if (!converted)
    return false;

  /* Test if something was converted */
  if (!strcmp (str, result))
    {
      xfree (result);
      return false;
    }

  *new = result;
  return true;
}
/* Create a heap-allocated iri structure initialised from the global
   options: uri_encoding is a private copy of opt.encoding_remote (NULL
   if that is unset), content_encoding starts out NULL and utf8_encode
   mirrors opt.enable_iri.  */
struct iri *
iri_new (void)
{
  struct iri *fresh = xmalloc (sizeof *fresh);

  if (opt.encoding_remote)
    fresh->uri_encoding = xstrdup (opt.encoding_remote);
  else
    fresh->uri_encoding = NULL;

  fresh->content_encoding = NULL;
  fresh->utf8_encode = opt.enable_iri;

  return fresh;
}
/* Completely free an iri structure: release both encoding strings
   (through xfree_null, since either may be NULL) and then the structure
   itself.  I must not be used afterwards.  */
void
iri_free (struct iri *i)
{
xfree_null (i->uri_encoding);
xfree_null (i->content_encoding);
xfree (i);
}
/* Record CHARSET (copied; NULL clears it) as the URI encoding of I.
   When the user supplied a remote encoding (opt.encoding_remote) the
   call does nothing unless FORCE is true.  Setting the value already
   stored, compared case-insensitively, is a no-op as well.  */
void
set_uri_encoding (struct iri *i, char *charset, bool force)
{
  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));

  /* A user-specified remote encoding wins unless the caller insists.  */
  if (opt.encoding_remote && !force)
    return;

  if (i->uri_encoding != NULL)
    {
      bool unchanged = (charset != NULL
                        && strcasecmp (i->uri_encoding, charset) == 0);

      if (unchanged)
        return;

      xfree (i->uri_encoding);
    }

  if (charset)
    i->uri_encoding = xstrdup (charset);
  else
    i->uri_encoding = NULL;
}
/* Record CHARSET (copied; NULL clears it) as the content encoding of I.
   Nothing is changed when the user supplied a remote encoding
   (opt.encoding_remote), or when CHARSET equals the stored value,
   compared case-insensitively.  */
void
set_content_encoding (struct iri *i, char *charset)
{
  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));

  /* A user-specified remote encoding always takes precedence.  */
  if (opt.encoding_remote)
    return;

  if (i->content_encoding != NULL)
    {
      bool unchanged = (charset != NULL
                        && strcasecmp (i->content_encoding, charset) == 0);

      if (unchanged)
        return;

      xfree (i->content_encoding);
    }

  if (charset)
    i->content_encoding = xstrdup (charset);
  else
    i->content_encoding = NULL;
}

70
src/iri.h Normal file
View File

@ -0,0 +1,70 @@
/* Internationalization related declarations.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#ifndef IRI_H
#define IRI_H
/* Per-retrieval IRI state.  Both strings are owned by the structure:
   iri_new, set_uri_encoding and set_content_encoding store private
   copies and iri_free releases them.  Either may be NULL.  */
struct iri {
char *uri_encoding; /* Encoding of the uri to fetch */
char *content_encoding; /* Encoding of links inside the fetched file */
bool utf8_encode; /* Will/Is the current url encoded in utf8 */
};
/* With ENABLE_IRI defined the functions below are implemented in iri.c.
   Without it every entry point is replaced by a no-op macro, so callers
   need no #ifdef of their own.  */
#ifdef ENABLE_IRI
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
char *idn_encode (struct iri *i, char *host);
char *idn_decode (char *host);
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
struct iri *iri_new (void);
void iri_free (struct iri *i);
void set_uri_encoding (struct iri *i, char *charset, bool force);
void set_content_encoding (struct iri *i, char *charset);
#else /* ENABLE_IRI */
/* Shared placeholder handed out by the no-op iri_new () below.
   NOTE(review): this defines an object in every translation unit that
   includes this header; confirm the toolchains in use accept the
   duplicate tentative definitions, or declare it extern here and
   define it once in a source file.  */
struct iri dummy_iri;
/* No-op replacements: lookups yield NULL or false, locale_to_utf8
   returns its argument, and the setters and iri_free expand to
   nothing.  */
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
#define locale_to_utf8(str) (str)
#define idn_encode(a,b) NULL
#define idn_decode(str) NULL
#define remote_to_utf8(a,b,c) false
#define iri_new() (&dummy_iri)
#define iri_free(a)
#define set_uri_encoding(a,b,c)
#define set_content_encoding(a,b)
#endif /* ENABLE_IRI */
#endif /* IRI_H */

View File

@ -43,7 +43,7 @@ as that of the covered work. */
#include "utils.h" #include "utils.h"
#include "log.h" #include "log.h"
/* This file impplement support for "logging". Logging means printing /* This file implement support for "logging". Logging means printing
output, plus several additional features: output, plus several additional features:
- Cataloguing output by importance. You can specify that a log - Cataloguing output by importance. You can specify that a log

View File

@ -201,10 +201,12 @@ static struct cmdline_option option_data[] =
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 }, { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif #endif
{ "input-file", 'i', OPT_VALUE, "input", -1 }, { "input-file", 'i', OPT_VALUE, "input", -1 },
{ "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 }, { "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 }, { "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 }, { "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 }, { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 }, { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 }, { "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument }, { "no", 'n', OPT__NO, NULL, required_argument },
@ -238,6 +240,7 @@ static struct cmdline_option option_data[] =
{ "referer", 0, OPT_VALUE, "referer", -1 }, { "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@ -1062,6 +1065,27 @@ for details.\n\n"));
exit (1); exit (1);
} }
#ifdef ENABLE_IRI
if (opt.enable_iri)
{
if (opt.locale && !check_encoding_name (opt.locale))
opt.locale = NULL;
if (!opt.locale)
opt.locale = find_locale ();
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
opt.encoding_remote = NULL;
}
#else
if (opt.enable_iri || opt.locale || opt.encoding_remote)
{
/* sXXXav : be more specific... */
printf(_("This version does not have support for IRIs\n"));
exit(1);
}
#endif
if (opt.ask_passwd) if (opt.ask_passwd)
{ {
opt.passwd = prompt_for_password (); opt.passwd = prompt_for_password ();
@ -1174,12 +1198,18 @@ WARNING: Can't reopen standard output in binary mode;\n\
if (url_scheme (*t) == SCHEME_FTP) if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1; opt.follow_ftp = 1;
status = retrieve_tree (*t); status = retrieve_tree (*t, NULL);
opt.follow_ftp = old_follow_ftp; opt.follow_ftp = old_follow_ftp;
} }
else else
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); {
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale, true);
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
opt.recursive, i);
iri_free (i);
}
if (opt.delete_after && file_exists_p(filename)) if (opt.delete_after && file_exists_p(filename))
{ {

View File

@ -239,6 +239,10 @@ struct options
bool content_disposition; /* Honor HTTP Content-Disposition header. */ bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */ waiting for a challenge. */
bool enable_iri;
char *encoding_remote;
char *locale;
}; };
extern struct options opt; extern struct options opt;

View File

@ -51,7 +51,7 @@ as that of the covered work. */
#include "html-url.h" #include "html-url.h"
#include "css-url.h" #include "css-url.h"
#include "spider.h" #include "spider.h"
/* Functions for maintaining the URL queue. */ /* Functions for maintaining the URL queue. */
struct queue_element { struct queue_element {
@ -60,6 +60,7 @@ struct queue_element {
int depth; /* the depth */ int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to bool html_allowed; /* whether the document is allowed to
be treated as HTML. */ be treated as HTML. */
struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to bool css_allowed; /* whether the document is allowed to
be treated as CSS. */ be treated as CSS. */
struct queue_element *next; /* next element in queue */ struct queue_element *next; /* next element in queue */
@ -93,11 +94,12 @@ url_queue_delete (struct url_queue *queue)
into it. */ into it. */
static void static void
url_enqueue (struct url_queue *queue, url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth, const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed) bool html_allowed, bool css_allowed)
{ {
struct queue_element *qel = xnew (struct queue_element); struct queue_element *qel = xnew (struct queue_element);
qel->iri = i;
qel->url = url; qel->url = url;
qel->referer = referer; qel->referer = referer;
qel->depth = depth; qel->depth = depth;
@ -112,6 +114,10 @@ url_enqueue (struct url_queue *queue,
DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
if (i)
DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
if (queue->tail) if (queue->tail)
queue->tail->next = qel; queue->tail->next = qel;
queue->tail = qel; queue->tail = qel;
@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue,
succeeded, or false if the queue is empty. */ succeeded, or false if the queue is empty. */
static bool static bool
url_dequeue (struct url_queue *queue, url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth, const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed) bool *html_allowed, bool *css_allowed)
{ {
@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue,
if (!queue->head) if (!queue->head)
queue->tail = NULL; queue->tail = NULL;
*i = qel->iri;
*url = qel->url; *url = qel->url;
*referer = qel->referer; *referer = qel->referer;
*depth = qel->depth; *depth = qel->depth;
@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue,
} }
static bool download_child_p (const struct urlpos *, struct url *, int, static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *); struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int, static bool descend_redirect_p (const char *, const char *, int,
struct url *, struct hash_table *); struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to /* Retrieve a part of the web beginning with START_URL. This used to
@ -180,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int,
options, add it to the queue. */ options, add it to the queue. */
uerr_t uerr_t
retrieve_tree (const char *start_url) retrieve_tree (const char *start_url, struct iri *pi)
{ {
uerr_t status = RETROK; uerr_t status = RETROK;
@ -192,8 +199,22 @@ retrieve_tree (const char *start_url)
struct hash_table *blacklist; struct hash_table *blacklist;
int up_error_code; int up_error_code;
struct url *start_url_parsed = url_parse (start_url, &up_error_code); struct url *start_url_parsed;
struct iri *i = iri_new ();
#define COPYSTR(x) (x) ? xstrdup(x) : NULL;
/* Duplicate pi struct if not NULL */
if (pi)
{
i->uri_encoding = COPYSTR (pi->uri_encoding);
i->content_encoding = COPYSTR (pi->content_encoding);
i->utf8_encode = pi->utf8_encode;
}
else
set_uri_encoding (i, opt.locale, true);
#undef COPYSTR
start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed) if (!start_url_parsed)
{ {
char *error = url_error (start_url, up_error_code); char *error = url_error (start_url, up_error_code);
@ -207,7 +228,8 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than /* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */ just URL so we enqueue the canonical form of the URL. */
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
false);
string_set_add (blacklist, start_url_parsed->url); string_set_add (blacklist, start_url_parsed->url);
while (1) while (1)
@ -226,7 +248,7 @@ retrieve_tree (const char *start_url)
/* Get the next URL from the queue... */ /* Get the next URL from the queue... */
if (!url_dequeue (queue, if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer, (const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed)) &depth, &html_allowed, &css_allowed))
break; break;
@ -267,7 +289,8 @@ retrieve_tree (const char *start_url)
int dt = 0; int dt = 0;
char *redirected = NULL; char *redirected = NULL;
status = retrieve_url (url, &file, &redirected, referer, &dt, false); status = retrieve_url (url, &file, &redirected, referer, &dt,
false, i);
if (html_allowed && file && status == RETROK if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML)) && (dt & RETROKF) && (dt & TEXTHTML))
@ -295,7 +318,7 @@ retrieve_tree (const char *start_url)
if (descend) if (descend)
{ {
if (!descend_redirect_p (redirected, url, depth, if (!descend_redirect_p (redirected, url, depth,
start_url_parsed, blacklist)) start_url_parsed, blacklist, i))
descend = false; descend = false;
else else
/* Make sure that the old pre-redirect form gets /* Make sure that the old pre-redirect form gets
@ -347,7 +370,7 @@ retrieve_tree (const char *start_url)
bool meta_disallow_follow = false; bool meta_disallow_follow = false;
struct urlpos *children struct urlpos *children
= is_css ? get_urls_css_file (file, url) : = is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, &meta_disallow_follow); get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow) if (opt.use_robots && meta_disallow_follow)
{ {
@ -358,7 +381,8 @@ retrieve_tree (const char *start_url)
if (children) if (children)
{ {
struct urlpos *child = children; struct urlpos *child = children;
struct url *url_parsed = url_parsed = url_parse (url, NULL); struct url *url_parsed = url_parse (url, NULL, i);
struct iri *ci;
char *referer_url = url; char *referer_url = url;
bool strip_auth = (url_parsed != NULL bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL); && url_parsed->user != NULL);
@ -375,9 +399,11 @@ retrieve_tree (const char *start_url)
if (dash_p_leaf_HTML && !child->link_inline_p) if (dash_p_leaf_HTML && !child->link_inline_p)
continue; continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed, if (download_child_p (child, url_parsed, depth, start_url_parsed,
blacklist)) blacklist, i))
{ {
url_enqueue (queue, xstrdup (child->url->url), ci = iri_new ();
set_uri_encoding (ci, i->content_encoding, false);
url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1, xstrdup (referer_url), depth + 1,
child->link_expect_html, child->link_expect_html,
child->link_expect_css); child->link_expect_css);
@ -422,6 +448,7 @@ retrieve_tree (const char *start_url)
xfree (url); xfree (url);
xfree_null (referer); xfree_null (referer);
xfree_null (file); xfree_null (file);
iri_free (i);
} }
/* If anything is left of the queue due to a premature exit, free it /* If anything is left of the queue due to a premature exit, free it
@ -430,9 +457,11 @@ retrieve_tree (const char *start_url)
char *d1, *d2; char *d1, *d2;
int d3; int d3;
bool d4, d5; bool d4, d5;
while (url_dequeue (queue, struct iri *d6;
while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{ {
iri_free (d6);
xfree (d1); xfree (d1);
xfree_null (d2); xfree_null (d2);
} }
@ -461,7 +490,8 @@ retrieve_tree (const char *start_url)
static bool static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth, download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist) struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{ {
struct url *u = upos->url; struct url *u = upos->url;
const char *url = u->url; const char *url = u->url;
@ -602,7 +632,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!specs) if (!specs)
{ {
char *rfile; char *rfile;
if (res_retrieve_file (url, &rfile)) if (res_retrieve_file (url, &rfile, iri))
{ {
specs = res_parse_from_file (rfile); specs = res_parse_from_file (rfile);
@ -657,23 +687,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
static bool static bool
descend_redirect_p (const char *redirected, const char *original, int depth, descend_redirect_p (const char *redirected, const char *original, int depth,
struct url *start_url_parsed, struct hash_table *blacklist) struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{ {
struct url *orig_parsed, *new_parsed; struct url *orig_parsed, *new_parsed;
struct urlpos *upos; struct urlpos *upos;
bool success; bool success;
orig_parsed = url_parse (original, NULL); orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL); assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL); new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL); assert (new_parsed != NULL);
upos = xnew0 (struct urlpos); upos = xnew0 (struct urlpos);
upos->url = new_parsed; upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth, success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist); start_url_parsed, blacklist, iri);
url_free (orig_parsed); url_free (orig_parsed);
url_free (new_parsed); url_free (new_parsed);

View File

@ -42,6 +42,6 @@ as that of the covered work. */
struct urlpos; struct urlpos;
void recursive_cleanup (void); void recursive_cleanup (void);
uerr_t retrieve_tree (const char *); uerr_t retrieve_tree (const char *, struct iri *);
#endif /* RECUR_H */ #endif /* RECUR_H */

View File

@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
Return true if robots were retrieved OK, false otherwise. */ Return true if robots were retrieved OK, false otherwise. */
bool bool
res_retrieve_file (const char *url, char **file) res_retrieve_file (const char *url, char **file, struct iri *iri)
{ {
struct iri *i = iri_new ();
uerr_t err; uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION); char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping; int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider; int saved_sp_val = opt.spider;
/* Copy server URI encoding for a possible IDNA transformation, no need to
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
set_uri_encoding (i, iri->uri_encoding, false);
i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL; *file = NULL;
opt.timestamping = false; opt.timestamping = false;
opt.spider = false; opt.spider = false;
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
opt.timestamping = saved_ts_val; opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val; opt.spider = saved_sp_val;
xfree (robots_url); xfree (robots_url);
iri_free (i);
if (err != RETROK && *file != NULL) if (err != RETROK && *file != NULL)
{ {

View File

@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
void res_register_specs (const char *, int, struct robot_specs *); void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int); struct robot_specs *res_get_specs (const char *, int);
bool res_retrieve_file (const char *, char **); bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *); bool is_robots_txt_url (const char *);

View File

@ -597,7 +597,7 @@ static char *getproxy (struct url *);
uerr_t uerr_t
retrieve_url (const char *origurl, char **file, char **newloc, retrieve_url (const char *origurl, char **file, char **newloc,
const char *refurl, int *dt, bool recursive) const char *refurl, int *dt, bool recursive, struct iri *iri)
{ {
uerr_t result; uerr_t result;
char *url; char *url;
@ -625,7 +625,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file) if (file)
*file = NULL; *file = NULL;
u = url_parse (url, &up_error_code); second_try:
u = url_parse (url, &up_error_code, iri);
if (!u) if (!u)
{ {
char *error = url_error (url, up_error_code); char *error = url_error (url, up_error_code);
@ -635,6 +636,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
return URLERROR; return URLERROR;
} }
DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
iri->utf8_encode));
if (!refurl) if (!refurl)
refurl = opt.referer; refurl = opt.referer;
@ -648,8 +653,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
proxy = getproxy (u); proxy = getproxy (u);
if (proxy) if (proxy)
{ {
struct iri *pi = iri_new ();
set_uri_encoding (pi, opt.locale, true);
pi->utf8_encode = false;
/* Parse the proxy URL. */ /* Parse the proxy URL. */
proxy_url = url_parse (proxy, &up_error_code); proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url) if (!proxy_url)
{ {
char *error = url_error (proxy, up_error_code); char *error = url_error (proxy, up_error_code);
@ -676,7 +685,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
#endif #endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP)) || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{ {
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
} }
else if (u->scheme == SCHEME_FTP) else if (u->scheme == SCHEME_FTP)
{ {
@ -726,8 +735,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
xfree (mynewloc); xfree (mynewloc);
mynewloc = construced_newloc; mynewloc = construced_newloc;
/* Reset UTF-8 encoding state, keep the URI encoding and reset
the content encoding. */
iri->utf8_encode = opt.enable_iri;
set_content_encoding (iri, NULL);
/* Now, see if this new location makes sense. */ /* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code); newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed) if (!newloc_parsed)
{ {
char *error = url_error (mynewloc, up_error_code); char *error = url_error (mynewloc, up_error_code);
@ -776,8 +790,21 @@ retrieve_url (const char *origurl, char **file, char **newloc,
goto redirected; goto redirected;
} }
if (local_file) /* Try to not encode in UTF-8 if fetching failed */
if (!(*dt & RETROKF) && iri->utf8_encode)
{ {
iri->utf8_encode = false;
DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
goto second_try;
}
if (local_file && *dt & RETROKF)
{
register_download (u->url, local_file);
if (redirection_count && 0 != strcmp (origurl, u->url))
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
if (*dt & RETROKF) if (*dt & RETROKF)
{ {
register_download (u->url, local_file); register_download (u->url, local_file);
@ -827,6 +854,7 @@ retrieve_from_file (const char *file, bool html, int *count)
{ {
uerr_t status; uerr_t status;
struct urlpos *url_list, *cur_url; struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file = NULL; char *input_file = NULL;
const char *url = file; const char *url = file;
@ -834,6 +862,10 @@ retrieve_from_file (const char *file, bool html, int *count)
status = RETROK; /* Suppose everything is OK. */ status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */ *count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_uri_encoding (iri, opt.locale, true);
set_content_encoding (iri, opt.locale);
if (url_has_scheme (url)) if (url_has_scheme (url))
{ {
int dt; int dt;
@ -842,17 +874,21 @@ retrieve_from_file (const char *file, bool html, int *count)
if (!opt.base_href) if (!opt.base_href)
opt.base_href = xstrdup (url); opt.base_href = xstrdup (url);
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false); status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
if (status != RETROK) if (status != RETROK)
return status; return status;
if (dt & TEXTHTML) if (dt & TEXTHTML)
html = true; html = true;
/* If we have a found a content encoding, use it */
if (iri->content_encoding)
set_uri_encoding (iri, iri->content_encoding, false);
} }
else else
input_file = (char *) file; input_file = (char *) file;
url_list = (html ? get_urls_html (input_file, NULL, NULL) url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file)); : get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@ -868,6 +904,10 @@ retrieve_from_file (const char *file, bool html, int *count)
status = QUOTEXC; status = QUOTEXC;
break; break;
} }
/* Reset UTF-8 encode status */
iri->utf8_encode = opt.enable_iri;
if ((opt.recursive || opt.page_requisites) if ((opt.recursive || opt.page_requisites)
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
{ {
@ -877,12 +917,13 @@ retrieve_from_file (const char *file, bool html, int *count)
if (cur_url->url->scheme == SCHEME_FTP) if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1; opt.follow_ftp = 1;
status = retrieve_tree (cur_url->url->url); status = retrieve_tree (cur_url->url->url, iri);
opt.follow_ftp = old_follow_ftp; opt.follow_ftp = old_follow_ftp;
} }
else else
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
&dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename)) if (filename && opt.delete_after && file_exists_p (filename))
{ {
@ -901,6 +942,8 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
/* Free the linked list of URL-s. */ /* Free the linked list of URL-s. */
free_urlpos (url_list); free_urlpos (url_list);
iri_free (iri);
return status; return status;
} }
@ -1053,7 +1096,11 @@ bool
url_uses_proxy (const char *url) url_uses_proxy (const char *url)
{ {
bool ret; bool ret;
struct url *u = url_parse (url, NULL); struct url *u;
struct iri *i = iri_new();
/* url was given in the command line, so use locale as encoding */
set_uri_encoding (i, opt.locale, true);
u= url_parse (url, NULL, i);
if (!u) if (!u)
return false; return false;
ret = getproxy (u) != NULL; ret = getproxy (u) != NULL;

View File

@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int); char *fd_read_line (int);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool); uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *); uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double); const char *retr_rate (wgint, double);

View File

@ -640,7 +640,7 @@ static const char *parse_errors[] = {
error, and if ERROR is not NULL, also set *ERROR to the appropriate error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */ error code. */
struct url * struct url *
url_parse (const char *url, int *error) url_parse (const char *url, int *error, struct iri *iri)
{ {
struct url *u; struct url *u;
const char *p; const char *p;
@ -659,7 +659,7 @@ url_parse (const char *url, int *error)
int port; int port;
char *user = NULL, *passwd = NULL; char *user = NULL, *passwd = NULL;
char *url_encoded = NULL; char *url_encoded = NULL, *new_url = NULL;
int error_code; int error_code;
@ -670,9 +670,20 @@ url_parse (const char *url, int *error)
goto error; goto error;
} }
url_encoded = reencode_escapes (url); if (iri && iri->utf8_encode)
{
url_unescape ((char *) url);
iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
if (!iri->utf8_encode)
new_url = NULL;
}
url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded; p = url_encoded;
if (new_url && url_encoded != new_url)
xfree (new_url);
p += strlen (supported_schemes[scheme].leading_string); p += strlen (supported_schemes[scheme].leading_string);
uname_b = p; uname_b = p;
p = url_skip_credentials (p); p = url_skip_credentials (p);
@ -842,6 +853,18 @@ url_parse (const char *url, int *error)
{ {
url_unescape (u->host); url_unescape (u->host);
host_modified = true; host_modified = true;
/* Apply IDNA regardless of iri->utf8_encode status */
if (opt.enable_iri && iri)
{
char *new = idn_encode (iri, u->host);
if (new)
{
xfree (u->host);
u->host = new;
host_modified = true;
}
}
} }
if (params_b) if (params_b)
@ -851,7 +874,7 @@ url_parse (const char *url, int *error)
if (fragment_b) if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e); u->fragment = strdupdelim (fragment_b, fragment_e);
if (path_modified || u->fragment || host_modified || path_b == path_e) if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{ {
/* If we suspect that a transformation has rendered what /* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild url_string might return different from URL_ENCODED, rebuild

View File

@ -84,7 +84,7 @@ struct url
char *url_escape (const char *); char *url_escape (const char *);
struct url *url_parse (const char *, int *); struct url *url_parse (const char *, int *, struct iri *iri);
char *url_error (const char *, int); char *url_error (const char *, int);
char *url_full_path (const struct url *); char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *); void url_set_dir (struct url *, const char *);

View File

@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
#include "quote.h" #include "quote.h"
#include "quotearg.h" #include "quotearg.h"
/* Likewise for struct iri definition */
#include "iri.h"
/* Useful macros used across the code: */ /* Useful macros used across the code: */
/* The number of elements in an array. For example: /* The number of elements in an array. For example:

View File

@ -1,3 +1,30 @@
2008-08-14 Xavier Saint <wget@sxav.eu>
* Test-iri-list.px : Fetch files from a remote list.
2008-08-03 Xavier Saint <wget@sxav.eu>
* Test-iri.px : HTTP recursive fetch for testing IRI support and
fallback.
* Test-iri-disabled.px : Same file structure as Test-iri.px but with
IRI support disabled
* Test-iri-forced-remote.px : There's a difference between ISO-8859-1
and ISO-8859-15 for character 0xA4 (respectively currency sign and
euro sign). So with a forced ISO-8859-1 remote encoding, wget should
see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead
of using the ISO-8859-15 given by the server.
* Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale
and expect wget to fetch the file UTF-8 encoded.
* Test-ftp-iri-fallback.px : Same as above but wget should fallback on
locale encoding to fetch the file.
* Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support
disabled. The UTF-8 encoded file should not be retrieved.
2008-06-22 Micah Cowan <micah@cowan.name> 2008-06-22 Micah Cowan <micah@cowan.name>
* Test-proxied-https-auth.px: Shift exit code so it falls in the * Test-proxied-https-auth.px: Shift exit code so it falls in the

50
tests/Test-ftp-iri-disabled.px Executable file
View File

@ -0,0 +1,50 @@
#!/usr/bin/perl -w

use strict;
use warnings;

use FTPTest;

# FTP retrieval with IRI support switched off (--iri=no).
#
# The server offers the same document under two names: one spelled with a
# UTF-8 encoded c-cedilla and one spelled with the ISO-8859-1 byte.  wget is
# asked for the ISO-8859-1 spelling and is expected to save exactly that
# file; the UTF-8 named file must not be the one downloaded.

###############################################################################

my $ccedilla_l1 = "\xE7";       # c-cedilla as a single ISO-8859-1 byte
my $ccedilla_u8 = "\xC3\xA7";   # c-cedilla as a UTF-8 byte sequence

my $francais = <<EOF;
Some text.
EOF

# Turn the newline into CRLF (the heredoc holds a single line).
$francais =~ s/\n/\r\n/;

# Files offered by the test FTP server, keyed by path.
my %urls = (
    "/fran${ccedilla_u8}ais.txt" => {
        content => $francais,
    },
    "/fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

# NOTE(review): {{port}} is presumably substituted by the test harness with
# the port of the test server -- confirm in FTPTest/WgetTest.
my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";

my $expected_error_code = 0;

my %expected_downloaded_files = (
    "fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

###############################################################################

# The name identifies this test; it used to be "Test-ftp-iri", copied from
# the sibling test, so the two were indistinguishable by name.
my $the_test = FTPTest->new (name => "Test-ftp-iri-disabled",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

46
tests/Test-ftp-iri-fallback.px Executable file
View File

@ -0,0 +1,46 @@
#!/usr/bin/perl -w

use strict;
use warnings;

use FTPTest;

# FTP retrieval where only the locale-encoded name exists on the server.
#
# The server offers the document solely under its ISO-8859-1 spelling.  wget
# is run with --locale=iso-8859-1 and asked for that spelling; the download
# is expected to succeed and be saved under the ISO-8859-1 name.

###############################################################################

my $ccedilla_l1 = "\xE7";       # c-cedilla as a single ISO-8859-1 byte

my $francais = <<EOF;
Some text.
EOF

# Turn the newline into CRLF (the heredoc holds a single line).
$francais =~ s/\n/\r\n/;

# Files offered by the test FTP server, keyed by path.
my %urls = (
    "/fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

# NOTE(review): {{port}} is presumably substituted by the test harness with
# the port of the test server -- confirm in FTPTest/WgetTest.
my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";

my $expected_error_code = 0;

my %expected_downloaded_files = (
    "fran${ccedilla_l1}ais.txt" => {
        content => $francais,
    },
);

###############################################################################

# The name identifies this test; it used to be "Test-ftp-iri", copied from
# the sibling test, so the two were indistinguishable by name.
my $the_test = FTPTest->new (name => "Test-ftp-iri-fallback",
                             input => \%urls,
                             cmdline => $cmdline,
                             errcode => $expected_error_code,
                             output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

47
tests/Test-ftp-iri.px Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/perl -w

use strict;

use FTPTest;

# FTP retrieval with a locale-encoded request.
#
# The server offers the document only under a UTF-8 encoded name.  wget is
# run with --locale=iso-8859-1 and given the ISO-8859-1 spelling of that
# name; the file saved locally is expected to carry the UTF-8 spelling.

###############################################################################

# c-cedilla in the two encodings involved.
my $cedilla_latin1 = "\xE7";
my $cedilla_utf8   = "\xC3\xA7";

# The same file name spelled in each encoding.
my $name_latin1 = "fran${cedilla_latin1}ais.txt";
my $name_utf8   = "fran${cedilla_utf8}ais.txt";

my $body = <<EOF;
Some text.
EOF

# Turn the newline into CRLF (the heredoc holds a single line).
$body =~ s/\n/\r\n/;

# Files offered by the test FTP server, keyed by path.
my %served = (
    "/$name_utf8" => { content => $body },
);

# Files wget is expected to leave behind, keyed by local name.
my %wanted = (
    $name_utf8 => { content => $body },
);

# NOTE(review): {{port}} is presumably substituted by the test harness with
# the port of the test server -- confirm in FTPTest/WgetTest.
my $wget_command = join ' ',
    $WgetTest::WGETPATH,
    '--locale=iso-8859-1',
    '-S',
    "ftp://localhost:{{port}}/$name_latin1";

my $exit_status_wanted = 0;

###############################################################################

my $ftp_test = FTPTest->new (
    name    => "Test-ftp-iri",
    input   => \%served,
    cmdline => $wget_command,
    errcode => $exit_status_wanted,
    output  => \%wanted,
);

exit $ftp_test->run();

# vim: et ts=4 sw=4

196
tests/Test-iri-disabled.px Executable file
View File

@ -0,0 +1,196 @@
#!/usr/bin/perl -w
use strict;

use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
# Test-iri-disabled: recursive download (-r) with IRI handling switched off
# (--iri=no).
#
# The pages link to one another with raw 8-bit bytes in the URLs.  Every
# linked page is published under two percent-escaped paths: one escapes the
# raw bytes found in the link, the other escapes the UTF-8 encoding of the
# characters they denote.  Both paths answer "200 Ok" with the same body, and
# the expected local file names carry the raw bytes exactly as they appear in
# the links.
#
# Charsets announced by the server (Content-type header):
#   index.html                  ISO-8859-15
#   p1 (both spellings)         UTF-8   (the page's meta tag says ISO-8859-1)
#   p2_%E9%E9n.html             ISO-8859-1
#   p2_%C3%A9%C3%A9n.html       UTF-8
#   p3 (both spellings)         text/plain, no charset
###############################################################################

# Raw single-byte characters used in links and expected file names.
my $ccedilla_l15 = "\xE7";
my $eacute_l1    = "\xE9";
my $eurosign_l15 = "\xA4";

my $pageindex = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
</p>
</body>
</html>
EOF

my $pagefrancais = <<EOF;
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
EOF

my $pageeen = <<EOF;
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
</p>
</body>
</html>
EOF

my $pageeuro = <<EOF;
<html>
<head>
<title>Euro page</title>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
EOF

# Build the server-side record for a page answered with "200 Ok".
# Arguments: Content-type header value, response body.
my $ok_response = sub {
    my ($content_type, $body) = @_;
    return {
        code    => "200",
        msg     => "Ok",
        headers => {
            "Content-type" => $content_type,
        },
        content => $body,
    };
};

# Pages offered by the HTTP server: path => { code, msg, headers, content }.
# The UTF-8 spelling of p1 used to pair code 200 with the reason phrase
# "File not found"; it serves the real page, so it now says "Ok" like the rest.
my %urls = (
    '/index.html'
        => $ok_response->("text/html; charset=ISO-8859-15", $pageindex),
    '/robots.txt'
        => $ok_response->("text/plain", ""),
    '/p1_fran%C3%A7ais.html'                    # UTF-8 encoded
        => $ok_response->("text/html; charset=UTF-8", $pagefrancais),
    '/p1_fran%E7ais.html'
        => $ok_response->("text/html; charset=UTF-8", $pagefrancais),
    '/p2_%C3%A9%C3%A9n.html'                    # UTF-8 encoded
        => $ok_response->("text/html; charset=UTF-8", $pageeen),
    '/p2_%E9%E9n.html'
        => $ok_response->("text/html; charset=ISO-8859-1", $pageeen),
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html'      # UTF-8 encoded
        => $ok_response->("text/plain", $pageeuro),
    '/p3_%A4%A4%A4.html'
        => $ok_response->("text/plain", $pageeuro),
);

my $cmdline = $WgetTest::WGETPATH . " --iri=no -nH -r http://localhost:{{port}}/";

my $expected_error_code = 0;

# Files that must exist locally after the run: name => { content }.
my %expected_downloaded_files = (
    'index.html' => {
        content => $pageindex,
    },
    'robots.txt' => {
        content => "",
    },
    "p1_fran${ccedilla_l15}ais.html" => {
        content => $pagefrancais,
    },
    "p2_${eacute_l1}${eacute_l1}n.html" => {
        content => $pageeen,
    },
    "p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => {
        content => $pageeuro,
    },
);

###############################################################################

my $the_test = HTTPTest->new (name => "Test-iri-disabled",
                              input => \%urls,
                              cmdline => $cmdline,
                              errcode => $expected_error_code,
                              output => \%expected_downloaded_files);
exit $the_test->run();

# vim: et ts=4 sw=4

207
tests/Test-iri-forced-remote.px Executable file
View File

@ -0,0 +1,207 @@
#!/usr/bin/perl -w
use strict;
use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
# Test-iri-forced-remote: recursive download with --iri and the remote
# encoding forced to ISO-8859-1 on the command line.
#
# Charsets announced by the server (Content-type header):
#   index.html                  ISO-8859-15
#   p1_fran%E7ais.html          UTF-8   (the page's meta tag says ISO-8859-1)
#   p1_fran%C3%A7ais.html       UTF-8, answered with 404
#   p2_%E9%E9n.html             ISO-8859-1
#   p2_%C3%A9%C3%A9n.html       UTF-8
#   p3 (all three spellings)    text/plain, no charset
###############################################################################

# Characters used in links and file names, in single-byte and UTF-8 form.
my $latin_ccedilla  = "\xE7";
my $utf8_ccedilla   = "\xC3\xA7";
my $latin_eacute    = "\xE9";
my $utf8_eacute     = "\xC3\xA9";
my $latin9_euro     = "\xA4";
my $utf8_euro       = "\xE2\x82\xAC";
my $latin1_currency = "\xA4";
my $utf8_currency   = "\xC2\xA4";

# --- page bodies ------------------------------------------------------------

my $index_html = <<"HTML";
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${latin_ccedilla}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${latin9_euro}${latin9_euro}${latin9_euro}.html">My tailor is rich</a>.
</p>
</body>
</html>
HTML

my $french_html = <<"HTML";
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${latin_eacute}${latin_eacute}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
HTML

my $dutch_html = <<"HTML";
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
</p>
</body>
</html>
HTML

my $euro_html = <<"HTML";
<html>
<head>
<title>Euro page</title>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
HTML

my $not_found_html = <<"HTML";
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
HTML

# --- server table -----------------------------------------------------------

# Build one server record: status code, reason phrase, Content-type, body.
my $serve = sub {
    my ($status, $reason, $content_type, $body) = @_;
    return {
        code    => $status,
        msg     => $reason,
        headers => { "Content-type" => $content_type },
        content => $body,
    };
};

# Shorthand for a "200 Ok" record: Content-type, body.
my $ok = sub { return $serve->("200", "Ok", @_) };

# Pages offered by the HTTP server, keyed by percent-escaped path.
my %urls = (
    '/index.html' => $ok->("text/html; charset=ISO-8859-15", $index_html),
    '/robots.txt' => $ok->("text/plain", ""),

    # UTF-8 encoded
    '/p1_fran%C3%A7ais.html'
        => $serve->("404", "File not found", "text/html; charset=UTF-8", $not_found_html),
    '/p1_fran%E7ais.html'
        => $ok->("text/html; charset=UTF-8", $french_html),

    # UTF-8 encoded
    '/p2_%C3%A9%C3%A9n.html' => $ok->("text/html; charset=UTF-8", $dutch_html),
    '/p2_%E9%E9n.html'       => $ok->("text/html; charset=ISO-8859-1", $dutch_html),

    # UTF-8 encoded
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => $ok->("text/plain", $euro_html),
    '/p3_%A4%A4%A4.html'                   => $ok->("text/plain", $euro_html),
    # UTF-8 encoded
    '/p3_%C2%A4%C2%A4%C2%A4.html'          => $ok->("text/plain", $euro_html),
);

# --- invocation and expectations --------------------------------------------

my $cmdline = join q{ },
    $WgetTest::WGETPATH,
    '--iri',
    '--remote-encoding=iso-8859-1',
    '-nH',
    '-r',
    'http://localhost:{{port}}/';

my $expected_error_code = 0;

# Files that must exist locally after the run: name => { content }.
my %expected_downloaded_files = (
    'index.html' => { content => $index_html },
    'robots.txt' => { content => "" },
    "p1_fran${latin_ccedilla}ais.html"       => { content => $french_html },
    'p2_' . ($utf8_eacute x 2) . 'n.html'    => { content => $dutch_html },
    'p3_' . ($utf8_currency x 3) . '.html'   => { content => $euro_html },
);

###############################################################################

my $the_test = HTTPTest->new(
    name    => "Test-iri-forced-remote",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
exit $the_test->run();

# vim: et ts=4 sw=4

173
tests/Test-iri-list.px Executable file
View File

@ -0,0 +1,173 @@
#!/usr/bin/perl -w
use strict;
use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
# Test-iri-list: wget reads its list of URLs from a remote file
# (-i http://.../url_list.txt) with --iri enabled.
#
# Charsets announced by the server (Content-type header):
#   url_list.txt                text/plain, ISO-8859-1
#   index.html                  ISO-8859-15
#   p1_fran%E7ais.html          UTF-8   (the page's meta tag says ISO-8859-1)
#   p1_fran%C3%A7ais.html       UTF-8, answered with 404
#   p2 (both spellings)         ISO-8859-1 (the page's meta tag says UTF-8)
###############################################################################

# Characters used in URLs and file names, in ISO-8859-1 and UTF-8 form.
my $latin1_ccedilla = "\xE7";
my $utf8_ccedilla   = "\xC3\xA7";
my $latin1_eacute   = "\xE9";
my $utf8_eacute     = "\xC3\xA9";

# --- served documents ---------------------------------------------------------

# The URL list itself; the non-ASCII names are written as raw ISO-8859-1 bytes.
my $url_list_txt = <<"LIST";
http://localhost:{{port}}/
http://localhost:{{port}}/p1_fran${latin1_ccedilla}ais.html
http://localhost:{{port}}/p2_${latin1_eacute}${latin1_eacute}n.html
LIST

my $index_html = <<"HTML";
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Main page.
</p>
</body>
</html>
HTML

my $french_html = <<"HTML";
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
French page.
</p>
</body>
</html>
HTML

my $dutch_html = <<"HTML";
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
Dutch page.
</p>
</body>
</html>
HTML

my $not_found_html = <<"HTML";
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
HTML

# --- server table -------------------------------------------------------------

# Build one server record: status code, reason phrase, Content-type, body.
my $serve = sub {
    my ($status, $reason, $content_type, $body) = @_;
    return {
        code    => $status,
        msg     => $reason,
        headers => { "Content-type" => $content_type },
        content => $body,
    };
};

# Shorthand for a "200 Ok" record: Content-type, body.
my $ok = sub { return $serve->("200", "Ok", @_) };

# Documents offered by the HTTP server, keyed by percent-escaped path.
my %urls = (
    '/index.html' => $ok->("text/html; charset=ISO-8859-15", $index_html),
    '/robots.txt' => $ok->("text/plain", ""),

    # UTF-8 encoded
    '/p1_fran%C3%A7ais.html'
        => $serve->("404", "File not found", "text/html; charset=UTF-8", $not_found_html),
    '/p1_fran%E7ais.html'
        => $ok->("text/html; charset=UTF-8", $french_html),

    # UTF-8 encoded
    '/p2_%C3%A9%C3%A9n.html' => $ok->("text/html; charset=ISO-8859-1", $dutch_html),
    '/p2_%E9%E9n.html'       => $ok->("text/html; charset=ISO-8859-1", $dutch_html),

    '/url_list.txt' => $ok->("text/plain; charset=ISO-8859-1", $url_list_txt),
);

# --- invocation and expectations ----------------------------------------------

my $cmdline = join q{ },
    $WgetTest::WGETPATH,
    '--iri',
    '-d',
    '-i',
    'http://localhost:{{port}}/url_list.txt';

my $expected_error_code = 0;

# Files that must exist locally after the run: name => { content }.
my %expected_downloaded_files = (
    'url_list.txt' => { content => $url_list_txt },
    'index.html'   => { content => $index_html },
    "p1_fran${latin1_ccedilla}ais.html"   => { content => $french_html },
    'p2_' . ($utf8_eacute x 2) . 'n.html' => { content => $dutch_html },
);

###############################################################################

my $the_test = HTTPTest->new(
    name    => "Test-iri-list",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
exit $the_test->run();

# vim: et ts=4 sw=4

224
tests/Test-iri.px Executable file
View File

@ -0,0 +1,224 @@
#!/usr/bin/perl -w
use strict;
use HTTPTest;

# cf. http://en.wikipedia.org/wiki/Latin1
#     http://en.wikipedia.org/wiki/ISO-8859-15

###############################################################################
# Test-iri: recursive download with --iri and
# --restrict-file-names=nocontrol.
#
# Charsets announced by the server (Content-type header) and, where present,
# by the page's own meta tag:
#   index.html                  header ISO-8859-15
#   p1_fran%E7ais.html          header UTF-8, meta ISO-8859-1
#   p1_fran%C3%A7ais.html       header UTF-8, answered with 404
#   p2 (both spellings)         header ISO-8859-1, meta UTF-8
#   p3 (both spellings)         header text/plain ISO-8859-1, meta UTF-8
#   p4_m%C3%A9%C3%A9r.html      header text/plain UTF-8
###############################################################################

# Characters used in links and file names, in single-byte and UTF-8 form.
my $latin_ccedilla = "\xE7";
my $utf8_ccedilla  = "\xC3\xA7";
my $latin_eacute   = "\xE9";
my $utf8_eacute    = "\xC3\xA9";
my $latin9_euro    = "\xA4";
my $utf8_euro      = "\xE2\x82\xAC";

# --- page bodies ------------------------------------------------------------

my $index_html = <<"HTML";
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${latin_ccedilla}ais.html">La seule page en fran&ccedil;ais</a>.
Link to page 3 <a href="http://localhost:{{port}}/p3_${latin9_euro}${latin9_euro}${latin9_euro}.html">My tailor is rich</a>.
</p>
</body>
</html>
HTML

my $french_html = <<"HTML";
<html>
<head>
<title>La seule page en français</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
</head>
<body>
<p>
Link to page 2 <a href="http://localhost:{{port}}/p2_${latin_eacute}${latin_eacute}n.html">Die enkele nerderlangstalige pagina</a>.
</p>
</body>
</html>
HTML

my $dutch_html = <<"HTML";
<html>
<head>
<title>Die enkele nederlandstalige pagina</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
&Eacute;&eacute;n is niet veel maar toch meer dan nul.<br/>
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
<a href="http://localhost:{{port}}/p4_m${utf8_eacute}${utf8_eacute}r.html">M&eacute&eacute;r</a>
</p>
</body>
</html>
HTML

my $euro_html = <<"HTML";
<html>
<head>
<title>Euro page</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
<p>
My tailor isn't rich anymore.
</p>
</body>
</html>
HTML

my $meer_html = <<"HTML";
<html>
<head>
<title>Bekende supermarkt</title>
</head>
<body>
<p>
Ik ben toch niet gek !
</p>
</body>
</html>
HTML

my $not_found_html = <<"HTML";
<html>
<head>
<title>404</title>
</head>
<body>
<p>
Nop nop nop...
</p>
</body>
</html>
HTML

# --- server table -----------------------------------------------------------

# Build one server record: status code, reason phrase, Content-type, body.
my $serve = sub {
    my ($status, $reason, $content_type, $body) = @_;
    return {
        code    => $status,
        msg     => $reason,
        headers => { "Content-type" => $content_type },
        content => $body,
    };
};

# Shorthand for a "200 Ok" record: Content-type, body.
my $ok = sub { return $serve->("200", "Ok", @_) };

# Pages offered by the HTTP server, keyed by percent-escaped path.
my %urls = (
    '/index.html' => $ok->("text/html; charset=ISO-8859-15", $index_html),
    '/robots.txt' => $ok->("text/plain", ""),

    # UTF-8 encoded
    '/p1_fran%C3%A7ais.html'
        => $serve->("404", "File not found", "text/html; charset=UTF-8", $not_found_html),
    '/p1_fran%E7ais.html'
        => $ok->("text/html; charset=UTF-8", $french_html),

    # UTF-8 encoded
    '/p2_%C3%A9%C3%A9n.html' => $ok->("text/html; charset=ISO-8859-1", $dutch_html),
    '/p2_%E9%E9n.html'       => $ok->("text/html; charset=ISO-8859-1", $dutch_html),

    # UTF-8 encoded
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html'
        => $ok->("text/plain; charset=ISO-8859-1", $euro_html),
    '/p3_%A4%A4%A4.html'
        => $ok->("text/plain; charset=ISO-8859-1", $euro_html),

    '/p4_m%C3%A9%C3%A9r.html' => $ok->("text/plain; charset=UTF-8", $meer_html),
);

# --- invocation and expectations --------------------------------------------

my $cmdline = join q{ },
    $WgetTest::WGETPATH,
    '--iri',
    '--restrict-file-names=nocontrol',
    '-nH',
    '-r',
    'http://localhost:{{port}}/';

my $expected_error_code = 0;

# Files that must exist locally after the run: name => { content }.
my %expected_downloaded_files = (
    'index.html' => { content => $index_html },
    'robots.txt' => { content => "" },
    "p1_fran${latin_ccedilla}ais.html"     => { content => $french_html },
    'p2_' . ($utf8_eacute x 2) . 'n.html'  => { content => $dutch_html },
    'p3_' . ($utf8_euro x 3) . '.html'     => { content => $euro_html },
    'p4_m' . ($utf8_eacute x 2) . 'r.html' => { content => $meer_html },
);

###############################################################################

my $the_test = HTTPTest->new(
    name    => "Test-iri",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);
exit $the_test->run();

# vim: et ts=4 sw=4

View File

@ -17,9 +17,16 @@ my @tests = (
'Test-E-k-K.px', 'Test-E-k-K.px',
'Test-E-k.px', 'Test-E-k.px',
'Test-ftp.px', 'Test-ftp.px',
'Test-ftp-iri.px',
'Test-ftp-iri-fallback.px',
'Test-ftp-iri-disabled.px',
'Test-HTTP-Content-Disposition-1.px', 'Test-HTTP-Content-Disposition-1.px',
'Test-HTTP-Content-Disposition-2.px', 'Test-HTTP-Content-Disposition-2.px',
'Test-HTTP-Content-Disposition.px', 'Test-HTTP-Content-Disposition.px',
'Test-iri.px',
'Test-iri-disabled.px',
'Test-iri-forced-remote.px',
'Test-iri-list.px',
'Test-N-current.px', 'Test-N-current.px',
'Test-N-smaller.px', 'Test-N-smaller.px',
'Test-N-no-info.px', 'Test-N-no-info.px',