mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Automated merge.
This commit is contained in:
commit
090f1596ae
@ -9,6 +9,14 @@
|
|||||||
|
|
||||||
* AUTHORS: Added Steven Schubiger.
|
* AUTHORS: Added Steven Schubiger.
|
||||||
|
|
||||||
|
2008-06-26 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* configure.ac : IRI support requires libiconv; check for it.
|
||||||
|
|
||||||
|
2008-06-14 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* configure.ac: Add support for IRIs
|
||||||
|
|
||||||
2008-05-29 Micah Cowan <micah@cowan.name>
|
2008-05-29 Micah Cowan <micah@cowan.name>
|
||||||
|
|
||||||
* po/*.po: Updated from TP (the 1.11.3 set).
|
* po/*.po: Updated from TP (the 1.11.3 set).
|
||||||
|
71
configure.ac
71
configure.ac
@ -460,6 +460,77 @@ else
|
|||||||
fi
|
fi
|
||||||
AC_SUBST(COMMENT_IF_NO_POD2MAN)
|
AC_SUBST(COMMENT_IF_NO_POD2MAN)
|
||||||
|
|
||||||
|
|
||||||
|
dnl
|
||||||
|
dnl Check for IDN/IRIs
|
||||||
|
dnl
|
||||||
|
|
||||||
|
AC_ARG_ENABLE(iri,
|
||||||
|
AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
|
||||||
|
[case "${enable_iri}" in
|
||||||
|
no)
|
||||||
|
dnl Disable IRIs checking
|
||||||
|
AC_MSG_NOTICE([disabling IRIs at user request])
|
||||||
|
iri=no
|
||||||
|
;;
|
||||||
|
yes)
|
||||||
|
dnl IRIs explicitly enabled
|
||||||
|
iri=yes
|
||||||
|
force_iri=yes
|
||||||
|
;;
|
||||||
|
auto)
|
||||||
|
dnl Auto-detect IRI
|
||||||
|
iri=yes
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
], [
|
||||||
|
dnl If nothing is specified, assume auto-detection
|
||||||
|
iri=yes
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
|
||||||
|
[Support IDN/IRIs (needs GNU Libidn)]),
|
||||||
|
libidn=$withval, libidn="")
|
||||||
|
if test "X$iri" != "Xno"; then
|
||||||
|
AM_ICONV
|
||||||
|
|
||||||
|
if test "X$am_cv_func_iconv" != "Xyes"; then
|
||||||
|
iri=no
|
||||||
|
if test "X$force_iri" = "Xyes"; then
|
||||||
|
AC_MSG_ERROR([Libiconv is required for IRIs support])
|
||||||
|
else
|
||||||
|
AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if test "X$iri" != "Xno"; then
|
||||||
|
if test "$libidn" != ""; then
|
||||||
|
LDFLAGS="${LDFLAGS} -L$libidn/lib"
|
||||||
|
CPPFLAGS="${CPPFLAGS} -I$libidn/include"
|
||||||
|
fi
|
||||||
|
AC_CHECK_HEADER(idna.h,
|
||||||
|
AC_CHECK_LIB(idn, stringprep_check_version,
|
||||||
|
[iri=yes LIBS="${LIBS} -lidn"], iri=no),
|
||||||
|
iri=no)
|
||||||
|
|
||||||
|
if test "X$iri" != "Xno" ; then
|
||||||
|
AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
|
||||||
|
AC_MSG_NOTICE([Enabling support for IRI.])
|
||||||
|
else
|
||||||
|
AC_MSG_WARN([Libidn not found])
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
dnl Needed by src/Makefile.am
|
||||||
|
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
||||||
|
|
||||||
|
|
||||||
dnl
|
dnl
|
||||||
dnl Create output
|
dnl Create output
|
||||||
dnl
|
dnl
|
||||||
|
@ -1,3 +1,12 @@
|
|||||||
|
2008-08-03 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* wget.texi : Add option descriptions for the three new
|
||||||
|
options --iri, --locale and --remote-encoding related to
|
||||||
|
IRI support.
|
||||||
|
|
||||||
|
* sample.wgetrc : Add commented lines for the three new
|
||||||
|
command iri, locale and encoding related to IRI support.
|
||||||
|
|
||||||
2008-08-03 Micah Cowan <micah@cowan.name>
|
2008-08-03 Micah Cowan <micah@cowan.name>
|
||||||
|
|
||||||
* wget.texi: Don't set UPDATED; already set by version.texi.
|
* wget.texi: Don't set UPDATED; already set by version.texi.
|
||||||
|
@ -113,3 +113,12 @@ waitretry = 10
|
|||||||
|
|
||||||
# To try ipv6 addresses first:
|
# To try ipv6 addresses first:
|
||||||
#prefer-family = IPv6
|
#prefer-family = IPv6
|
||||||
|
|
||||||
|
# Set default IRI support state
|
||||||
|
#iri = off
|
||||||
|
|
||||||
|
# Force the default system encoding
|
||||||
|
#locale = UTF-8
|
||||||
|
|
||||||
|
# Force the default remote server encoding
|
||||||
|
#remoteencoding = UTF-8
|
||||||
|
@ -674,6 +674,30 @@ Another instance where you'll get a garbled file if you try to use
|
|||||||
Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
|
Note that @samp{-c} only works with @sc{ftp} servers and with @sc{http}
|
||||||
servers that support the @code{Range} header.
|
servers that support the @code{Range} header.
|
||||||
|
|
||||||
|
@cindex iri support
|
||||||
|
@cindex idn support
|
||||||
|
@item --iri
|
||||||
|
|
||||||
|
Turn on internationalized URI (IRI) support. Use @samp{--iri=no} to
|
||||||
|
turn it off. IRI support is activated by default.
|
||||||
|
|
||||||
|
You can set the default state of IRI support using the @code{iri} command in
|
||||||
|
@file{.wgetrc}. That setting may be overridden from the command line.
|
||||||
|
|
||||||
|
@cindex local encoding
|
||||||
|
@cindex locale
|
||||||
|
@item --locale=@var{encoding}
|
||||||
|
|
||||||
|
Force Wget to use @var{encoding} as the default system encoding. That affects
|
||||||
|
how Wget converts URLs specified as arguments from locale to @sc{utf-8} for
|
||||||
|
IRI support.
|
||||||
|
|
||||||
|
Wget uses the function @code{nl_langinfo()} and then the @code{CHARSET}
|
||||||
|
environment variable to get the locale. If it fails, @sc{ascii} is used.
|
||||||
|
|
||||||
|
You can set the default locale using the @code{locale} command in
|
||||||
|
@file{.wgetrc}. That setting may be overridden from the command line.
|
||||||
|
|
||||||
@cindex progress indicator
|
@cindex progress indicator
|
||||||
@cindex dot style
|
@cindex dot style
|
||||||
@item --progress=@var{type}
|
@item --progress=@var{type}
|
||||||
@ -705,6 +729,21 @@ command line. The exception is that, when the output is not a TTY, the
|
|||||||
``dot'' progress will be favored over ``bar''. To force the bar output,
|
``dot'' progress will be favored over ``bar''. To force the bar output,
|
||||||
use @samp{--progress=bar:force}.
|
use @samp{--progress=bar:force}.
|
||||||
|
|
||||||
|
@cindex remote encoding
|
||||||
|
@item --remote-encoding=@var{encoding}
|
||||||
|
|
||||||
|
Force Wget to use @var{encoding} as the default remote server encoding. That
|
||||||
|
affects how Wget converts URIs found in files from remote encoding to
|
||||||
|
@sc{utf-8} during a recursive fetch. This option is only useful for
|
||||||
|
IRI support, for the interpretation of non-@sc{ascii} characters.
|
||||||
|
|
||||||
|
For HTTP, remote encoding can be found in HTTP @code{Content-Type}
|
||||||
|
header and in HTML @code{Content-Type http-equiv} meta tag.
|
||||||
|
|
||||||
|
You can set the default encoding using the @code{remoteencoding}
|
||||||
|
command in @file{.wgetrc}. That setting may be overridden from the
|
||||||
|
command line.
|
||||||
|
|
||||||
@item -N
|
@item -N
|
||||||
@itemx --timestamping
|
@itemx --timestamping
|
||||||
Turn on time-stamping. @xref{Time-Stamping}, for details.
|
Turn on time-stamping. @xref{Time-Stamping}, for details.
|
||||||
|
@ -32,11 +32,27 @@
|
|||||||
* init.c (cleanup): Free the memory associated with the base
|
* init.c (cleanup): Free the memory associated with the base
|
||||||
option (when DEBUG_MALLOC is defined).
|
option (when DEBUG_MALLOC is defined).
|
||||||
|
|
||||||
|
2008-07-02 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* iri.c, iri.h : New function idn_decode() to decode ASCII
|
||||||
|
encoded hostname to the locale.
|
||||||
|
|
||||||
|
* host.c : Show hostname to be resolved both in locale and
|
||||||
|
ASCII encoded.
|
||||||
|
|
||||||
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
|
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
|
||||||
|
|
||||||
* retr.c (retrieve_from_file): Allow for reading the links from
|
* retr.c (retrieve_from_file): Allow for reading the links from
|
||||||
an external file (HTTP/FTP).
|
an external file (HTTP/FTP).
|
||||||
|
|
||||||
|
2008-06-26 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* iri.c, iri.h : New functions locale_to_utf8() and
|
||||||
|
idn_encode() adding basic capabilities of IRI/IDN.
|
||||||
|
|
||||||
|
* url.c : Convert URLs from locale to UTF-8 allowing a basic
|
||||||
|
support of IRI/IDN
|
||||||
|
|
||||||
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
|
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
|
||||||
|
|
||||||
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
|
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
|
||||||
@ -69,12 +85,57 @@
|
|||||||
string vars pointers-to-const, and moved line lengths
|
string vars pointers-to-const, and moved line lengths
|
||||||
below 80 (in Makefile.am, not in version.c).
|
below 80 (in Makefile.am, not in version.c).
|
||||||
|
|
||||||
|
2008-06-19 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* iri.c, iri.h : New function check_encoding_name() as
|
||||||
|
a preliminary encoding name check.
|
||||||
|
|
||||||
|
* main.c, iri.c : Make use of check_encoding_name().
|
||||||
|
|
||||||
|
2008-06-19 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* iri.c : Include missing stringprep.h file and add a
|
||||||
|
cast.
|
||||||
|
|
||||||
|
* init.c : set a default initial value for opt.enable_iri,
|
||||||
|
opt.locale and opt.encoding_remote.
|
||||||
|
|
||||||
|
2008-06-19 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* iri.c, iri.h : Add a new function find_locale() to find
|
||||||
|
out the local system encoding.
|
||||||
|
|
||||||
|
* main.c : Make use of find_locale().
|
||||||
|
|
||||||
|
2008-06-19 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* html-url.c : Add "content-type" meta tag parsing for
|
||||||
|
retrieving page encoding.
|
||||||
|
|
||||||
|
* iri.h : Make no-op version of parse_charset() return
|
||||||
|
NULL.
|
||||||
|
|
||||||
2008-06-16 Micah Cowan <micah@cowan.name>
|
2008-06-16 Micah Cowan <micah@cowan.name>
|
||||||
|
|
||||||
* http.c (http_loop): When hstat.len is higher than the
|
* http.c (http_loop): When hstat.len is higher than the
|
||||||
successfully completed content's length, but it's because we
|
successfully completed content's length, but it's because we
|
||||||
_set_ it that way, don't abort.
|
_set_ it that way, don't abort.
|
||||||
|
|
||||||
|
2008-06-14 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* iri.c, iri.h : New files.
|
||||||
|
|
||||||
|
* Makefile.am : Add files iri.h and conditional iri.c.
|
||||||
|
|
||||||
|
* build_info.c : Add compiled feature "iri".
|
||||||
|
|
||||||
|
* http.c : include iri.h and parse charset from Content-Type
|
||||||
|
header.
|
||||||
|
|
||||||
|
* init.c, main.c, options.h : if an option isn't supported
|
||||||
|
at compile time, don't get rid of it and show a dummy
|
||||||
|
message instead if it is used.
|
||||||
|
|
||||||
2008-06-13 Micah Cowan <micah@cowan.name>
|
2008-06-13 Micah Cowan <micah@cowan.name>
|
||||||
|
|
||||||
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
|
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
|
||||||
|
@ -30,6 +30,10 @@
|
|||||||
# Version: @VERSION@
|
# Version: @VERSION@
|
||||||
#
|
#
|
||||||
|
|
||||||
|
if IRI_IS_ENABLED
|
||||||
|
IRI_OBJ = iri.c
|
||||||
|
endif
|
||||||
|
|
||||||
# The following line is losing on some versions of make!
|
# The following line is losing on some versions of make!
|
||||||
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
|
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
|
||||||
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
|
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
|
||||||
@ -40,7 +44,7 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \
|
|||||||
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
|
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
|
||||||
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
|
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
|
||||||
recur.c res.c retr.c snprintf.c spider.c url.c \
|
recur.c res.c retr.c snprintf.c spider.c url.c \
|
||||||
utils.c \
|
utils.c $(IRI_OBJ) \
|
||||||
css-url.h connect.h convert.h cookies.h \
|
css-url.h connect.h convert.h cookies.h \
|
||||||
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
|
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
|
||||||
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
|
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
|
||||||
|
@ -100,6 +100,13 @@ const char* (compiled_features[]) =
|
|||||||
#else
|
#else
|
||||||
"-gettext",
|
"-gettext",
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_IRI
|
||||||
|
"+iri",
|
||||||
|
#else
|
||||||
|
"-iri",
|
||||||
|
#endif
|
||||||
|
|
||||||
/* sentinel value */
|
/* sentinel value */
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
|
@ -266,9 +266,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
|
|||||||
if (print)
|
if (print)
|
||||||
{
|
{
|
||||||
const char *txt_addr = print_address (ip);
|
const char *txt_addr = print_address (ip);
|
||||||
if (print && 0 != strcmp (print, txt_addr))
|
if (0 != strcmp (print, txt_addr))
|
||||||
|
{
|
||||||
|
char *str = NULL, *name;
|
||||||
|
|
||||||
|
if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
|
||||||
|
{
|
||||||
|
int len = strlen (print) + strlen (name) + 4;
|
||||||
|
str = xmalloc (len);
|
||||||
|
snprintf (str, len, "%s (%s)", name, print);
|
||||||
|
str[len-1] = '\0';
|
||||||
|
xfree (name);
|
||||||
|
}
|
||||||
|
|
||||||
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
|
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
|
||||||
escnonprint_uri (print), txt_addr, port);
|
str ? str : escnonprint_uri (print), txt_addr, port);
|
||||||
|
|
||||||
|
if (str)
|
||||||
|
xfree (str);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
|
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
|
||||||
}
|
}
|
||||||
|
@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
|
|||||||
|
|
||||||
/* Parse the file... */
|
/* Parse the file... */
|
||||||
urls = is_css ? get_urls_css_file (file, url) :
|
urls = is_css ? get_urls_css_file (file, url) :
|
||||||
get_urls_html (file, url, NULL);
|
get_urls_html (file, url, NULL, NULL);
|
||||||
|
|
||||||
/* We don't respect meta_disallow_follow here because, even if
|
/* We don't respect meta_disallow_follow here because, even if
|
||||||
the file is not followed, we might still want to convert the
|
the file is not followed, we might still want to convert the
|
||||||
|
@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
|
|||||||
return FTPRERR;
|
return FTPRERR;
|
||||||
|
|
||||||
/* Strip trailing CRLF before printing the line, so that
|
/* Strip trailing CRLF before printing the line, so that
|
||||||
escnonprint doesn't include bogus \012 and \015. */
|
quoting doesn't include bogus \012 and \015. */
|
||||||
p = strchr (line, '\0');
|
p = strchr (line, '\0');
|
||||||
if (p > line && p[-1] == '\n')
|
if (p > line && p[-1] == '\n')
|
||||||
*--p = '\0';
|
*--p = '\0';
|
||||||
|
18
src/host.c
18
src/host.c
@ -712,8 +712,24 @@ lookup_host (const char *host, int flags)
|
|||||||
/* No luck with the cache; resolve HOST. */
|
/* No luck with the cache; resolve HOST. */
|
||||||
|
|
||||||
if (!silent && !numeric_address)
|
if (!silent && !numeric_address)
|
||||||
|
{
|
||||||
|
char *str = NULL, *name;
|
||||||
|
|
||||||
|
if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
|
||||||
|
{
|
||||||
|
int len = strlen (host) + strlen (name) + 4;
|
||||||
|
str = xmalloc (len);
|
||||||
|
snprintf (str, len, "%s (%s)", name, host);
|
||||||
|
str[len-1] = '\0';
|
||||||
|
xfree (name);
|
||||||
|
}
|
||||||
|
|
||||||
logprintf (LOG_VERBOSE, _("Resolving %s... "),
|
logprintf (LOG_VERBOSE, _("Resolving %s... "),
|
||||||
quotearg_style (escape_quoting_style, host));
|
quotearg_style (escape_quoting_style, str ? str : host));
|
||||||
|
|
||||||
|
if (str)
|
||||||
|
xfree (str);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_IPV6
|
#ifdef ENABLE_IPV6
|
||||||
{
|
{
|
||||||
|
@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
|
|||||||
static struct hash_table *interesting_tags;
|
static struct hash_table *interesting_tags;
|
||||||
static struct hash_table *interesting_attributes;
|
static struct hash_table *interesting_attributes;
|
||||||
|
|
||||||
|
/* Will contain the (last) charset found in 'http-equiv=content-type'
|
||||||
|
meta tags */
|
||||||
|
static char *meta_charset;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
init_interesting (void)
|
init_interesting (void)
|
||||||
{
|
{
|
||||||
@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
url = url_parse (link_uri, NULL);
|
url = url_parse (link_uri, NULL, NULL);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
||||||
@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
|
|||||||
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
||||||
ctx->document_file, base, link_uri, complete_uri));
|
ctx->document_file, base, link_uri, complete_uri));
|
||||||
|
|
||||||
url = url_parse (complete_uri, NULL);
|
url = url_parse (complete_uri, NULL, NULL);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
||||||
@ -553,6 +557,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
entry->link_expect_html = 1;
|
entry->link_expect_html = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
|
||||||
|
{
|
||||||
|
/* Handle stuff like:
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
|
||||||
|
|
||||||
|
char *mcharset;
|
||||||
|
char *content = find_attr (tag, "content", NULL);
|
||||||
|
if (!content)
|
||||||
|
return;
|
||||||
|
|
||||||
|
mcharset = parse_charset (content);
|
||||||
|
if (!mcharset)
|
||||||
|
return;
|
||||||
|
|
||||||
|
xfree_null (meta_charset);
|
||||||
|
meta_charset = mcharset;
|
||||||
|
}
|
||||||
else if (name && 0 == strcasecmp (name, "robots"))
|
else if (name && 0 == strcasecmp (name, "robots"))
|
||||||
{
|
{
|
||||||
/* Handle stuff like:
|
/* Handle stuff like:
|
||||||
@ -617,7 +638,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
|||||||
<base href=...> and does the right thing. */
|
<base href=...> and does the right thing. */
|
||||||
|
|
||||||
struct urlpos *
|
struct urlpos *
|
||||||
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
|
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
|
||||||
|
struct iri *iri)
|
||||||
{
|
{
|
||||||
struct file_memory *fm;
|
struct file_memory *fm;
|
||||||
struct map_context ctx;
|
struct map_context ctx;
|
||||||
@ -657,6 +679,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
|
|||||||
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
|
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
|
||||||
NULL, interesting_attributes);
|
NULL, interesting_attributes);
|
||||||
|
|
||||||
|
/* If meta charset isn't null, override content encoding */
|
||||||
|
if (iri && meta_charset)
|
||||||
|
set_content_encoding (iri, meta_charset);
|
||||||
|
|
||||||
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
|
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
|
||||||
if (meta_disallow_follow)
|
if (meta_disallow_follow)
|
||||||
*meta_disallow_follow = ctx.nofollow;
|
*meta_disallow_follow = ctx.nofollow;
|
||||||
@ -726,7 +752,7 @@ get_urls_file (const char *file)
|
|||||||
url_text = merged;
|
url_text = merged;
|
||||||
}
|
}
|
||||||
|
|
||||||
url = url_parse (url_text, &up_error_code);
|
url = url_parse (url_text, &up_error_code, NULL);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
char *error = url_error (url_text, up_error_code);
|
char *error = url_error (url_text, up_error_code);
|
||||||
|
@ -44,7 +44,7 @@ struct map_context {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct urlpos *get_urls_file (const char *);
|
struct urlpos *get_urls_file (const char *);
|
||||||
struct urlpos *get_urls_html (const char *, const char *, bool *);
|
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
|
||||||
struct urlpos *append_url (const char *, int, int, struct map_context *);
|
struct urlpos *append_url (const char *, int, int, struct map_context *);
|
||||||
void free_urlpos (struct urlpos *);
|
void free_urlpos (struct urlpos *);
|
||||||
|
|
||||||
|
24
src/http.c
24
src/http.c
@ -1364,7 +1364,8 @@ free_hstat (struct http_stat *hs)
|
|||||||
If PROXY is non-NULL, the connection will be made to the proxy
|
If PROXY is non-NULL, the connection will be made to the proxy
|
||||||
server, and u->url will be requested. */
|
server, and u->url will be requested. */
|
||||||
static uerr_t
|
static uerr_t
|
||||||
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
|
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||||
|
struct iri *iri)
|
||||||
{
|
{
|
||||||
struct request *req;
|
struct request *req;
|
||||||
|
|
||||||
@ -2048,9 +2049,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
|
|||||||
char *tmp = strchr (type, ';');
|
char *tmp = strchr (type, ';');
|
||||||
if (tmp)
|
if (tmp)
|
||||||
{
|
{
|
||||||
|
/* sXXXav: only needed if IRI support is enabled */
|
||||||
|
char *tmp2 = tmp + 1;
|
||||||
|
|
||||||
while (tmp > type && c_isspace (tmp[-1]))
|
while (tmp > type && c_isspace (tmp[-1]))
|
||||||
--tmp;
|
--tmp;
|
||||||
*tmp = '\0';
|
*tmp = '\0';
|
||||||
|
|
||||||
|
/* Try to get remote encoding if needed */
|
||||||
|
if (opt.enable_iri && !opt.encoding_remote)
|
||||||
|
{
|
||||||
|
tmp = parse_charset (tmp2);
|
||||||
|
if (tmp)
|
||||||
|
set_content_encoding (iri, tmp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
hs->newloc = resp_header_strdup (resp, "Location");
|
hs->newloc = resp_header_strdup (resp, "Location");
|
||||||
@ -2325,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
|
|||||||
retried, and retried, and retried, and... */
|
retried, and retried, and retried, and... */
|
||||||
uerr_t
|
uerr_t
|
||||||
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
||||||
int *dt, struct url *proxy)
|
int *dt, struct url *proxy, struct iri *iri)
|
||||||
{
|
{
|
||||||
int count;
|
int count;
|
||||||
bool got_head = false; /* used for time-stamping and filename detection */
|
bool got_head = false; /* used for time-stamping and filename detection */
|
||||||
@ -2489,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n"));
|
|||||||
*dt &= ~SEND_NOCACHE;
|
*dt &= ~SEND_NOCACHE;
|
||||||
|
|
||||||
/* Try fetching the document, or at least its head. */
|
/* Try fetching the document, or at least its head. */
|
||||||
err = gethttp (u, &hstat, dt, proxy);
|
err = gethttp (u, &hstat, dt, proxy, iri);
|
||||||
|
|
||||||
/* Time? */
|
/* Time? */
|
||||||
tms = datetime_str (time (NULL));
|
tms = datetime_str (time (NULL));
|
||||||
@ -2567,8 +2579,10 @@ Spider mode enabled. Check if remote file exists.\n"));
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* Maybe we should always keep track of broken links, not just in
|
/* Maybe we should always keep track of broken links, not just in
|
||||||
* spider mode. */
|
* spider mode.
|
||||||
else if (opt.spider)
|
* Don't log error if it was UTF-8 encoded because we will try
|
||||||
|
* once unencoded. */
|
||||||
|
else if (opt.spider && !iri->utf8_encode)
|
||||||
{
|
{
|
||||||
/* #### Again: ugly ugly ugly! */
|
/* #### Again: ugly ugly ugly! */
|
||||||
if (!hurl)
|
if (!hurl)
|
||||||
|
@ -33,7 +33,7 @@ as that of the covered work. */
|
|||||||
struct url;
|
struct url;
|
||||||
|
|
||||||
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
|
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
|
||||||
struct url *);
|
struct url *, struct iri *);
|
||||||
void save_cookies (void);
|
void save_cookies (void);
|
||||||
void http_cleanup (void);
|
void http_cleanup (void);
|
||||||
time_t http_atotm (const char *);
|
time_t http_atotm (const char *);
|
||||||
|
11
src/init.c
11
src/init.c
@ -182,9 +182,11 @@ static const struct {
|
|||||||
{ "inet6only", &opt.ipv6_only, cmd_boolean },
|
{ "inet6only", &opt.ipv6_only, cmd_boolean },
|
||||||
#endif
|
#endif
|
||||||
{ "input", &opt.input_filename, cmd_file },
|
{ "input", &opt.input_filename, cmd_file },
|
||||||
|
{ "iri", &opt.enable_iri, cmd_boolean },
|
||||||
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
|
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
|
||||||
{ "limitrate", &opt.limit_rate, cmd_bytes },
|
{ "limitrate", &opt.limit_rate, cmd_bytes },
|
||||||
{ "loadcookies", &opt.cookies_input, cmd_file },
|
{ "loadcookies", &opt.cookies_input, cmd_file },
|
||||||
|
{ "locale", &opt.locale, cmd_string },
|
||||||
{ "logfile", &opt.lfilename, cmd_file },
|
{ "logfile", &opt.lfilename, cmd_file },
|
||||||
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
|
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
|
||||||
{ "maxredirect", &opt.max_redirect, cmd_number },
|
{ "maxredirect", &opt.max_redirect, cmd_number },
|
||||||
@ -224,6 +226,7 @@ static const struct {
|
|||||||
{ "referer", &opt.referer, cmd_string },
|
{ "referer", &opt.referer, cmd_string },
|
||||||
{ "reject", &opt.rejects, cmd_vector },
|
{ "reject", &opt.rejects, cmd_vector },
|
||||||
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
||||||
|
{ "remoteencoding", &opt.encoding_remote, cmd_string },
|
||||||
{ "removelisting", &opt.remove_listing, cmd_boolean },
|
{ "removelisting", &opt.remove_listing, cmd_boolean },
|
||||||
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
|
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
|
||||||
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
|
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
|
||||||
@ -331,6 +334,14 @@ defaults (void)
|
|||||||
opt.restrict_files_case = restrict_no_case_restriction;
|
opt.restrict_files_case = restrict_no_case_restriction;
|
||||||
|
|
||||||
opt.max_redirect = 20;
|
opt.max_redirect = 20;
|
||||||
|
|
||||||
|
#ifdef ENABLE_IRI
|
||||||
|
opt.enable_iri = true;
|
||||||
|
#else
|
||||||
|
opt.enable_iri = false;
|
||||||
|
#endif
|
||||||
|
opt.locale = NULL;
|
||||||
|
opt.encoding_remote = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return the user's home directory (strdup-ed), or NULL if none is
|
/* Return the user's home directory (strdup-ed), or NULL if none is
|
||||||
|
348
src/iri.c
Normal file
348
src/iri.c
Normal file
@ -0,0 +1,348 @@
|
|||||||
|
/* IRI related functions.
|
||||||
|
Copyright (C) 2008 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of GNU Wget.
|
||||||
|
|
||||||
|
GNU Wget is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or (at
|
||||||
|
your option) any later version.
|
||||||
|
|
||||||
|
GNU Wget is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with Wget. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
Additional permission under GNU GPL version 3 section 7
|
||||||
|
|
||||||
|
If you modify this program, or any covered work, by linking or
|
||||||
|
combining it with the OpenSSL project's OpenSSL library (or a
|
||||||
|
modified version of that library), containing parts covered by the
|
||||||
|
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
|
||||||
|
grants you additional permission to convey the resulting work.
|
||||||
|
Corresponding Source for a non-source form of such a combination
|
||||||
|
shall include the source code for the parts of OpenSSL used as well
|
||||||
|
as that of the covered work. */
|
||||||
|
|
||||||
|
#include "wget.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <iconv.h>
|
||||||
|
#include <stringprep.h>
|
||||||
|
#include <idna.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
|
/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
|
||||||
|
#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
|
||||||
|
|
||||||
|
/* Note: locale encoding is kept in options struct (opt.locale) */
|
||||||
|
|
||||||
|
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
|
||||||
|
|
||||||
|
|
||||||
|
/* Extract the encoding name from a string containing "charset=XXX"
   (for instance the parameter part of a Content-Type value).

   STR may be NULL or empty, in which case NULL is returned.  The
   "charset=" key is matched case-insensitively.  The value runs up to
   the first whitespace character or ';' (the separator between
   media-type parameters), so a parameter that follows without a space
   is not glued onto the encoding name.

   Returns a newly allocated copy of the name (the caller releases it
   with xfree), or NULL when no charset parameter is present, when its
   value is empty, or when check_encoding_name() rejects it.  */
char *
parse_charset (char *str)
{
  char *start, *end, *charset;

  if (!str || !*str)
    return NULL;

  start = strcasestr (str, "charset=");
  if (!start)
    return NULL;

  /* Step over the "charset=" key itself.  */
  start += strlen ("charset=");

  /* sXXXav: which chars should be banned ??? */
  for (end = start; *end && !c_isspace (*end) && *end != ';'; end++)
    ;

  /* "charset=" with nothing after it carries no usable encoding.  */
  if (end == start)
    return NULL;

  /* sXXXav: could strdupdelim return NULL ? */
  charset = strdupdelim (start, end);

  /* Do a minimum check on the charset value */
  if (!check_encoding_name (charset))
    {
      xfree (charset);
      return NULL;
    }

  /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/

  return charset;
}
|
||||||
|
|
||||||
|
/* Return the character set name reported by Libidn's
   stringprep_locale_charset() for the current locale.  The const
   qualifier of that result is cast away to fit this function's
   return type.  */
char *
find_locale (void)
{
  const char *charset = stringprep_locale_charset ();

  return (char *) charset;
}
|
||||||
|
|
||||||
|
/* Basic check of an encoding name. */
|
||||||
|
bool
|
||||||
|
check_encoding_name (char *encoding)
|
||||||
|
{
|
||||||
|
char *s = encoding;
|
||||||
|
|
||||||
|
while (*s)
|
||||||
|
{
|
||||||
|
if (!c_isascii (*s) || c_isspace (*s))
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
s++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Try opening an iconv_t descriptor for conversion from locale to UTF-8.

   NOTE(review): this is still an unimplemented stub; locale_to_utf8()
   opens its own descriptor.  Until it is filled in it reports failure
   explicitly, so that a non-void function never falls off its end
   without a return value.  */
static bool
open_locale_to_utf8 (void)
{
  /* Nothing is opened yet, hence "no descriptor available".  */
  return false;
}
|
||||||
|
|
||||||
|
/* Try converting string str from locale to UTF-8. Return a new string
|
||||||
|
on success, or str on error or if conversion isn't needed. */
|
||||||
|
const char *
|
||||||
|
locale_to_utf8 (const char *str)
|
||||||
|
{
|
||||||
|
iconv_t l2u;
|
||||||
|
char *new;
|
||||||
|
|
||||||
|
/* That shouldn't happen, just in case */
|
||||||
|
if (!opt.locale)
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
|
||||||
|
opt.locale = find_locale ();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
|
||||||
|
return str;
|
||||||
|
|
||||||
|
l2u = iconv_open ("UTF-8", opt.locale);
|
||||||
|
if (l2u != (iconv_t)(-1))
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
|
||||||
|
quote (opt.locale), quote ("UTF-8"));
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new))
|
||||||
|
return (const char *) new;
|
||||||
|
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Do the conversion according to the passed conversion descriptor cd. *out
|
||||||
|
will contain the transcoded string on success. *out content is
|
||||||
|
unspecified otherwise. */
|
||||||
|
static bool
|
||||||
|
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
|
||||||
|
{
|
||||||
|
/* sXXXav : hummm hard to guess... */
|
||||||
|
size_t len, done, outlen = inlen * 2;
|
||||||
|
int invalid = 0, tooshort = 0;
|
||||||
|
char *s;
|
||||||
|
|
||||||
|
s = xmalloc (outlen + 1);
|
||||||
|
*out = s;
|
||||||
|
len = outlen;
|
||||||
|
done = 0;
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
|
||||||
|
{
|
||||||
|
*out = s;
|
||||||
|
*(s + len - outlen - done) = '\0';
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Incomplete or invalid multibyte sequence */
|
||||||
|
if (errno == EINVAL || errno == EILSEQ)
|
||||||
|
{
|
||||||
|
if (!invalid)
|
||||||
|
logprintf (LOG_VERBOSE,
|
||||||
|
"Incomplete or invalide multibyte sequence encountered\n");
|
||||||
|
|
||||||
|
invalid++;
|
||||||
|
**out = *in;
|
||||||
|
in++;
|
||||||
|
inlen--;
|
||||||
|
(*out)++;
|
||||||
|
outlen--;
|
||||||
|
}
|
||||||
|
else if (errno == E2BIG) /* Output buffer full */
|
||||||
|
{
|
||||||
|
char *new;
|
||||||
|
|
||||||
|
tooshort++;
|
||||||
|
done = len;
|
||||||
|
outlen = done + inlen * 2;
|
||||||
|
new = xmalloc (outlen + 1);
|
||||||
|
memcpy (new, s, done);
|
||||||
|
xfree (s);
|
||||||
|
s = new;
|
||||||
|
len = outlen;
|
||||||
|
*out = s + done;
|
||||||
|
}
|
||||||
|
else /* Weird, we got an unspecified error */
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
|
||||||
|
on error. */
|
||||||
|
char *
|
||||||
|
idn_encode (struct iri *i, char *host)
|
||||||
|
{
|
||||||
|
char *new;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/* Encode to UTF-8 if not done */
|
||||||
|
if (!i->utf8_encode)
|
||||||
|
{
|
||||||
|
if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
|
||||||
|
return NULL; /* Nothing to encode or an error occured */
|
||||||
|
host = new;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* toASCII UTF-8 NULL terminated string */
|
||||||
|
ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
|
||||||
|
if (ret != IDNA_SUCCESS)
|
||||||
|
{
|
||||||
|
/* sXXXav : free new when needed ! */
|
||||||
|
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
|
||||||
|
quote (idna_strerror (ret)));
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Try to decode an "ASCII encoded" host. Return the new domain in the locale
|
||||||
|
on success or NULL on error. */
|
||||||
|
char *
|
||||||
|
idn_decode (char *host)
|
||||||
|
{
|
||||||
|
char *new;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
|
||||||
|
if (ret != IDNA_SUCCESS)
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
|
||||||
|
quote (idna_strerror (ret)));
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Try to transcode string str from remote encoding to UTF-8. On success, *new
|
||||||
|
contains the transcoded string. *new content is unspecified otherwise. */
|
||||||
|
bool
|
||||||
|
remote_to_utf8 (struct iri *i, const char *str, const char **new)
|
||||||
|
{
|
||||||
|
iconv_t cd;
|
||||||
|
bool ret = false;
|
||||||
|
|
||||||
|
if (!i->uri_encoding)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
cd = iconv_open ("UTF-8", i->uri_encoding);
|
||||||
|
if (cd == (iconv_t)(-1))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
|
||||||
|
ret = true;
|
||||||
|
|
||||||
|
iconv_close (cd);
|
||||||
|
|
||||||
|
/* Test if something was converted */
|
||||||
|
if (!strcmp (str, *new))
|
||||||
|
{
|
||||||
|
xfree ((char *) *new);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate a new iri structure and return a pointer to it. */
|
||||||
|
struct iri *
|
||||||
|
iri_new (void)
|
||||||
|
{
|
||||||
|
struct iri *i = xmalloc (sizeof (struct iri));
|
||||||
|
i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
|
||||||
|
i->content_encoding = NULL;
|
||||||
|
i->utf8_encode = opt.enable_iri;
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Completely free an iri structure. */
|
||||||
|
void
|
||||||
|
iri_free (struct iri *i)
|
||||||
|
{
|
||||||
|
xfree_null (i->uri_encoding);
|
||||||
|
xfree_null (i->content_encoding);
|
||||||
|
xfree (i);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Set uri_encoding of struct iri i. If a remote encoding was specified, use
|
||||||
|
it unless force is true. */
|
||||||
|
void
|
||||||
|
set_uri_encoding (struct iri *i, char *charset, bool force)
|
||||||
|
{
|
||||||
|
DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));
|
||||||
|
if (!force && opt.encoding_remote)
|
||||||
|
return;
|
||||||
|
if (i->uri_encoding)
|
||||||
|
{
|
||||||
|
if (charset && !strcasecmp (i->uri_encoding, charset))
|
||||||
|
return;
|
||||||
|
xfree (i->uri_encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
i->uri_encoding = charset ? xstrdup (charset) : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Set content_encoding of struct iri i. */
|
||||||
|
void
|
||||||
|
set_content_encoding (struct iri *i, char *charset)
|
||||||
|
{
|
||||||
|
DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
|
||||||
|
if (opt.encoding_remote)
|
||||||
|
return;
|
||||||
|
if (i->content_encoding)
|
||||||
|
{
|
||||||
|
if (charset && !strcasecmp (i->content_encoding, charset))
|
||||||
|
return;
|
||||||
|
xfree (i->content_encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
i->content_encoding = charset ? xstrdup (charset) : NULL;
|
||||||
|
}
|
||||||
|
|
70
src/iri.h
Normal file
70
src/iri.h
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
/* Internationalization related declarations.
|
||||||
|
Copyright (C) 2008 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of GNU Wget.
|
||||||
|
|
||||||
|
GNU Wget is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
GNU Wget is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with Wget. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
Additional permission under GNU GPL version 3 section 7
|
||||||
|
|
||||||
|
If you modify this program, or any covered work, by linking or
|
||||||
|
combining it with the OpenSSL project's OpenSSL library (or a
|
||||||
|
modified version of that library), containing parts covered by the
|
||||||
|
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
|
||||||
|
grants you additional permission to convey the resulting work.
|
||||||
|
Corresponding Source for a non-source form of such a combination
|
||||||
|
shall include the source code for the parts of OpenSSL used as well
|
||||||
|
as that of the covered work. */
|
||||||
|
|
||||||
|
#ifndef IRI_H
|
||||||
|
#define IRI_H
|
||||||
|
|
||||||
|
/* Per-URL internationalization state.  */
struct iri {
  /* Encoding of the uri to fetch.  */
  char *uri_encoding;

  /* Encoding of links inside the fetched file.  */
  char *content_encoding;

  /* Will/Is the current url encoded in utf8.  */
  bool utf8_encode;
};
|
||||||
|
|
||||||
|
#ifdef ENABLE_IRI
|
||||||
|
|
||||||
|
char *parse_charset (char *str);
|
||||||
|
char *find_locale (void);
|
||||||
|
bool check_encoding_name (char *encoding);
|
||||||
|
const char *locale_to_utf8 (const char *str);
|
||||||
|
char *idn_encode (struct iri *i, char *host);
|
||||||
|
char *idn_decode (char *host);
|
||||||
|
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
|
||||||
|
struct iri *iri_new (void);
|
||||||
|
void iri_free (struct iri *i);
|
||||||
|
void set_uri_encoding (struct iri *i, char *charset, bool force);
|
||||||
|
void set_content_encoding (struct iri *i, char *charset);
|
||||||
|
|
||||||
|
#else /* ENABLE_IRI */
|
||||||
|
|
||||||
|
struct iri dummy_iri;
|
||||||
|
|
||||||
|
#define parse_charset(str) NULL
|
||||||
|
#define find_locale() NULL
|
||||||
|
#define check_encoding_name(str) false
|
||||||
|
#define locale_to_utf8(str) (str)
|
||||||
|
#define idn_encode(a,b) NULL
|
||||||
|
#define idn_decode(str) NULL
|
||||||
|
#define remote_to_utf8(a,b,c) false
|
||||||
|
#define iri_new() (&dummy_iri)
|
||||||
|
#define iri_free(a)
|
||||||
|
#define set_uri_encoding(a,b,c)
|
||||||
|
#define set_content_encoding(a,b)
|
||||||
|
|
||||||
|
#endif /* ENABLE_IRI */
|
||||||
|
#endif /* IRI_H */
|
@ -43,7 +43,7 @@ as that of the covered work. */
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
/* This file impplement support for "logging". Logging means printing
|
/* This file implement support for "logging". Logging means printing
|
||||||
output, plus several additional features:
|
output, plus several additional features:
|
||||||
|
|
||||||
- Cataloguing output by importance. You can specify that a log
|
- Cataloguing output by importance. You can specify that a log
|
||||||
|
34
src/main.c
34
src/main.c
@ -201,10 +201,12 @@ static struct cmdline_option option_data[] =
|
|||||||
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
|
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
|
||||||
#endif
|
#endif
|
||||||
{ "input-file", 'i', OPT_VALUE, "input", -1 },
|
{ "input-file", 'i', OPT_VALUE, "input", -1 },
|
||||||
|
{ "iri", 0, OPT_BOOLEAN, "iri", -1 },
|
||||||
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
|
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
|
||||||
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
|
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
|
||||||
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
|
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
|
||||||
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
|
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
|
||||||
|
{ "locale", 0, OPT_VALUE, "locale", -1 },
|
||||||
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
|
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
|
||||||
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
|
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
|
||||||
{ "no", 'n', OPT__NO, NULL, required_argument },
|
{ "no", 'n', OPT__NO, NULL, required_argument },
|
||||||
@ -238,6 +240,7 @@ static struct cmdline_option option_data[] =
|
|||||||
{ "referer", 0, OPT_VALUE, "referer", -1 },
|
{ "referer", 0, OPT_VALUE, "referer", -1 },
|
||||||
{ "reject", 'R', OPT_VALUE, "reject", -1 },
|
{ "reject", 'R', OPT_VALUE, "reject", -1 },
|
||||||
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
|
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
|
||||||
|
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
|
||||||
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
|
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
|
||||||
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
|
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
|
||||||
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
|
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
|
||||||
@ -1062,6 +1065,27 @@ for details.\n\n"));
|
|||||||
exit (1);
|
exit (1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ENABLE_IRI
|
||||||
|
if (opt.enable_iri)
|
||||||
|
{
|
||||||
|
if (opt.locale && !check_encoding_name (opt.locale))
|
||||||
|
opt.locale = NULL;
|
||||||
|
|
||||||
|
if (!opt.locale)
|
||||||
|
opt.locale = find_locale ();
|
||||||
|
|
||||||
|
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
|
||||||
|
opt.encoding_remote = NULL;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (opt.enable_iri || opt.locale || opt.encoding_remote)
|
||||||
|
{
|
||||||
|
/* sXXXav : be more specific... */
|
||||||
|
printf(_("This version does not have support for IRIs\n"));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (opt.ask_passwd)
|
if (opt.ask_passwd)
|
||||||
{
|
{
|
||||||
opt.passwd = prompt_for_password ();
|
opt.passwd = prompt_for_password ();
|
||||||
@ -1174,12 +1198,18 @@ WARNING: Can't reopen standard output in binary mode;\n\
|
|||||||
if (url_scheme (*t) == SCHEME_FTP)
|
if (url_scheme (*t) == SCHEME_FTP)
|
||||||
opt.follow_ftp = 1;
|
opt.follow_ftp = 1;
|
||||||
|
|
||||||
status = retrieve_tree (*t);
|
status = retrieve_tree (*t, NULL);
|
||||||
|
|
||||||
opt.follow_ftp = old_follow_ftp;
|
opt.follow_ftp = old_follow_ftp;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
|
{
|
||||||
|
struct iri *i = iri_new ();
|
||||||
|
set_uri_encoding (i, opt.locale, true);
|
||||||
|
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
|
||||||
|
opt.recursive, i);
|
||||||
|
iri_free (i);
|
||||||
|
}
|
||||||
|
|
||||||
if (opt.delete_after && file_exists_p(filename))
|
if (opt.delete_after && file_exists_p(filename))
|
||||||
{
|
{
|
||||||
|
@ -239,6 +239,10 @@ struct options
|
|||||||
bool content_disposition; /* Honor HTTP Content-Disposition header. */
|
bool content_disposition; /* Honor HTTP Content-Disposition header. */
|
||||||
bool auth_without_challenge; /* Issue Basic authentication creds without
|
bool auth_without_challenge; /* Issue Basic authentication creds without
|
||||||
waiting for a challenge. */
|
waiting for a challenge. */
|
||||||
|
|
||||||
|
bool enable_iri;
|
||||||
|
char *encoding_remote;
|
||||||
|
char *locale;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct options opt;
|
extern struct options opt;
|
||||||
|
75
src/recur.c
75
src/recur.c
@ -51,7 +51,7 @@ as that of the covered work. */
|
|||||||
#include "html-url.h"
|
#include "html-url.h"
|
||||||
#include "css-url.h"
|
#include "css-url.h"
|
||||||
#include "spider.h"
|
#include "spider.h"
|
||||||
|
|
||||||
/* Functions for maintaining the URL queue. */
|
/* Functions for maintaining the URL queue. */
|
||||||
|
|
||||||
struct queue_element {
|
struct queue_element {
|
||||||
@ -60,6 +60,7 @@ struct queue_element {
|
|||||||
int depth; /* the depth */
|
int depth; /* the depth */
|
||||||
bool html_allowed; /* whether the document is allowed to
|
bool html_allowed; /* whether the document is allowed to
|
||||||
be treated as HTML. */
|
be treated as HTML. */
|
||||||
|
struct iri *iri; /* sXXXav */
|
||||||
bool css_allowed; /* whether the document is allowed to
|
bool css_allowed; /* whether the document is allowed to
|
||||||
be treated as CSS. */
|
be treated as CSS. */
|
||||||
struct queue_element *next; /* next element in queue */
|
struct queue_element *next; /* next element in queue */
|
||||||
@ -93,11 +94,12 @@ url_queue_delete (struct url_queue *queue)
|
|||||||
into it. */
|
into it. */
|
||||||
|
|
||||||
static void
|
static void
|
||||||
url_enqueue (struct url_queue *queue,
|
url_enqueue (struct url_queue *queue, struct iri *i,
|
||||||
const char *url, const char *referer, int depth,
|
const char *url, const char *referer, int depth,
|
||||||
bool html_allowed, bool css_allowed)
|
bool html_allowed, bool css_allowed)
|
||||||
{
|
{
|
||||||
struct queue_element *qel = xnew (struct queue_element);
|
struct queue_element *qel = xnew (struct queue_element);
|
||||||
|
qel->iri = i;
|
||||||
qel->url = url;
|
qel->url = url;
|
||||||
qel->referer = referer;
|
qel->referer = referer;
|
||||||
qel->depth = depth;
|
qel->depth = depth;
|
||||||
@ -112,6 +114,10 @@ url_enqueue (struct url_queue *queue,
|
|||||||
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
|
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
|
||||||
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
||||||
|
|
||||||
|
if (i)
|
||||||
|
DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
|
||||||
|
i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
|
||||||
|
|
||||||
if (queue->tail)
|
if (queue->tail)
|
||||||
queue->tail->next = qel;
|
queue->tail->next = qel;
|
||||||
queue->tail = qel;
|
queue->tail = qel;
|
||||||
@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue,
|
|||||||
succeeded, or false if the queue is empty. */
|
succeeded, or false if the queue is empty. */
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
url_dequeue (struct url_queue *queue,
|
url_dequeue (struct url_queue *queue, struct iri **i,
|
||||||
const char **url, const char **referer, int *depth,
|
const char **url, const char **referer, int *depth,
|
||||||
bool *html_allowed, bool *css_allowed)
|
bool *html_allowed, bool *css_allowed)
|
||||||
{
|
{
|
||||||
@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue,
|
|||||||
if (!queue->head)
|
if (!queue->head)
|
||||||
queue->tail = NULL;
|
queue->tail = NULL;
|
||||||
|
|
||||||
|
*i = qel->iri;
|
||||||
*url = qel->url;
|
*url = qel->url;
|
||||||
*referer = qel->referer;
|
*referer = qel->referer;
|
||||||
*depth = qel->depth;
|
*depth = qel->depth;
|
||||||
@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool download_child_p (const struct urlpos *, struct url *, int,
|
static bool download_child_p (const struct urlpos *, struct url *, int,
|
||||||
struct url *, struct hash_table *);
|
struct url *, struct hash_table *, struct iri *);
|
||||||
static bool descend_redirect_p (const char *, const char *, int,
|
static bool descend_redirect_p (const char *, const char *, int,
|
||||||
struct url *, struct hash_table *);
|
struct url *, struct hash_table *, struct iri *);
|
||||||
|
|
||||||
|
|
||||||
/* Retrieve a part of the web beginning with START_URL. This used to
|
/* Retrieve a part of the web beginning with START_URL. This used to
|
||||||
@ -180,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int,
|
|||||||
options, add it to the queue. */
|
options, add it to the queue. */
|
||||||
|
|
||||||
uerr_t
|
uerr_t
|
||||||
retrieve_tree (const char *start_url)
|
retrieve_tree (const char *start_url, struct iri *pi)
|
||||||
{
|
{
|
||||||
uerr_t status = RETROK;
|
uerr_t status = RETROK;
|
||||||
|
|
||||||
@ -192,8 +199,22 @@ retrieve_tree (const char *start_url)
|
|||||||
struct hash_table *blacklist;
|
struct hash_table *blacklist;
|
||||||
|
|
||||||
int up_error_code;
|
int up_error_code;
|
||||||
struct url *start_url_parsed = url_parse (start_url, &up_error_code);
|
struct url *start_url_parsed;
|
||||||
|
struct iri *i = iri_new ();
|
||||||
|
|
||||||
|
#define COPYSTR(x) (x) ? xstrdup(x) : NULL;
|
||||||
|
/* Duplicate pi struct if not NULL */
|
||||||
|
if (pi)
|
||||||
|
{
|
||||||
|
i->uri_encoding = COPYSTR (pi->uri_encoding);
|
||||||
|
i->content_encoding = COPYSTR (pi->content_encoding);
|
||||||
|
i->utf8_encode = pi->utf8_encode;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
set_uri_encoding (i, opt.locale, true);
|
||||||
|
#undef COPYSTR
|
||||||
|
|
||||||
|
start_url_parsed = url_parse (start_url, &up_error_code, i);
|
||||||
if (!start_url_parsed)
|
if (!start_url_parsed)
|
||||||
{
|
{
|
||||||
char *error = url_error (start_url, up_error_code);
|
char *error = url_error (start_url, up_error_code);
|
||||||
@ -207,7 +228,8 @@ retrieve_tree (const char *start_url)
|
|||||||
|
|
||||||
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
||||||
just URL so we enqueue the canonical form of the URL. */
|
just URL so we enqueue the canonical form of the URL. */
|
||||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
|
url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
|
||||||
|
false);
|
||||||
string_set_add (blacklist, start_url_parsed->url);
|
string_set_add (blacklist, start_url_parsed->url);
|
||||||
|
|
||||||
while (1)
|
while (1)
|
||||||
@ -226,7 +248,7 @@ retrieve_tree (const char *start_url)
|
|||||||
|
|
||||||
/* Get the next URL from the queue... */
|
/* Get the next URL from the queue... */
|
||||||
|
|
||||||
if (!url_dequeue (queue,
|
if (!url_dequeue (queue, (struct iri **) &i,
|
||||||
(const char **)&url, (const char **)&referer,
|
(const char **)&url, (const char **)&referer,
|
||||||
&depth, &html_allowed, &css_allowed))
|
&depth, &html_allowed, &css_allowed))
|
||||||
break;
|
break;
|
||||||
@ -267,7 +289,8 @@ retrieve_tree (const char *start_url)
|
|||||||
int dt = 0;
|
int dt = 0;
|
||||||
char *redirected = NULL;
|
char *redirected = NULL;
|
||||||
|
|
||||||
status = retrieve_url (url, &file, &redirected, referer, &dt, false);
|
status = retrieve_url (url, &file, &redirected, referer, &dt,
|
||||||
|
false, i);
|
||||||
|
|
||||||
if (html_allowed && file && status == RETROK
|
if (html_allowed && file && status == RETROK
|
||||||
&& (dt & RETROKF) && (dt & TEXTHTML))
|
&& (dt & RETROKF) && (dt & TEXTHTML))
|
||||||
@ -295,7 +318,7 @@ retrieve_tree (const char *start_url)
|
|||||||
if (descend)
|
if (descend)
|
||||||
{
|
{
|
||||||
if (!descend_redirect_p (redirected, url, depth,
|
if (!descend_redirect_p (redirected, url, depth,
|
||||||
start_url_parsed, blacklist))
|
start_url_parsed, blacklist, i))
|
||||||
descend = false;
|
descend = false;
|
||||||
else
|
else
|
||||||
/* Make sure that the old pre-redirect form gets
|
/* Make sure that the old pre-redirect form gets
|
||||||
@ -347,7 +370,7 @@ retrieve_tree (const char *start_url)
|
|||||||
bool meta_disallow_follow = false;
|
bool meta_disallow_follow = false;
|
||||||
struct urlpos *children
|
struct urlpos *children
|
||||||
= is_css ? get_urls_css_file (file, url) :
|
= is_css ? get_urls_css_file (file, url) :
|
||||||
get_urls_html (file, url, &meta_disallow_follow);
|
get_urls_html (file, url, &meta_disallow_follow, i);
|
||||||
|
|
||||||
if (opt.use_robots && meta_disallow_follow)
|
if (opt.use_robots && meta_disallow_follow)
|
||||||
{
|
{
|
||||||
@ -358,7 +381,8 @@ retrieve_tree (const char *start_url)
|
|||||||
if (children)
|
if (children)
|
||||||
{
|
{
|
||||||
struct urlpos *child = children;
|
struct urlpos *child = children;
|
||||||
struct url *url_parsed = url_parsed = url_parse (url, NULL);
|
struct url *url_parsed = url_parse (url, NULL, i);
|
||||||
|
struct iri *ci;
|
||||||
char *referer_url = url;
|
char *referer_url = url;
|
||||||
bool strip_auth = (url_parsed != NULL
|
bool strip_auth = (url_parsed != NULL
|
||||||
&& url_parsed->user != NULL);
|
&& url_parsed->user != NULL);
|
||||||
@ -375,9 +399,11 @@ retrieve_tree (const char *start_url)
|
|||||||
if (dash_p_leaf_HTML && !child->link_inline_p)
|
if (dash_p_leaf_HTML && !child->link_inline_p)
|
||||||
continue;
|
continue;
|
||||||
if (download_child_p (child, url_parsed, depth, start_url_parsed,
|
if (download_child_p (child, url_parsed, depth, start_url_parsed,
|
||||||
blacklist))
|
blacklist, i))
|
||||||
{
|
{
|
||||||
url_enqueue (queue, xstrdup (child->url->url),
|
ci = iri_new ();
|
||||||
|
set_uri_encoding (ci, i->content_encoding, false);
|
||||||
|
url_enqueue (queue, ci, xstrdup (child->url->url),
|
||||||
xstrdup (referer_url), depth + 1,
|
xstrdup (referer_url), depth + 1,
|
||||||
child->link_expect_html,
|
child->link_expect_html,
|
||||||
child->link_expect_css);
|
child->link_expect_css);
|
||||||
@ -422,6 +448,7 @@ retrieve_tree (const char *start_url)
|
|||||||
xfree (url);
|
xfree (url);
|
||||||
xfree_null (referer);
|
xfree_null (referer);
|
||||||
xfree_null (file);
|
xfree_null (file);
|
||||||
|
iri_free (i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If anything is left of the queue due to a premature exit, free it
|
/* If anything is left of the queue due to a premature exit, free it
|
||||||
@ -430,9 +457,11 @@ retrieve_tree (const char *start_url)
|
|||||||
char *d1, *d2;
|
char *d1, *d2;
|
||||||
int d3;
|
int d3;
|
||||||
bool d4, d5;
|
bool d4, d5;
|
||||||
while (url_dequeue (queue,
|
struct iri *d6;
|
||||||
|
while (url_dequeue (queue, (struct iri **)&d6,
|
||||||
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
|
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
|
||||||
{
|
{
|
||||||
|
iri_free (d6);
|
||||||
xfree (d1);
|
xfree (d1);
|
||||||
xfree_null (d2);
|
xfree_null (d2);
|
||||||
}
|
}
|
||||||
@ -461,7 +490,8 @@ retrieve_tree (const char *start_url)
|
|||||||
|
|
||||||
static bool
|
static bool
|
||||||
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||||
struct url *start_url_parsed, struct hash_table *blacklist)
|
struct url *start_url_parsed, struct hash_table *blacklist,
|
||||||
|
struct iri *iri)
|
||||||
{
|
{
|
||||||
struct url *u = upos->url;
|
struct url *u = upos->url;
|
||||||
const char *url = u->url;
|
const char *url = u->url;
|
||||||
@ -602,7 +632,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
if (!specs)
|
if (!specs)
|
||||||
{
|
{
|
||||||
char *rfile;
|
char *rfile;
|
||||||
if (res_retrieve_file (url, &rfile))
|
if (res_retrieve_file (url, &rfile, iri))
|
||||||
{
|
{
|
||||||
specs = res_parse_from_file (rfile);
|
specs = res_parse_from_file (rfile);
|
||||||
|
|
||||||
@ -657,23 +687,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
|
|
||||||
static bool
|
static bool
|
||||||
descend_redirect_p (const char *redirected, const char *original, int depth,
|
descend_redirect_p (const char *redirected, const char *original, int depth,
|
||||||
struct url *start_url_parsed, struct hash_table *blacklist)
|
struct url *start_url_parsed, struct hash_table *blacklist,
|
||||||
|
struct iri *iri)
|
||||||
{
|
{
|
||||||
struct url *orig_parsed, *new_parsed;
|
struct url *orig_parsed, *new_parsed;
|
||||||
struct urlpos *upos;
|
struct urlpos *upos;
|
||||||
bool success;
|
bool success;
|
||||||
|
|
||||||
orig_parsed = url_parse (original, NULL);
|
orig_parsed = url_parse (original, NULL, NULL);
|
||||||
assert (orig_parsed != NULL);
|
assert (orig_parsed != NULL);
|
||||||
|
|
||||||
new_parsed = url_parse (redirected, NULL);
|
new_parsed = url_parse (redirected, NULL, NULL);
|
||||||
assert (new_parsed != NULL);
|
assert (new_parsed != NULL);
|
||||||
|
|
||||||
upos = xnew0 (struct urlpos);
|
upos = xnew0 (struct urlpos);
|
||||||
upos->url = new_parsed;
|
upos->url = new_parsed;
|
||||||
|
|
||||||
success = download_child_p (upos, orig_parsed, depth,
|
success = download_child_p (upos, orig_parsed, depth,
|
||||||
start_url_parsed, blacklist);
|
start_url_parsed, blacklist, iri);
|
||||||
|
|
||||||
url_free (orig_parsed);
|
url_free (orig_parsed);
|
||||||
url_free (new_parsed);
|
url_free (new_parsed);
|
||||||
|
@ -42,6 +42,6 @@ as that of the covered work. */
|
|||||||
struct urlpos;
|
struct urlpos;
|
||||||
|
|
||||||
void recursive_cleanup (void);
|
void recursive_cleanup (void);
|
||||||
uerr_t retrieve_tree (const char *);
|
uerr_t retrieve_tree (const char *, struct iri *);
|
||||||
|
|
||||||
#endif /* RECUR_H */
|
#endif /* RECUR_H */
|
||||||
|
11
src/res.c
11
src/res.c
@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
|
|||||||
Return true if robots were retrieved OK, false otherwise. */
|
Return true if robots were retrieved OK, false otherwise. */
|
||||||
|
|
||||||
bool
|
bool
|
||||||
res_retrieve_file (const char *url, char **file)
|
res_retrieve_file (const char *url, char **file, struct iri *iri)
|
||||||
{
|
{
|
||||||
|
struct iri *i = iri_new ();
|
||||||
uerr_t err;
|
uerr_t err;
|
||||||
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
||||||
int saved_ts_val = opt.timestamping;
|
int saved_ts_val = opt.timestamping;
|
||||||
int saved_sp_val = opt.spider;
|
int saved_sp_val = opt.spider;
|
||||||
|
|
||||||
|
/* Copy server URI encoding for a possible IDNA transformation, no need to
|
||||||
|
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
|
||||||
|
set_uri_encoding (i, iri->uri_encoding, false);
|
||||||
|
i->utf8_encode = false;
|
||||||
|
|
||||||
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
|
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
|
||||||
*file = NULL;
|
*file = NULL;
|
||||||
opt.timestamping = false;
|
opt.timestamping = false;
|
||||||
opt.spider = false;
|
opt.spider = false;
|
||||||
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
|
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
|
||||||
opt.timestamping = saved_ts_val;
|
opt.timestamping = saved_ts_val;
|
||||||
opt.spider = saved_sp_val;
|
opt.spider = saved_sp_val;
|
||||||
xfree (robots_url);
|
xfree (robots_url);
|
||||||
|
iri_free (i);
|
||||||
|
|
||||||
if (err != RETROK && *file != NULL)
|
if (err != RETROK && *file != NULL)
|
||||||
{
|
{
|
||||||
|
@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
|
|||||||
void res_register_specs (const char *, int, struct robot_specs *);
|
void res_register_specs (const char *, int, struct robot_specs *);
|
||||||
struct robot_specs *res_get_specs (const char *, int);
|
struct robot_specs *res_get_specs (const char *, int);
|
||||||
|
|
||||||
bool res_retrieve_file (const char *, char **);
|
bool res_retrieve_file (const char *, char **, struct iri *);
|
||||||
|
|
||||||
bool is_robots_txt_url (const char *);
|
bool is_robots_txt_url (const char *);
|
||||||
|
|
||||||
|
69
src/retr.c
69
src/retr.c
@ -597,7 +597,7 @@ static char *getproxy (struct url *);
|
|||||||
|
|
||||||
uerr_t
|
uerr_t
|
||||||
retrieve_url (const char *origurl, char **file, char **newloc,
|
retrieve_url (const char *origurl, char **file, char **newloc,
|
||||||
const char *refurl, int *dt, bool recursive)
|
const char *refurl, int *dt, bool recursive, struct iri *iri)
|
||||||
{
|
{
|
||||||
uerr_t result;
|
uerr_t result;
|
||||||
char *url;
|
char *url;
|
||||||
@ -625,7 +625,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
if (file)
|
if (file)
|
||||||
*file = NULL;
|
*file = NULL;
|
||||||
|
|
||||||
u = url_parse (url, &up_error_code);
|
second_try:
|
||||||
|
u = url_parse (url, &up_error_code, iri);
|
||||||
if (!u)
|
if (!u)
|
||||||
{
|
{
|
||||||
char *error = url_error (url, up_error_code);
|
char *error = url_error (url, up_error_code);
|
||||||
@ -635,6 +636,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
return URLERROR;
|
return URLERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
|
||||||
|
iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
|
||||||
|
iri->utf8_encode));
|
||||||
|
|
||||||
if (!refurl)
|
if (!refurl)
|
||||||
refurl = opt.referer;
|
refurl = opt.referer;
|
||||||
|
|
||||||
@ -648,8 +653,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
proxy = getproxy (u);
|
proxy = getproxy (u);
|
||||||
if (proxy)
|
if (proxy)
|
||||||
{
|
{
|
||||||
|
struct iri *pi = iri_new ();
|
||||||
|
set_uri_encoding (pi, opt.locale, true);
|
||||||
|
pi->utf8_encode = false;
|
||||||
|
|
||||||
/* Parse the proxy URL. */
|
/* Parse the proxy URL. */
|
||||||
proxy_url = url_parse (proxy, &up_error_code);
|
proxy_url = url_parse (proxy, &up_error_code, NULL);
|
||||||
if (!proxy_url)
|
if (!proxy_url)
|
||||||
{
|
{
|
||||||
char *error = url_error (proxy, up_error_code);
|
char *error = url_error (proxy, up_error_code);
|
||||||
@ -676,7 +685,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
#endif
|
#endif
|
||||||
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
|
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
|
||||||
{
|
{
|
||||||
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
|
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
|
||||||
}
|
}
|
||||||
else if (u->scheme == SCHEME_FTP)
|
else if (u->scheme == SCHEME_FTP)
|
||||||
{
|
{
|
||||||
@ -726,8 +735,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
xfree (mynewloc);
|
xfree (mynewloc);
|
||||||
mynewloc = construced_newloc;
|
mynewloc = construced_newloc;
|
||||||
|
|
||||||
|
/* Reset UTF-8 encoding state, keep the URI encoding and reset
|
||||||
|
the content encoding. */
|
||||||
|
iri->utf8_encode = opt.enable_iri;
|
||||||
|
set_content_encoding (iri, NULL);
|
||||||
|
|
||||||
/* Now, see if this new location makes sense. */
|
/* Now, see if this new location makes sense. */
|
||||||
newloc_parsed = url_parse (mynewloc, &up_error_code);
|
newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
|
||||||
if (!newloc_parsed)
|
if (!newloc_parsed)
|
||||||
{
|
{
|
||||||
char *error = url_error (mynewloc, up_error_code);
|
char *error = url_error (mynewloc, up_error_code);
|
||||||
@ -776,8 +790,21 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
goto redirected;
|
goto redirected;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (local_file)
|
/* Try to not encode in UTF-8 if fetching failed */
|
||||||
|
if (!(*dt & RETROKF) && iri->utf8_encode)
|
||||||
{
|
{
|
||||||
|
iri->utf8_encode = false;
|
||||||
|
DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url)));
|
||||||
|
goto second_try;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (local_file && *dt & RETROKF)
|
||||||
|
{
|
||||||
|
register_download (u->url, local_file);
|
||||||
|
if (redirection_count && 0 != strcmp (origurl, u->url))
|
||||||
|
register_redirection (origurl, u->url);
|
||||||
|
if (*dt & TEXTHTML)
|
||||||
|
register_html (u->url, local_file);
|
||||||
if (*dt & RETROKF)
|
if (*dt & RETROKF)
|
||||||
{
|
{
|
||||||
register_download (u->url, local_file);
|
register_download (u->url, local_file);
|
||||||
@ -827,6 +854,7 @@ retrieve_from_file (const char *file, bool html, int *count)
|
|||||||
{
|
{
|
||||||
uerr_t status;
|
uerr_t status;
|
||||||
struct urlpos *url_list, *cur_url;
|
struct urlpos *url_list, *cur_url;
|
||||||
|
struct iri *iri = iri_new();
|
||||||
|
|
||||||
char *input_file = NULL;
|
char *input_file = NULL;
|
||||||
const char *url = file;
|
const char *url = file;
|
||||||
@ -834,6 +862,10 @@ retrieve_from_file (const char *file, bool html, int *count)
|
|||||||
status = RETROK; /* Suppose everything is OK. */
|
status = RETROK; /* Suppose everything is OK. */
|
||||||
*count = 0; /* Reset the URL count. */
|
*count = 0; /* Reset the URL count. */
|
||||||
|
|
||||||
|
/* sXXXav : Assume filename and links in the file are in the locale */
|
||||||
|
set_uri_encoding (iri, opt.locale, true);
|
||||||
|
set_content_encoding (iri, opt.locale);
|
||||||
|
|
||||||
if (url_has_scheme (url))
|
if (url_has_scheme (url))
|
||||||
{
|
{
|
||||||
int dt;
|
int dt;
|
||||||
@ -842,17 +874,21 @@ retrieve_from_file (const char *file, bool html, int *count)
|
|||||||
if (!opt.base_href)
|
if (!opt.base_href)
|
||||||
opt.base_href = xstrdup (url);
|
opt.base_href = xstrdup (url);
|
||||||
|
|
||||||
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
|
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
|
||||||
if (status != RETROK)
|
if (status != RETROK)
|
||||||
return status;
|
return status;
|
||||||
|
|
||||||
if (dt & TEXTHTML)
|
if (dt & TEXTHTML)
|
||||||
html = true;
|
html = true;
|
||||||
|
|
||||||
|
/* If we have found a content encoding, use it */
|
||||||
|
if (iri->content_encoding)
|
||||||
|
set_uri_encoding (iri, iri->content_encoding, false);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
input_file = (char *) file;
|
input_file = (char *) file;
|
||||||
|
|
||||||
url_list = (html ? get_urls_html (input_file, NULL, NULL)
|
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
|
||||||
: get_urls_file (input_file));
|
: get_urls_file (input_file));
|
||||||
|
|
||||||
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
|
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
|
||||||
@ -868,6 +904,10 @@ retrieve_from_file (const char *file, bool html, int *count)
|
|||||||
status = QUOTEXC;
|
status = QUOTEXC;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Reset UTF-8 encode status */
|
||||||
|
iri->utf8_encode = opt.enable_iri;
|
||||||
|
|
||||||
if ((opt.recursive || opt.page_requisites)
|
if ((opt.recursive || opt.page_requisites)
|
||||||
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
|
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
|
||||||
{
|
{
|
||||||
@ -877,12 +917,13 @@ retrieve_from_file (const char *file, bool html, int *count)
|
|||||||
if (cur_url->url->scheme == SCHEME_FTP)
|
if (cur_url->url->scheme == SCHEME_FTP)
|
||||||
opt.follow_ftp = 1;
|
opt.follow_ftp = 1;
|
||||||
|
|
||||||
status = retrieve_tree (cur_url->url->url);
|
status = retrieve_tree (cur_url->url->url, iri);
|
||||||
|
|
||||||
opt.follow_ftp = old_follow_ftp;
|
opt.follow_ftp = old_follow_ftp;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
|
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
|
||||||
|
&dt, opt.recursive, iri);
|
||||||
|
|
||||||
if (filename && opt.delete_after && file_exists_p (filename))
|
if (filename && opt.delete_after && file_exists_p (filename))
|
||||||
{
|
{
|
||||||
@ -901,6 +942,8 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
|
|||||||
/* Free the linked list of URL-s. */
|
/* Free the linked list of URL-s. */
|
||||||
free_urlpos (url_list);
|
free_urlpos (url_list);
|
||||||
|
|
||||||
|
iri_free (iri);
|
||||||
|
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1053,7 +1096,11 @@ bool
|
|||||||
url_uses_proxy (const char *url)
|
url_uses_proxy (const char *url)
|
||||||
{
|
{
|
||||||
bool ret;
|
bool ret;
|
||||||
struct url *u = url_parse (url, NULL);
|
struct url *u;
|
||||||
|
struct iri *i = iri_new();
|
||||||
|
/* url was given in the command line, so use locale as encoding */
|
||||||
|
set_uri_encoding (i, opt.locale, true);
|
||||||
|
u= url_parse (url, NULL, i);
|
||||||
if (!u)
|
if (!u)
|
||||||
return false;
|
return false;
|
||||||
ret = getproxy (u) != NULL;
|
ret = getproxy (u) != NULL;
|
||||||
|
@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
|
|||||||
char *fd_read_hunk (int, hunk_terminator_t, long, long);
|
char *fd_read_hunk (int, hunk_terminator_t, long, long);
|
||||||
char *fd_read_line (int);
|
char *fd_read_line (int);
|
||||||
|
|
||||||
uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
|
uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
|
||||||
|
bool, struct iri *);
|
||||||
uerr_t retrieve_from_file (const char *, bool, int *);
|
uerr_t retrieve_from_file (const char *, bool, int *);
|
||||||
|
|
||||||
const char *retr_rate (wgint, double);
|
const char *retr_rate (wgint, double);
|
||||||
|
31
src/url.c
31
src/url.c
@ -640,7 +640,7 @@ static const char *parse_errors[] = {
|
|||||||
error, and if ERROR is not NULL, also set *ERROR to the appropriate
|
error, and if ERROR is not NULL, also set *ERROR to the appropriate
|
||||||
error code. */
|
error code. */
|
||||||
struct url *
|
struct url *
|
||||||
url_parse (const char *url, int *error)
|
url_parse (const char *url, int *error, struct iri *iri)
|
||||||
{
|
{
|
||||||
struct url *u;
|
struct url *u;
|
||||||
const char *p;
|
const char *p;
|
||||||
@ -659,7 +659,7 @@ url_parse (const char *url, int *error)
|
|||||||
int port;
|
int port;
|
||||||
char *user = NULL, *passwd = NULL;
|
char *user = NULL, *passwd = NULL;
|
||||||
|
|
||||||
char *url_encoded = NULL;
|
char *url_encoded = NULL, *new_url = NULL;
|
||||||
|
|
||||||
int error_code;
|
int error_code;
|
||||||
|
|
||||||
@ -670,9 +670,20 @@ url_parse (const char *url, int *error)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
url_encoded = reencode_escapes (url);
|
if (iri && iri->utf8_encode)
|
||||||
|
{
|
||||||
|
url_unescape ((char *) url);
|
||||||
|
iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
|
||||||
|
if (!iri->utf8_encode)
|
||||||
|
new_url = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
url_encoded = reencode_escapes (new_url ? new_url : url);
|
||||||
p = url_encoded;
|
p = url_encoded;
|
||||||
|
|
||||||
|
if (new_url && url_encoded != new_url)
|
||||||
|
xfree (new_url);
|
||||||
|
|
||||||
p += strlen (supported_schemes[scheme].leading_string);
|
p += strlen (supported_schemes[scheme].leading_string);
|
||||||
uname_b = p;
|
uname_b = p;
|
||||||
p = url_skip_credentials (p);
|
p = url_skip_credentials (p);
|
||||||
@ -842,6 +853,18 @@ url_parse (const char *url, int *error)
|
|||||||
{
|
{
|
||||||
url_unescape (u->host);
|
url_unescape (u->host);
|
||||||
host_modified = true;
|
host_modified = true;
|
||||||
|
|
||||||
|
/* Apply IDNA regardless of iri->utf8_encode status */
|
||||||
|
if (opt.enable_iri && iri)
|
||||||
|
{
|
||||||
|
char *new = idn_encode (iri, u->host);
|
||||||
|
if (new)
|
||||||
|
{
|
||||||
|
xfree (u->host);
|
||||||
|
u->host = new;
|
||||||
|
host_modified = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params_b)
|
if (params_b)
|
||||||
@ -851,7 +874,7 @@ url_parse (const char *url, int *error)
|
|||||||
if (fragment_b)
|
if (fragment_b)
|
||||||
u->fragment = strdupdelim (fragment_b, fragment_e);
|
u->fragment = strdupdelim (fragment_b, fragment_e);
|
||||||
|
|
||||||
if (path_modified || u->fragment || host_modified || path_b == path_e)
|
if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
|
||||||
{
|
{
|
||||||
/* If we suspect that a transformation has rendered what
|
/* If we suspect that a transformation has rendered what
|
||||||
url_string might return different from URL_ENCODED, rebuild
|
url_string might return different from URL_ENCODED, rebuild
|
||||||
|
@ -84,7 +84,7 @@ struct url
|
|||||||
|
|
||||||
char *url_escape (const char *);
|
char *url_escape (const char *);
|
||||||
|
|
||||||
struct url *url_parse (const char *, int *);
|
struct url *url_parse (const char *, int *, struct iri *iri);
|
||||||
char *url_error (const char *, int);
|
char *url_error (const char *, int);
|
||||||
char *url_full_path (const struct url *);
|
char *url_full_path (const struct url *);
|
||||||
void url_set_dir (struct url *, const char *);
|
void url_set_dir (struct url *, const char *);
|
||||||
|
@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
|
|||||||
#include "quote.h"
|
#include "quote.h"
|
||||||
#include "quotearg.h"
|
#include "quotearg.h"
|
||||||
|
|
||||||
|
/* Likewise for struct iri definition */
|
||||||
|
#include "iri.h"
|
||||||
|
|
||||||
/* Useful macros used across the code: */
|
/* Useful macros used across the code: */
|
||||||
|
|
||||||
/* The number of elements in an array. For example:
|
/* The number of elements in an array. For example:
|
||||||
|
@ -1,3 +1,30 @@
|
|||||||
|
2008-08-14 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* Test-iri-list.px : Fetch files from a remote list.
|
||||||
|
|
||||||
|
2008-08-03 Xavier Saint <wget@sxav.eu>
|
||||||
|
|
||||||
|
* Test-iri.px : HTTP recursive fetch for testing IRI support and
|
||||||
|
fallback.
|
||||||
|
|
||||||
|
* Test-iri-disabled.px : Same file structure as Test-iri.px but with
|
||||||
|
IRI support disabled
|
||||||
|
|
||||||
|
* Test-iri-forced-remote.px : There's a difference between ISO-8859-1
|
||||||
|
and ISO-8859-15 for character 0xA4 (respectively currency sign and
|
||||||
|
euro sign). So with a forced ISO-8859-1 remote encoding, wget should
|
||||||
|
see 0xA4 as a currency sign and transcode it correctly in UTF-8 instead
|
||||||
|
of using the ISO-8859-15 given by the server.
|
||||||
|
|
||||||
|
* Test-ftp-iri.px : Give a file to fetch via FTP in a specific locale
|
||||||
|
and expect wget to fetch the file UTF-8 encoded.
|
||||||
|
|
||||||
|
* Test-ftp-iri-fallback.px : Same as above but wget should fall back on
|
||||||
|
locale encoding to fetch the file.
|
||||||
|
|
||||||
|
* Test-ftp-iri-disabled.px : Same as Test-ftp-iri.px but with IRI support
|
||||||
|
disabled. The UTF-8 encoded file should not be retrieved.
|
||||||
|
|
||||||
2008-06-22 Micah Cowan <micah@cowan.name>
|
2008-06-22 Micah Cowan <micah@cowan.name>
|
||||||
|
|
||||||
* Test-proxied-https-auth.px: Shift exit code so it falls in the
|
* Test-proxied-https-auth.px: Shift exit code so it falls in the
|
||||||
|
50
tests/Test-ftp-iri-disabled.px
Executable file
50
tests/Test-ftp-iri-disabled.px
Executable file
@ -0,0 +1,50 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use FTPTest;
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $ccedilla_l1 = "\xE7";
|
||||||
|
my $ccedilla_u8 = "\xC3\xA7";
|
||||||
|
|
||||||
|
my $francais = <<EOF;
|
||||||
|
Some text.
|
||||||
|
EOF
|
||||||
|
|
||||||
|
$francais =~ s/\n/\r\n/;
|
||||||
|
|
||||||
|
|
||||||
|
# code, msg, headers, content
|
||||||
|
my %urls = (
|
||||||
|
"/fran${ccedilla_u8}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
"/fran${ccedilla_l1}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
my $cmdline = $WgetTest::WGETPATH . " --iri=no --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
|
||||||
|
|
||||||
|
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
my %expected_downloaded_files = (
|
||||||
|
"fran${ccedilla_l1}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $the_test = FTPTest->new (name => "Test-ftp-iri",
|
||||||
|
input => \%urls,
|
||||||
|
cmdline => $cmdline,
|
||||||
|
errcode => $expected_error_code,
|
||||||
|
output => \%expected_downloaded_files);
|
||||||
|
exit $the_test->run();
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
46
tests/Test-ftp-iri-fallback.px
Executable file
46
tests/Test-ftp-iri-fallback.px
Executable file
@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use FTPTest;
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $ccedilla_l1 = "\xE7";
|
||||||
|
my $ccedilla_u8 = "\xC3\xA7";
|
||||||
|
|
||||||
|
my $francais = <<EOF;
|
||||||
|
Some text.
|
||||||
|
EOF
|
||||||
|
|
||||||
|
$francais =~ s/\n/\r\n/;
|
||||||
|
|
||||||
|
# code, msg, headers, content
|
||||||
|
my %urls = (
|
||||||
|
"/fran${ccedilla_l1}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
|
||||||
|
|
||||||
|
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
my %expected_downloaded_files = (
|
||||||
|
"fran${ccedilla_l1}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $the_test = FTPTest->new (name => "Test-ftp-iri",
|
||||||
|
input => \%urls,
|
||||||
|
cmdline => $cmdline,
|
||||||
|
errcode => $expected_error_code,
|
||||||
|
output => \%expected_downloaded_files);
|
||||||
|
exit $the_test->run();
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
47
tests/Test-ftp-iri.px
Executable file
47
tests/Test-ftp-iri.px
Executable file
@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use FTPTest;
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $ccedilla_l1 = "\xE7";
|
||||||
|
my $ccedilla_u8 = "\xC3\xA7";
|
||||||
|
|
||||||
|
my $francais = <<EOF;
|
||||||
|
Some text.
|
||||||
|
EOF
|
||||||
|
|
||||||
|
$francais =~ s/\n/\r\n/;
|
||||||
|
|
||||||
|
|
||||||
|
# code, msg, headers, content
|
||||||
|
my %urls = (
|
||||||
|
"/fran${ccedilla_u8}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
my $cmdline = $WgetTest::WGETPATH . " --locale=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
|
||||||
|
|
||||||
|
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
my %expected_downloaded_files = (
|
||||||
|
"fran${ccedilla_u8}ais.txt" => {
|
||||||
|
content => $francais,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $the_test = FTPTest->new (name => "Test-ftp-iri",
|
||||||
|
input => \%urls,
|
||||||
|
cmdline => $cmdline,
|
||||||
|
errcode => $expected_error_code,
|
||||||
|
output => \%expected_downloaded_files);
|
||||||
|
exit $the_test->run();
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
196
tests/Test-iri-disabled.px
Executable file
196
tests/Test-iri-disabled.px
Executable file
@ -0,0 +1,196 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use HTTPTest;
|
||||||
|
|
||||||
|
# cf. http://en.wikipedia.org/wiki/Latin1
|
||||||
|
# http://en.wikipedia.org/wiki/ISO-8859-15
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#
|
||||||
|
# mime : charset found in Content-Type HTTP MIME header
|
||||||
|
# meta : charset found in Content-Type meta tag
|
||||||
|
#
|
||||||
|
# index.html mime + file = iso-8859-15
|
||||||
|
# p1_français.html meta + file = iso-8859-1, mime = utf-8
|
||||||
|
# p2_één.html mime + file = iso-8859-1
|
||||||
|
# p3_€€€.html meta + file = utf-8, mime = iso-8859-1
|
||||||
|
#
|
||||||
|
|
||||||
|
my $ccedilla_l15 = "\xE7";
|
||||||
|
my $ccedilla_u8 = "\xC3\xA7";
|
||||||
|
my $eacute_l1 = "\xE9";
|
||||||
|
my $eacute_u8 = "\xC3\xA9";
|
||||||
|
my $eurosign_l15 = "\xA4";
|
||||||
|
my $eurosign_u8 = "\xE2\x82\xAC";
|
||||||
|
|
||||||
|
my $pageindex = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Main Page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en français</a>.
|
||||||
|
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pagefrancais = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>La seule page en français</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeen = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Die enkele nederlandstalige pagina</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Één is niet veel maar toch meer dan nul.<br/>
|
||||||
|
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeuro = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Euro page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
My tailor isn't rich anymore.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $page404 = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>404</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Nop nop nop...
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# code, msg, headers, content
|
||||||
|
my %urls = (
|
||||||
|
'/index.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=ISO-8859-15",
|
||||||
|
},
|
||||||
|
content => $pageindex,
|
||||||
|
},
|
||||||
|
'/robots.txt' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => "",
|
||||||
|
},
|
||||||
|
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
|
||||||
|
code => "200",
|
||||||
|
msg => "File not found",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=UTF-8",
|
||||||
|
},
|
||||||
|
content => $pagefrancais,
|
||||||
|
},
|
||||||
|
'/p1_fran%E7ais.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=UTF-8",
|
||||||
|
},
|
||||||
|
content => $pagefrancais,
|
||||||
|
},
|
||||||
|
'/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=UTF-8",
|
||||||
|
},
|
||||||
|
content => $pageeen,
|
||||||
|
},
|
||||||
|
'/p2_%E9%E9n.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=ISO-8859-1",
|
||||||
|
},
|
||||||
|
content => $pageeen,
|
||||||
|
},
|
||||||
|
'/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
'/p3_%A4%A4%A4.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
my $cmdline = $WgetTest::WGETPATH . " --iri=no -nH -r http://localhost:{{port}}/";
|
||||||
|
|
||||||
|
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
my %expected_downloaded_files = (
|
||||||
|
'index.html' => {
|
||||||
|
content => $pageindex,
|
||||||
|
},
|
||||||
|
'robots.txt' => {
|
||||||
|
content => "",
|
||||||
|
},
|
||||||
|
"p1_fran${ccedilla_l15}ais.html" => {
|
||||||
|
content => $pagefrancais,
|
||||||
|
},
|
||||||
|
"p2_${eacute_l1}${eacute_l1}n.html" => {
|
||||||
|
content => $pageeen,
|
||||||
|
},
|
||||||
|
"p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html" => {
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $the_test = HTTPTest->new (name => "Test-iri-disabled",
|
||||||
|
input => \%urls,
|
||||||
|
cmdline => $cmdline,
|
||||||
|
errcode => $expected_error_code,
|
||||||
|
output => \%expected_downloaded_files);
|
||||||
|
exit $the_test->run();
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
207
tests/Test-iri-forced-remote.px
Executable file
207
tests/Test-iri-forced-remote.px
Executable file
@ -0,0 +1,207 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use HTTPTest;
|
||||||
|
|
||||||
|
# cf. http://en.wikipedia.org/wiki/Latin1
|
||||||
|
# http://en.wikipedia.org/wiki/ISO-8859-15
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Force remote encoding to ISO-8859-1
|
||||||
|
#
|
||||||
|
# mime : charset found in Content-Type HTTP MIME header
|
||||||
|
# meta : charset found in Content-Type meta tag
|
||||||
|
#
|
||||||
|
# index.html mime + file = iso-8859-15
|
||||||
|
# p1_français.html meta + file = iso-8859-1, mime = utf-8
|
||||||
|
# p2_één.html mime + file = iso-8859-1
|
||||||
|
# p3_€€€.html meta + file = utf-8, mime = iso-8859-1
|
||||||
|
#
|
||||||
|
|
||||||
|
my $ccedilla_l15 = "\xE7";
|
||||||
|
my $ccedilla_u8 = "\xC3\xA7";
|
||||||
|
my $eacute_l1 = "\xE9";
|
||||||
|
my $eacute_u8 = "\xC3\xA9";
|
||||||
|
my $eurosign_l15 = "\xA4";
|
||||||
|
my $eurosign_u8 = "\xE2\x82\xAC";
|
||||||
|
my $currency_l1 = "\xA4";
|
||||||
|
my $currency_u8 = "\xC2\xA4";
|
||||||
|
|
||||||
|
my $pageindex = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Main Page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en français</a>.
|
||||||
|
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pagefrancais = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>La seule page en français</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeen = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Die enkele nederlandstalige pagina</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Één is niet veel maar toch meer dan nul.<br/>
|
||||||
|
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeuro = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Euro page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
My tailor isn't rich anymore.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $page404 = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>404</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Nop nop nop...
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# code, msg, headers, content
|
||||||
|
my %urls = (
|
||||||
|
'/index.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=ISO-8859-15",
|
||||||
|
},
|
||||||
|
content => $pageindex,
|
||||||
|
},
|
||||||
|
'/robots.txt' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => "",
|
||||||
|
},
|
||||||
|
'/p1_fran%C3%A7ais.html' => { # UTF-8 encoded
|
||||||
|
code => "404",
|
||||||
|
msg => "File not found",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=UTF-8",
|
||||||
|
},
|
||||||
|
content => $page404,
|
||||||
|
},
|
||||||
|
'/p1_fran%E7ais.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=UTF-8",
|
||||||
|
},
|
||||||
|
content => $pagefrancais,
|
||||||
|
},
|
||||||
|
'/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=UTF-8",
|
||||||
|
},
|
||||||
|
content => $pageeen,
|
||||||
|
},
|
||||||
|
'/p2_%E9%E9n.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/html; charset=ISO-8859-1",
|
||||||
|
},
|
||||||
|
content => $pageeen,
|
||||||
|
},
|
||||||
|
'/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => { # UTF-8 encoded
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
'/p3_%A4%A4%A4.html' => {
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
'/p3_%C2%A4%C2%A4%C2%A4.html' => { # UTF-8 encoded
|
||||||
|
code => "200",
|
||||||
|
msg => "Ok",
|
||||||
|
headers => {
|
||||||
|
"Content-type" => "text/plain",
|
||||||
|
},
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
my $cmdline = $WgetTest::WGETPATH . " --iri --remote-encoding=iso-8859-1 -nH -r http://localhost:{{port}}/";
|
||||||
|
|
||||||
|
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
my %expected_downloaded_files = (
|
||||||
|
'index.html' => {
|
||||||
|
content => $pageindex,
|
||||||
|
},
|
||||||
|
'robots.txt' => {
|
||||||
|
content => "",
|
||||||
|
},
|
||||||
|
"p1_fran${ccedilla_l15}ais.html" => {
|
||||||
|
content => $pagefrancais,
|
||||||
|
},
|
||||||
|
"p2_${eacute_u8}${eacute_u8}n.html" => {
|
||||||
|
content => $pageeen,
|
||||||
|
},
|
||||||
|
"p3_${currency_u8}${currency_u8}${currency_u8}.html" => {
|
||||||
|
content => $pageeuro,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
my $the_test = HTTPTest->new (name => "Test-iri-forced-remote",
|
||||||
|
input => \%urls,
|
||||||
|
cmdline => $cmdline,
|
||||||
|
errcode => $expected_error_code,
|
||||||
|
output => \%expected_downloaded_files);
|
||||||
|
exit $the_test->run();
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
173
tests/Test-iri-list.px
Executable file
173
tests/Test-iri-list.px
Executable file
@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use HTTPTest;
|
||||||
|
|
||||||
|
# cf. http://en.wikipedia.org/wiki/Latin1
|
||||||
|
# http://en.wikipedia.org/wiki/ISO-8859-15
|
||||||
|
###############################################################################
|
||||||
|
#
|
||||||
|
# mime : charset found in Content-Type HTTP MIME header
|
||||||
|
# meta : charset found in Content-Type meta tag
|
||||||
|
#
|
||||||
|
# index.html mime + file = iso-8859-15
|
||||||
|
# p1_français.html meta + file = iso-8859-1, mime = utf-8
|
||||||
|
# p2_één.html meta + file = utf-8, mime = iso-8859-1
|
||||||
|
#
|
||||||
|
|
||||||
|
# Raw byte strings for the accented characters used in URLs and file names:
# the _l1 variants are the single-byte form, the _u8 variants the UTF-8 bytes.
my ($ccedilla_l1, $ccedilla_u8) = ("\xE7", "\xC3\xA7");    # c with cedilla
my ($eacute_l1,   $eacute_u8)   = ("\xE9", "\xC3\xA9");    # e with acute
|
||||||
|
|
||||||
|
my $urllist = <<EOF;
|
||||||
|
http://localhost:{{port}}/
|
||||||
|
http://localhost:{{port}}/p1_fran${ccedilla_l1}ais.html
|
||||||
|
http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageindex = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Main Page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Main page.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pagefrancais = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>La seule page en français</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
French page.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeen = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Die enkele nederlandstalige pagina</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Dutch page.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $page404 = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>404</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Nop nop nop...
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Canned responses handed to HTTPTest as its input, keyed by request path.
# Each entry gives: code, msg, headers, content.
my %urls = (
    '/index.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=ISO-8859-15" },
        content => $pageindex,
    },
    '/robots.txt' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/plain" },
        content => "",
    },

    # UTF-8 encoded path: answered with a 404 page.
    '/p1_fran%C3%A7ais.html' => {
        code    => "404",
        msg     => "File not found",
        headers => { "Content-type" => "text/html; charset=UTF-8" },
        content => $page404,
    },
    '/p1_fran%E7ais.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=UTF-8" },
        content => $pagefrancais,
    },

    # UTF-8 encoded path.
    '/p2_%C3%A9%C3%A9n.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=ISO-8859-1" },
        content => $pageeen,
    },
    '/p2_%E9%E9n.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=ISO-8859-1" },
        content => $pageeen,
    },

    # The URL list that the command line passes to wget with -i.
    '/url_list.txt' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/plain; charset=ISO-8859-1" },
        content => $urllist,
    },
);
|
||||||
|
|
||||||
|
# Command under test: the wget binary followed by its options (--iri, -d)
# and -i pointing at the URL list served by the test server.
my $cmdline = join q{}, $WgetTest::WGETPATH,
    " --iri -d -i http://localhost:{{port}}/url_list.txt";

# Exit status the test expects from the command.
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
# Files the test expects to find after the run, keyed by file name; each
# entry gives the content the file must have.
my %expected_downloaded_files = (
    'url_list.txt'                      => { content => $urllist },
    'index.html'                        => { content => $pageindex },
    "p1_fran${ccedilla_l1}ais.html"     => { content => $pagefrancais },
    "p2_${eacute_u8}${eacute_u8}n.html" => { content => $pageeen },
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
# Assemble the test case from the pieces defined above and run it; the
# value returned by run() becomes the exit status of this script.
my $the_test = HTTPTest->new(
    name    => "Test-iri-list",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);

exit $the_test->run;
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
224
tests/Test-iri.px
Executable file
224
tests/Test-iri.px
Executable file
@ -0,0 +1,224 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use HTTPTest;
|
||||||
|
|
||||||
|
# cf. http://en.wikipedia.org/wiki/Latin1
|
||||||
|
# http://en.wikipedia.org/wiki/ISO-8859-15
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#
|
||||||
|
# mime : charset found in Content-Type HTTP MIME header
|
||||||
|
# meta : charset found in Content-Type meta tag
|
||||||
|
#
|
||||||
|
# index.html mime + file = iso-8859-15
|
||||||
|
# p1_français.html meta + file = iso-8859-1, mime = utf-8
|
||||||
|
# p2_één.html meta + file = utf-8, mime = iso-8859-1
|
||||||
|
# p3_€€€.html meta + file = utf-8, mime = iso-8859-1
|
||||||
|
# p4_méér.html mime + file = utf-8
|
||||||
|
#
|
||||||
|
|
||||||
|
# Raw byte strings for the non-ASCII characters used in URLs and file names:
# the _l1 / _l15 variants are the single-byte form, the _u8 variants the
# UTF-8 bytes.
my ($ccedilla_l15, $ccedilla_u8) = ("\xE7", "\xC3\xA7");        # c with cedilla
my ($eacute_l1,    $eacute_u8)   = ("\xE9", "\xC3\xA9");        # e with acute
my ($eurosign_l15, $eurosign_u8) = ("\xA4", "\xE2\x82\xAC");    # euro sign
|
||||||
|
|
||||||
|
my $pageindex = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Main Page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Link to page 1 <a href="http://localhost:{{port}}/p1_fran${ccedilla_l15}ais.html">La seule page en français</a>.
|
||||||
|
Link to page 3 <a href="http://localhost:{{port}}/p3_${eurosign_l15}${eurosign_l15}${eurosign_l15}.html">My tailor is rich</a>.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pagefrancais = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>La seule page en français</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Link to page 2 <a href="http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html">Die enkele nerderlangstalige pagina</a>.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeen = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Die enkele nederlandstalige pagina</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Één is niet veel maar toch meer dan nul.<br/>
|
||||||
|
Nerdelands is een mooie taal... dit zin stuckje spreekt vanzelf, of niet :)<br/>
|
||||||
|
<a href="http://localhost:{{port}}/p4_m${eacute_u8}${eacute_u8}r.html">Méér</a>
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pageeuro = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Euro page</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
My tailor isn't rich anymore.
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $pagemeer = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Bekende supermarkt</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Ik ben toch niet gek !
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my $page404 = <<EOF;
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>404</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
Nop nop nop...
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Canned responses handed to HTTPTest as its input, keyed by request path.
# Each entry gives: code, msg, headers, content.
my %urls = (
    '/index.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=ISO-8859-15" },
        content => $pageindex,
    },
    '/robots.txt' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/plain" },
        content => "",
    },

    # UTF-8 encoded path: answered with a 404 page.
    '/p1_fran%C3%A7ais.html' => {
        code    => "404",
        msg     => "File not found",
        headers => { "Content-type" => "text/html; charset=UTF-8" },
        content => $page404,
    },
    '/p1_fran%E7ais.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=UTF-8" },
        content => $pagefrancais,
    },

    # UTF-8 encoded path.
    '/p2_%C3%A9%C3%A9n.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=ISO-8859-1" },
        content => $pageeen,
    },
    '/p2_%E9%E9n.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/html; charset=ISO-8859-1" },
        content => $pageeen,
    },

    # UTF-8 encoded path.
    '/p3_%E2%82%AC%E2%82%AC%E2%82%AC.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/plain; charset=ISO-8859-1" },
        content => $pageeuro,
    },
    '/p3_%A4%A4%A4.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/plain; charset=ISO-8859-1" },
        content => $pageeuro,
    },
    '/p4_m%C3%A9%C3%A9r.html' => {
        code    => "200",
        msg     => "Ok",
        headers => { "Content-type" => "text/plain; charset=UTF-8" },
        content => $pagemeer,
    },
);
|
||||||
|
|
||||||
|
# Command under test: the wget binary followed by its options (--iri,
# --restrict-file-names=nocontrol, -nH, -r) and the root URL of the test
# server.
my $cmdline = join q{}, $WgetTest::WGETPATH,
    " --iri --restrict-file-names=nocontrol -nH -r http://localhost:{{port}}/";

# Exit status the test expects from the command.
my $expected_error_code = 0;
|
||||||
|
|
||||||
|
# Files the test expects to find after the run, keyed by file name; each
# entry gives the content the file must have.
my %expected_downloaded_files = (
    'index.html'                                         => { content => $pageindex },
    'robots.txt'                                         => { content => "" },
    "p1_fran${ccedilla_l15}ais.html"                     => { content => $pagefrancais },
    "p2_${eacute_u8}${eacute_u8}n.html"                  => { content => $pageeen },
    "p3_${eurosign_u8}${eurosign_u8}${eurosign_u8}.html" => { content => $pageeuro },
    "p4_m${eacute_u8}${eacute_u8}r.html"                 => { content => $pagemeer },
);
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
# Assemble the test case from the pieces defined above and run it; the
# value returned by run() becomes the exit status of this script.
my $the_test = HTTPTest->new(
    name    => "Test-iri",
    input   => \%urls,
    cmdline => $cmdline,
    errcode => $expected_error_code,
    output  => \%expected_downloaded_files,
);

exit $the_test->run;
|
||||||
|
|
||||||
|
# vim: et ts=4 sw=4
|
||||||
|
|
@ -17,9 +17,16 @@ my @tests = (
|
|||||||
'Test-E-k-K.px',
|
'Test-E-k-K.px',
|
||||||
'Test-E-k.px',
|
'Test-E-k.px',
|
||||||
'Test-ftp.px',
|
'Test-ftp.px',
|
||||||
|
'Test-ftp-iri.px',
|
||||||
|
'Test-ftp-iri-fallback.px',
|
||||||
|
'Test-ftp-iri-disabled.px',
|
||||||
'Test-HTTP-Content-Disposition-1.px',
|
'Test-HTTP-Content-Disposition-1.px',
|
||||||
'Test-HTTP-Content-Disposition-2.px',
|
'Test-HTTP-Content-Disposition-2.px',
|
||||||
'Test-HTTP-Content-Disposition.px',
|
'Test-HTTP-Content-Disposition.px',
|
||||||
|
'Test-iri.px',
|
||||||
|
'Test-iri-disabled.px',
|
||||||
|
'Test-iri-forced-remote.px',
|
||||||
|
'Test-iri-list.px',
|
||||||
'Test-N-current.px',
|
'Test-N-current.px',
|
||||||
'Test-N-smaller.px',
|
'Test-N-smaller.px',
|
||||||
'Test-N-no-info.px',
|
'Test-N-no-info.px',
|
||||||
|
Loading…
Reference in New Issue
Block a user