Automated merge.

This commit is contained in:
Saint Xavier 2008-07-24 00:58:10 +02:00
commit ccd62071dc
27 changed files with 899 additions and 100 deletions

View File

@ -4,6 +4,14 @@
* AUTHORS: Added Steven Schubiger.
2008-06-26 Xavier Saint <wget@sxav.eu>
* configure.ac : IRI support requires libiconv; check for it.
2008-06-14 Xavier Saint <wget@sxav.eu>
* configure.ac: Add support for IRIs
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).

View File

@ -460,6 +460,77 @@ else
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
dnl
dnl Check for IDN/IRIs
dnl
AC_ARG_ENABLE(iri,
AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
[case "${enable_iri}" in
no)
dnl Disable IRIs checking
AC_MSG_NOTICE([disabling IRIs at user request])
iri=no
;;
yes)
dnl IRIs explicitly enabled
iri=yes
force_iri=yes
;;
auto)
dnl Auto-detect IRI
iri=yes
;;
*)
AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
;;
esac
], [
dnl If nothing is specified, assume auto-detection
iri=yes
]
)
AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
[Support IDN/IRIs (needs GNU Libidn)]),
libidn=$withval, libidn="")
if test "X$iri" != "Xno"; then
AM_ICONV
if test "X$am_cv_func_iconv" != "Xyes"; then
iri=no
if test "X$force_iri" = "Xyes"; then
AC_MSG_ERROR([Libiconv is required for IRIs support])
else
AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
fi
fi
fi
if test "X$iri" != "Xno"; then
if test "$libidn" != ""; then
LDFLAGS="${LDFLAGS} -L$libidn/lib"
CPPFLAGS="${CPPFLAGS} -I$libidn/include"
fi
AC_CHECK_HEADER(idna.h,
AC_CHECK_LIB(idn, stringprep_check_version,
[iri=yes LIBS="${LIBS} -lidn"], iri=no),
iri=no)
if test "X$iri" != "Xno" ; then
AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
AC_MSG_NOTICE([Enabling support for IRI.])
else
AC_MSG_WARN([Libidn not found])
fi
fi
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
dnl
dnl Create output
dnl

View File

@ -7,11 +7,27 @@
* init.c (cleanup): Free the memory associated with the base
option (when DEBUG_MALLOC is defined).
2008-07-02 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function idn_decode() to decode ASCII
encoded hostname to the locale.
* host.c : Show hostname to be resolved both in locale and
ASCII encoded.
2008-06-28 Steven Schubiger <stsc@members.fsf.org>
* retr.c (retrieve_from_file): Allow for reading the links from
an external file (HTTP/FTP).
2008-06-26 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New functions locale_to_utf8() and
idn_encode() adding basic capabilities of IRI/IDN.
* url.c : Convert URLs from locale to UTF-8 allowing a basic
support of IRI/IDN
2008-06-25 Steven Schubiger <stsc@members.fsf.org>
* ftp.c (getftp): When spidering a FTP URL, emit a diagnostic
@ -36,7 +52,7 @@
* http.c: Make -nv --spider include the file's name when it
exists.
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
@ -44,12 +60,57 @@
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New function check_encoding_name() as
a preliminary encoding name check.
* main.c, iri.c : Make use of check_encoding_name().
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c : Include missing stringprep.h file and add a
cast.
* init.c : set a default initial value for opt.enable_iri,
opt.locale and opt.encoding_remote.
2008-06-19 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : Add a new function find_locale() to find
out the local system encoding.
* main.c : Make use of find_locale().
2008-06-19 Xavier Saint <wget@sxav.eu>
* html-url.c : Add "content-type" meta tag parsing for
retrieving page encoding.
* iri.h : Make no-op version of parse_charset() return
NULL.
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
2008-06-14 Xavier Saint <wget@sxav.eu>
* iri.c, iri.h : New files.
* Makefile.am : Add files iri.h and conditional iri.c.
* build_info.c : Add compiled feature "iri".
* http.c : include iri.h and parse charset from Content-Type
header.
* init.c, main.c, options.h : if an option isn't supported
at compile time, don't get rid of it; show a dummy
message instead if it is used.
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
@ -93,11 +154,11 @@
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
@ -136,7 +197,7 @@
* options.h: Add an according boolean member to the options
struct.
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.

View File

@ -30,6 +30,10 @@
# Version: @VERSION@
#
if IRI_IS_ENABLED
IRI_OBJ = iri.c
endif
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
@ -40,8 +44,8 @@ wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
utils.c \
css-url.h connect.h convert.h cookies.h \
utils.c $(IRI_OBJ) \
css-url.h connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \

View File

@ -100,6 +100,13 @@ const char* (compiled_features[]) =
#else
"-gettext",
#endif
#ifdef ENABLE_IRI
"+iri",
#else
"-iri",
#endif
/* sentinel value */
NULL
};

View File

@ -58,6 +58,7 @@ as that of the covered work. */
#include "host.h"
#include "connect.h"
#include "hash.h"
#include "iri.h"
/* Define sockaddr_storage where unavailable (presumably on IPv4-only
hosts). */
@ -266,9 +267,25 @@ connect_to_ip (const ip_address *ip, int port, const char *print)
if (print)
{
const char *txt_addr = print_address (ip);
if (print && 0 != strcmp (print, txt_addr))
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
escnonprint_uri (print), txt_addr, port);
if (0 != strcmp (print, txt_addr))
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) print)) != NULL)
{
int len = strlen (print) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, print);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Connecting to %s|%s|:%d... "),
str ? str : escnonprint_uri (print), txt_addr, port);
if (str)
xfree (str);
}
else
logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port);
}

View File

@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
/* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, NULL);
get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the

View File

@ -68,7 +68,7 @@ ftp_response (int fd, char **ret_line)
return FTPRERR;
/* Strip trailing CRLF before printing the line, so that
escnonprint doesn't include bogus \012 and \015. */
quoting doesn't include bogus \012 and \015. */
p = strchr (line, '\0');
if (p > line && p[-1] == '\n')
*--p = '\0';

View File

@ -53,6 +53,7 @@ as that of the covered work. */
#include "host.h"
#include "url.h"
#include "hash.h"
#include "iri.h"
#ifndef NO_ADDRESS
# define NO_ADDRESS NO_DATA
@ -712,8 +713,24 @@ lookup_host (const char *host, int flags)
/* No luck with the cache; resolve HOST. */
if (!silent && !numeric_address)
logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, host));
{
char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
{
int len = strlen (host) + strlen (name) + 4;
str = xmalloc (len);
snprintf (str, len, "%s (%s)", name, host);
str[len-1] = '\0';
xfree (name);
}
logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, str ? str : host));
if (str)
xfree (str);
}
#ifdef ENABLE_IPV6
{

View File

@ -174,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
/* Will contain the (last) charset found in 'http-equiv=content-type'
meta tags */
static char *meta_charset;
static void
init_interesting (void)
{
@ -284,7 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL;
}
url = url_parse (link_uri, NULL);
url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -303,7 +307,7 @@ append_url (const char *link_uri, int position, int size,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
url = url_parse (complete_uri, NULL);
url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -553,6 +557,24 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
entry->link_expect_html = 1;
}
}
else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
{
/* Handle stuff like:
<meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
char *mcharset;
char *content = find_attr (tag, "content", NULL);
if (!content)
return;
mcharset = parse_charset (content);
if (!mcharset)
return;
/*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
xfree_null (meta_charset);
meta_charset = mcharset;
}
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
@ -617,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
<base href=...> and does the right thing. */
struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
@ -657,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
/* If meta charset isn't null, override content encoding */
if (iri && meta_charset)
set_content_encoding (iri, meta_charset);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
@ -726,7 +753,7 @@ get_urls_file (const char *file)
url_text = merged;
}
url = url_parse (url_text, &up_error_code);
url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),

View File

@ -44,7 +44,7 @@ struct map_context {
};
struct urlpos *get_urls_file (const char *);
struct urlpos *get_urls_html (const char *, const char *, bool *);
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *);

View File

@ -1364,7 +1364,8 @@ free_hstat (struct http_stat *hs)
If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */
static uerr_t
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
struct iri *iri)
{
struct request *req;
@ -1827,7 +1828,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
hs->local_file = url_file_name (u);
}
}
/* TODO: perform this check only once. */
if (!hs->existence_checked && file_exists_p (hs->local_file))
{
@ -1896,7 +1897,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
}
}
if (!local_dot_orig_file_exists)
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
@ -2048,9 +2049,20 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
char *tmp = strchr (type, ';');
if (tmp)
{
/* sXXXav: only needed if IRI support is enabled */
char *tmp2 = tmp + 1;
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
{
tmp = parse_charset (tmp2);
if (tmp)
set_content_encoding (iri, tmp);
}
}
}
hs->newloc = resp_header_strdup (resp, "Location");
@ -2325,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
retried, and retried, and retried, and... */
uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
int *dt, struct url *proxy)
int *dt, struct url *proxy, struct iri *iri)
{
int count;
bool got_head = false; /* used for time-stamping and filename detection */
@ -2336,16 +2348,16 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
uerr_t err, ret = TRYLIMEXC;
time_t tmr = -1; /* remote time-stamp */
struct http_stat hstat; /* HTTP status */
struct_stat st;
struct_stat st;
bool send_head_first = true;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
/* Set LOCAL_FILE parameter. */
if (local_file && opt.output_document)
*local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
/* Reset NEWLOC parameter. */
*newloc = NULL;
@ -2382,7 +2394,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
retrieve the file. But if the output_document was given, then this
test was already done and the file didn't exist. Hence the !opt.output_document */
logprintf (LOG_VERBOSE, _("\
File %s already there; not retrieving.\n\n"),
File %s already there; not retrieving.\n\n"),
quote (hstat.local_file));
/* If the file is there, we suppose it's retrieved OK. */
*dt |= RETROKF;
@ -2398,10 +2410,10 @@ File %s already there; not retrieving.\n\n"),
/* Reset the counter. */
count = 0;
/* Reset the document type. */
*dt = 0;
/* Skip preliminary HEAD request if we're not in spider mode AND
* if -O was given or HTTP Content-Disposition support is disabled. */
if (!opt.spider
@ -2410,21 +2422,21 @@ File %s already there; not retrieving.\n\n"),
/* Send preliminary HEAD request if -N is given and we have an existing
* destination file. */
if (opt.timestamping
if (opt.timestamping
&& !opt.content_disposition
&& file_exists_p (url_file_name (u)))
send_head_first = true;
/* THE loop */
do
{
/* Increment the pass counter. */
++count;
sleep_between_retrievals (count);
/* Get the current time string. */
tms = datetime_str (time (NULL));
if (opt.spider && !got_head)
logprintf (LOG_VERBOSE, _("\
Spider mode enabled. Check if remote file exists.\n"));
@ -2433,20 +2445,20 @@ Spider mode enabled. Check if remote file exists.\n"));
if (opt.verbose)
{
char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
if (count > 1)
if (count > 1)
{
char tmp[256];
sprintf (tmp, _("(try:%2d)"), count);
logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
tms, tmp, hurl);
}
else
else
{
logprintf (LOG_NOTQUIET, "--%s-- %s\n",
tms, hurl);
}
#ifdef WINDOWS
ws_changetitle (hurl);
#endif
@ -2456,7 +2468,7 @@ Spider mode enabled. Check if remote file exists.\n"));
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
if (send_head_first && !got_head)
if (send_head_first && !got_head)
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
@ -2489,11 +2501,11 @@ Spider mode enabled. Check if remote file exists.\n"));
*dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */
err = gethttp (u, &hstat, dt, proxy);
err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */
tms = datetime_str (time (NULL));
/* Get the new location (with or without the redirection). */
if (hstat.newloc)
*newloc = xstrdup (hstat.newloc);
@ -2532,7 +2544,7 @@ Spider mode enabled. Check if remote file exists.\n"));
hstat.statcode);
ret = WRONGCODE;
}
else
else
{
ret = NEWLOCATION;
}
@ -2548,7 +2560,7 @@ Spider mode enabled. Check if remote file exists.\n"));
/* All possibilities should have been exhausted. */
abort ();
}
if (!(*dt & RETROKF))
{
char *hurl = NULL;
@ -2567,11 +2579,13 @@ Spider mode enabled. Check if remote file exists.\n"));
continue;
}
/* Maybe we should always keep track of broken links, not just in
* spider mode. */
else if (opt.spider)
* spider mode.
* Don't log error if it was UTF-8 encoded because we will try
* once unencoded. */
else if (opt.spider && !iri->utf8_encode)
{
/* #### Again: ugly ugly ugly! */
if (!hurl)
if (!hurl)
hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
nonexisting_url (hurl);
logprintf (LOG_NOTQUIET, _("\
@ -2580,7 +2594,7 @@ Remote file does not exist -- broken link!!!\n"));
else
{
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
tms, hstat.statcode,
tms, hstat.statcode,
quotearg_style (escape_quoting_style, hstat.error));
}
logputs (LOG_VERBOSE, "\n");

View File

@ -33,7 +33,7 @@ as that of the covered work. */
struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
struct url *);
struct url *, struct iri *);
void save_cookies (void);
void http_cleanup (void);
time_t http_atotm (const char *);

View File

@ -181,9 +181,11 @@ static const struct {
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
{ "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
{ "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
@ -223,6 +225,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
@ -330,6 +333,14 @@ defaults (void)
opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20;
#ifdef ENABLE_IRI
opt.enable_iri = true;
#else
opt.enable_iri = false;
#endif
opt.locale = NULL;
opt.encoding_remote = NULL;
}
/* Return the user's home directory (strdup-ed), or NULL if none is

362
src/iri.c Normal file
View File

@ -0,0 +1,362 @@
/* IRI related functions.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <iconv.h>
#include <stringprep.h>
#include <idna.h>
#include <errno.h>
#include "utils.h"
#include "iri.h"
/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
/* Note: locale encoding is kept in options struct (opt.locale) */
static iconv_t locale2utf8;
static bool open_locale_to_utf8 (void);
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
/* Extract the value of a "charset=XXX" parameter from STR.

   The "charset=" key is matched case-insensitively and the value runs up
   to the next whitespace character or the end of STR.  Returns a copy of
   the value that the caller must xfree, or NULL when STR is NULL or empty,
   contains no "charset=" key, or the value is rejected by
   check_encoding_name().  */
char *
parse_charset (char *str)
{
  static const char key[] = "charset=";
  char *start, *end, *name;

  if (str == NULL || *str == '\0')
    return NULL;

  start = strcasestr (str, key);
  if (start == NULL)
    return NULL;

  /* Step over the key itself; the value begins right after the '='.  */
  start += sizeof key - 1;

  /* sXXXav: which chars should be banned ??? */
  for (end = start; *end != '\0' && !c_isspace (*end); end++)
    ;

  /* sXXXav: could strdupdelim return NULL ? */
  name = strdupdelim (start, end);

  /* Do a minimum check on the charset value */
  if (check_encoding_name (name))
    return name;

  xfree (name);
  return NULL;
}
/* Return the name of the character set used by the current locale, as
   reported by libidn's stringprep_locale_charset().  The result is handed
   back as a plain `char *' only to fit the type of opt.locale.  */
char *
find_locale (void)
{
  const char *charset = stringprep_locale_charset ();

  return (char *) charset;
}
/* Basic sanity check of an encoding name.

   ENCODING is accepted when every character is ASCII and none of them is
   whitespace (an empty name therefore passes).  On the first offending
   character a verbose-level message is logged and false is returned.  */
bool
check_encoding_name (char *encoding)
{
  char *p;

  for (p = encoding; *p != '\0'; p++)
    {
      if (c_isascii (*p) && !c_isspace (*p))
        continue;

      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
      return false;
    }

  return true;
}
/* Make sure the file-level descriptor locale2utf8 (locale -> UTF-8) is
   open.

   The descriptor is opened once and then reused.  If opt.locale is unset
   it is filled in from find_locale() first.  Returns true when a usable
   descriptor is available, false when no locale name could be determined
   or iconv does not support the conversion; in the latter case
   locale2utf8 is left NULL, so the next call tries again.  */
static bool
open_locale_to_utf8 (void)
{
  iconv_t cd;

  /* Already opened by an earlier call.  */
  if (locale2utf8)
    return true;

  /* sXXXav : That shouldn't happen, just in case */
  if (!opt.locale)
    {
      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
      opt.locale = find_locale ();
      if (!opt.locale)
        return false;
    }

  cd = iconv_open ("UTF-8", opt.locale);
  if (cd == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
                 quote (opt.locale), quote ("UTF-8"));
      locale2utf8 = NULL;
      return false;
    }

  locale2utf8 = cd;
  return true;
}
/* Try converting string STR from the locale encoding to UTF-8.

   Returns a newly allocated string on success.  Returns STR itself when
   the locale already is UTF-8, when no conversion descriptor can be
   opened, or when the conversion fails; callers can therefore compare the
   result against STR to learn whether they own a new string.  */
const char *
locale_to_utf8 (const char *str)
{
  char *converted;

  /* opt.locale can still be unset at this point; open_locale_to_utf8()
     deals with that case, so only compare when there is a name.  */
  if (opt.locale && !strcasecmp (opt.locale, "utf-8"))
    return str;

  if (!open_locale_to_utf8 ())
    return str;

  if (do_conversion (locale2utf8, (char *) str, strlen (str), &converted))
    return (const char *) converted;

  return str;
}
/* Transcode the INLEN bytes at IN with the conversion descriptor CD into
   a newly allocated, NUL-terminated buffer.

   The output buffer starts at twice the input size and is enlarged each
   time iconv reports that it is full.  Input bytes that iconv rejects as
   an invalid or incomplete multibyte sequence are copied through
   unchanged; a message is logged for the first one only.

   Returns true on success, with *OUT set to the converted string (the
   caller must xfree it).  Returns false if iconv fails with any other
   error.  Callers should not rely on the content of *OUT in that case,
   but it is left pointing at the start of the buffer, NUL-terminated
   after whatever was converted so far, so that it never refers to
   unterminated or unfreeable memory.  */
static bool
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
{
  /* Smallest amount the buffer grows by, so that every enlargement is a
     real step forward even when very little input is left.  */
  enum { MIN_GROWTH = 16 };

  /* sXXXav : hummm hard to guess... */
  size_t size = inlen * 2;      /* usable bytes in BUF, NUL slot excluded */
  size_t outlen = size;         /* bytes of BUF that are still free */
  bool warned = false;
  char *buf = xmalloc (size + 1);
  int err;

  /* Invariant for the whole loop: *out == buf + (size - outlen).  */
  *out = buf;

  for (;;)
    {
      if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
        {
          buf[size - outlen] = '\0';
          *out = buf;
          return true;
        }

      /* Remember errno now: the logging calls below may change it.  */
      err = errno;

      if ((err == EINVAL || err == EILSEQ) && inlen > 0 && outlen > 0)
        {
          /* Incomplete or invalid multibyte sequence: pass the offending
             byte through untouched and carry on after it.  */
          if (!warned)
            logprintf (LOG_VERBOSE,
                       "Incomplete or invalide multibyte sequence encountered\n");
          warned = true;
          *(*out)++ = *in++;
          inlen--;
          outlen--;
        }
      else if (err == E2BIG
               || ((err == EINVAL || err == EILSEQ) && inlen > 0))
        {
          /* Output buffer full, or no room left for the byte we would
             pass through.  iconv may stop with a few bytes still free, so
             the amount really used is computed rather than assumed.  */
          size_t used = size - outlen;
          size_t extra = inlen * 2 > MIN_GROWTH ? inlen * 2 : MIN_GROWTH;
          char *bigger = xmalloc (size + extra + 1);

          memcpy (bigger, buf, used);
          xfree (buf);
          buf = bigger;
          size += extra;
          outlen += extra;
          *out = buf + used;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
          break;
        }
    }

  buf[size - outlen] = '\0';
  *out = buf;
  return false;
}
/* Try to "ASCII encode" (IDNA ToASCII) the host name HOST.

   When I->utf8_encode is false, HOST is first transcoded to UTF-8 with
   remote_to_utf8(); if that yields nothing (no conversion needed or an
   error occurred) NULL is returned.  Otherwise HOST is taken to be UTF-8
   already.

   Returns the encoded domain, allocated by libidn, on success, or NULL on
   error (the libidn error is logged at verbose level).  */
char *
idn_encode (struct iri *i, char *host)
{
  char *utf8_host = NULL;       /* temporary UTF-8 copy of HOST, if any */
  char *ascii = NULL;
  int ret;

  /* Encode to UTF-8 if not done */
  if (!i->utf8_encode)
    {
      if (!remote_to_utf8 (i, (const char *) host,
                           (const char **) &utf8_host))
        {
          /* Nothing to encode or an error occured */
          return NULL;
        }
      host = utf8_host;
    }

  /* toASCII UTF-8 NULL terminated string */
  ret = idna_to_ascii_8z (host, &ascii, IDNA_FLAGS);

  /* The transcoded copy was only needed as input to libidn; release it
     on both the success and the failure path.  */
  xfree_null (utf8_host);

  if (ret != IDNA_SUCCESS)
    {
      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
                 quote (idna_strerror (ret)));
      return NULL;
    }

  return ascii;
}
/* Decode the "ASCII encoded" host name HOST with libidn's
   idna_to_unicode_8zlz().  Returns the decoded domain on success; on
   failure the libidn error is logged at verbose level and NULL is
   returned.  */
char *
idn_decode (char *host)
{
  char *decoded;
  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);

  if (rc == IDNA_SUCCESS)
    return decoded;

  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", rc,
             quote (idna_strerror (rc)));
  return NULL;
}
/* Try to transcode string STR from the remote encoding to UTF-8.

   The remote encoding is opt.encoding_remote when that is set, otherwise
   I->uri_encoding.  Returns true, with a newly allocated string stored in
   *NEW (the caller must xfree it), only when the conversion succeeded and
   its result differs from STR.  Returns false when no encoding is known,
   when iconv cannot do the conversion, when the conversion fails, or when
   the result is identical to STR; *NEW must not be used in those cases.  */
bool
remote_to_utf8 (struct iri *i, const char *str, const char **new)
{
  char *encoding;
  iconv_t cd;
  bool converted;

  if (opt.encoding_remote)
    encoding = opt.encoding_remote;
  else if (i->uri_encoding)
    encoding = i->uri_encoding;
  else
    return false;

  cd = iconv_open ("UTF-8", encoding);
  if (cd == (iconv_t)(-1))
    return false;

  converted = do_conversion (cd, (char *) str, strlen (str), (char **) new);
  iconv_close (cd);

  /* do_conversion() leaves *NEW unspecified when it fails, so it must be
     neither compared nor freed in that case.  */
  if (!converted)
    return false;

  /* Test if something was converted */
  if (!strcmp (str, *new))
    {
      xfree ((char *) *new);
      *new = NULL;
      return false;
    }

  return true;
}
/* Allocate and initialize a new struct iri.

   uri_encoding starts as a copy of opt.encoding_remote (NULL when that
   option is unset), content_encoding starts as NULL, and utf8_encode
   mirrors opt.enable_iri.  Release the result with iri_free().  */
struct iri *
iri_new (void)
{
  struct iri *i = xmalloc (sizeof (struct iri));

  i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
  i->content_encoding = NULL;
  i->utf8_encode = opt.enable_iri;

  /* Hand the object back; falling off the end of a non-void function
     leaves the caller with an indeterminate pointer.  */
  return i;
}
/* Release I together with the two encoding names it owns; either name
   may be NULL.  */
void
iri_free (struct iri *i)
{
  xfree_null (i->content_encoding);
  xfree_null (i->uri_encoding);
  xfree (i);
}
/* Record CHARSET as the encoding of the URI described by I.

   Nothing is changed when opt.encoding_remote is set, or when CHARSET
   equals the current value (compared case-insensitively).  Otherwise the
   previous value is freed and replaced by a copy of CHARSET; a NULL
   CHARSET clears the field.  */
void
set_uri_encoding (struct iri *i, char *charset)
{
  /* Passing NULL for a %s conversion is undefined, so log a placeholder.  */
  logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset ? charset : "(null)");

  if (opt.encoding_remote)
    return;

  if (i->uri_encoding)
    {
      /* CHARSET may be NULL (see the assignment below), so guard the
         comparison.  */
      if (charset && !strcasecmp (i->uri_encoding, charset))
        return;
      xfree (i->uri_encoding);
    }

  i->uri_encoding = charset ? xstrdup (charset) : NULL;
}
/* Record CHARSET as the encoding of the content (and so of the links
   inside it) fetched for I.

   Nothing is changed when opt.encoding_remote is set, or when CHARSET
   equals the current value (compared case-insensitively).  Otherwise the
   previous value is freed and replaced by a copy of CHARSET; a NULL
   CHARSET clears the field.  */
void
set_content_encoding (struct iri *i, char *charset)
{
  /* Passing NULL for a %s conversion is undefined, so log a placeholder.  */
  logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset ? charset : "(null)");

  if (opt.encoding_remote)
    return;

  if (i->content_encoding)
    {
      /* CHARSET may be NULL (see the assignment below), so guard the
         comparison.  */
      if (charset && !strcasecmp (i->content_encoding, charset))
        return;
      xfree (i->content_encoding);
    }

  i->content_encoding = charset ? xstrdup (charset) : NULL;
}

70
src/iri.h Normal file
View File

@ -0,0 +1,70 @@
/* Internationalization related declarations.
Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
#ifndef IRI_H
#define IRI_H
/* Character-encoding state attached to a URL being retrieved.  */
struct iri {
  char *uri_encoding;      /* Encoding of the uri to fetch */
  char *content_encoding;  /* Encoding of links inside the fetched file */
  bool utf8_encode;        /* Will/Is the current url encoded in utf8 */
};

#ifdef ENABLE_IRI

/* Real implementations, provided by iri.c.  */
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
char *idn_encode (struct iri *i, char *host);
char *idn_decode (char *host);
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
struct iri *iri_new (void);
void iri_free (struct iri *i);
void set_uri_encoding (struct iri *i, char *charset);
void set_content_encoding (struct iri *i, char *charset);

#else /* ENABLE_IRI */

/* IRI support is compiled out: every entry point becomes a no-op macro
   taking the same number of arguments as the real function, so callers
   build unchanged either way.  */

/* Shared placeholder handed out by iri_new().
   NOTE(review): this is a tentative definition in a header, so every file
   including it defines the object; linking then relies on common-symbol
   merging (e.g. GCC's -fcommon).  Consider an `extern' declaration here
   plus a single definition in one .c file.  */
struct iri dummy_iri;

#define parse_charset(str)          NULL
#define find_locale()               NULL
#define check_encoding_name(str)    false
#define locale_to_utf8(str)         (str)
#define idn_encode(a,b)             NULL
#define idn_decode(str)             NULL
#define remote_to_utf8(a,b,c)       false
#define iri_new()                   (&dummy_iri)
#define iri_free(a)
#define set_uri_encoding(a,b)
#define set_content_encoding(a,b)

#endif /* ENABLE_IRI */
#endif /* IRI_H */

View File

@ -43,7 +43,7 @@ as that of the covered work. */
#include "utils.h"
#include "log.h"
/* This file impplement support for "logging". Logging means printing
/* This file implements support for "logging". Logging means printing
output, plus several additional features:
- Cataloguing output by importance. You can specify that a log

View File

@ -43,6 +43,9 @@ as that of the covered work. */
#include <assert.h>
#include <errno.h>
#include <time.h>
#ifdef ENABLE_IRI
#include <langinfo.h>
#endif
#include "utils.h"
#include "init.h"
@ -200,10 +203,12 @@ static struct cmdline_option option_data[] =
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
{ "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
@ -237,6 +242,7 @@ static struct cmdline_option option_data[] =
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
@ -1058,6 +1064,29 @@ for details.\n\n"));
exit (1);
}
#ifdef ENABLE_IRI
if (opt.enable_iri)
{
if (opt.locale && !check_encoding_name (opt.locale))
opt.locale = NULL;
if (!opt.locale)
opt.locale = find_locale ();
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
opt.encoding_remote = NULL;
/*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
}
#else
if (opt.enable_iri || opt.locale || opt.encoding_remote)
{
/* sXXXav : be more specific... */
printf(_("This version does not have support for IRIs\n"));
exit(1);
}
#endif
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
@ -1167,15 +1196,21 @@ WARNING: Can't reopen standard output in binary mode;\n\
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
if (url_scheme (*t) == SCHEME_FTP)
if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
status = retrieve_tree (*t);
opt.follow_ftp = old_follow_ftp;
}
else
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
{
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale);
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
opt.recursive, i);
iri_free (i);
}
if (opt.delete_after && file_exists_p(filename))
{

View File

@ -237,6 +237,10 @@ struct options
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
bool enable_iri; /* Whether IRI handling is switched on. */
char *encoding_remote; /* Remote encoding name; main clears it when
   check_encoding_name rejects it. */
char *locale; /* Local encoding name; main replaces a missing or
   rejected value with find_locale (). */
};
extern struct options opt;

View File

@ -51,7 +51,8 @@ as that of the covered work. */
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
#include "iri.h"
/* Functions for maintaining the URL queue. */
struct queue_element {
@ -60,6 +61,7 @@ struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
struct iri *iri; /* sXXXav: IRI state stored with the URL by
   url_enqueue and handed back by
   url_dequeue. */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
@ -93,11 +95,12 @@ url_queue_delete (struct url_queue *queue)
into it. */
static void
url_enqueue (struct url_queue *queue,
url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
@ -112,6 +115,9 @@ url_enqueue (struct url_queue *queue,
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
/* Trace line written straight to stdout whenever IRI state is attached.
   NOTE(review): this bypasses the DEBUGP calls used just above, and
   i->uri_encoding is handed to %s without a NULL check -- confirm it is
   always set by the time a URL is enqueued.  */
if (i)
printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
@ -124,7 +130,7 @@ url_enqueue (struct url_queue *queue,
succeeded, or false if the queue is empty. */
static bool
url_dequeue (struct url_queue *queue,
url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
@ -137,6 +143,7 @@ url_dequeue (struct url_queue *queue,
if (!queue->head)
queue->tail = NULL;
*i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
@ -153,9 +160,9 @@ url_dequeue (struct url_queue *queue,
}
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
@ -192,8 +199,11 @@ retrieve_tree (const char *start_url)
struct hash_table *blacklist;
int up_error_code;
struct url *start_url_parsed = url_parse (start_url, &up_error_code);
struct url *start_url_parsed;
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale);
start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
@ -206,7 +216,8 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
@ -225,7 +236,7 @@ retrieve_tree (const char *start_url)
/* Get the next URL from the queue... */
if (!url_dequeue (queue,
if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
@ -266,7 +277,8 @@ retrieve_tree (const char *start_url)
int dt = 0;
char *redirected = NULL;
status = retrieve_url (url, &file, &redirected, referer, &dt, false);
status = retrieve_url (url, &file, &redirected, referer, &dt,
false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
@ -294,7 +306,7 @@ retrieve_tree (const char *start_url)
if (descend)
{
if (!descend_redirect_p (redirected, url, depth,
start_url_parsed, blacklist))
start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
@ -346,7 +358,7 @@ retrieve_tree (const char *start_url)
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, &meta_disallow_follow);
get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
@ -357,7 +369,8 @@ retrieve_tree (const char *start_url)
if (children)
{
struct urlpos *child = children;
struct url *url_parsed = url_parsed = url_parse (url, NULL);
struct url *url_parsed = url_parse (url, NULL, i);
struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
@ -374,9 +387,11 @@ retrieve_tree (const char *start_url)
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
blacklist))
blacklist, i))
{
url_enqueue (queue, xstrdup (child->url->url),
/* The child gets IRI state of its own; its URI encoding is the
   content_encoding field of the current document's state (i).  */
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding);
url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
@ -394,18 +409,18 @@ retrieve_tree (const char *start_url)
}
}
if (file
&& (opt.delete_after
if (file
&& (opt.delete_after
|| opt.spider /* opt.recursive is implicitly true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
(otherwise unneeded because of --spider or rejected by -R)
HTML file just to harvest its hyperlinks -- in either case,
(otherwise unneeded because of --spider or rejected by -R)
HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
(opt.spider ? "--spider" :
(opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
@ -421,6 +436,7 @@ retrieve_tree (const char *start_url)
xfree (url);
xfree_null (referer);
xfree_null (file);
iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
@ -429,9 +445,11 @@ retrieve_tree (const char *start_url)
char *d1, *d2;
int d3;
bool d4, d5;
while (url_dequeue (queue,
struct iri *d6;
while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
iri_free (d6);
xfree (d1);
xfree_null (d2);
}
@ -460,7 +478,8 @@ retrieve_tree (const char *start_url)
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
@ -470,7 +489,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (string_set_contains (blacklist, url))
{
if (opt.spider)
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
@ -601,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!specs)
{
char *rfile;
if (res_retrieve_file (url, &rfile))
if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
@ -656,23 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
orig_parsed = url_parse (original, NULL);
orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist);
start_url_parsed, blacklist, iri);
url_free (orig_parsed);
url_free (new_parsed);

View File

@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
Return true if robots were retrieved OK, false otherwise. */
bool
res_retrieve_file (const char *url, char **file)
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider;
/* Copy server URI encoding for a possible IDNA transformation, no need to
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
set_uri_encoding (i, iri->uri_encoding);
i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val;
opt.spider = saved_sp_val;
xfree (robots_url);
iri_free (i);
if (err != RETROK && *file != NULL)
{

View File

@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int);
bool res_retrieve_file (const char *, char **);
bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *);

View File

@ -51,6 +51,7 @@ as that of the covered work. */
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
#include "iri.h"
#include "html-url.h"
/* Total size of downloaded files. Used to enforce quota. */
@ -597,7 +598,7 @@ static char *getproxy (struct url *);
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
const char *refurl, int *dt, bool recursive)
const char *refurl, int *dt, bool recursive, struct iri *iri)
{
uerr_t result;
char *url;
@ -625,7 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file)
*file = NULL;
u = url_parse (url, &up_error_code);
second_try:
u = url_parse (url, &up_error_code, iri);
if (!u)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@ -633,6 +635,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
return URLERROR;
}
printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
if (!refurl)
refurl = opt.referer;
@ -646,8 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
proxy = getproxy (u);
if (proxy)
{
/* sXXXav : could a proxy include a path ??? */
/* NOTE(review): pi is prepared here with the locale as URI encoding and
   UTF-8 encoding switched off, but the url_parse call visible just below
   is given NULL rather than pi, and no iri_free (pi) appears in this
   part of the function -- confirm pi is actually used and released.  */
struct iri *pi = iri_new ();
set_uri_encoding (pi, opt.locale);
pi->utf8_encode = false;
/* Parse the proxy URL. */
proxy_url = url_parse (proxy, &up_error_code);
proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@ -672,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
@ -722,8 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
xfree (mynewloc);
mynewloc = construced_newloc;
/* Reset UTF-8 encoding state, keep the URI encoding and reset
the content encoding. */
iri->utf8_encode = opt.enable_iri;
set_content_encoding (iri, NULL);
/* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code);
newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@ -770,8 +784,21 @@ retrieve_url (const char *origurl, char **file, char **newloc,
goto redirected;
}
if (local_file)
/* Try to not encode in UTF-8 if fetching failed */
/* The flag is cleared before jumping, so this branch is not taken again
   unless something sets utf8_encode anew; second_try re-runs url_parse
   on url with the updated state.
   NOTE(review): iri is dereferenced unconditionally -- confirm that no
   caller passes NULL.  */
if (!(*dt & RETROKF) && iri->utf8_encode)
{
iri->utf8_encode = false;
printf ("[Fallbacking to non-utf8 for `%s'\n", url);
goto second_try;
}
if (local_file && *dt & RETROKF)
{
register_download (u->url, local_file);
if (redirection_count && 0 != strcmp (origurl, u->url))
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
@ -821,13 +848,17 @@ retrieve_from_file (const char *file, bool html, int *count)
{
uerr_t status;
struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_content_encoding (iri, opt.locale);
if (url_has_scheme (url))
{
int dt;
@ -836,7 +867,7 @@ retrieve_from_file (const char *file, bool html, int *count)
if (!opt.base_href)
opt.base_href = xstrdup (url);
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
if (status != RETROK)
return status;
@ -846,7 +877,7 @@ retrieve_from_file (const char *file, bool html, int *count)
else
input_file = (char *) file;
url_list = (html ? get_urls_html (input_file, NULL, NULL)
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@ -868,15 +899,16 @@ retrieve_from_file (const char *file, bool html, int *count)
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
if (cur_url->url->scheme == SCHEME_FTP)
if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
status = retrieve_tree (cur_url->url->url);
opt.follow_ftp = old_follow_ftp;
}
else
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
&dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
@ -1047,7 +1079,11 @@ bool
url_uses_proxy (const char *url)
{
bool ret;
struct url *u = url_parse (url, NULL);
struct url *u;
/* NOTE(review): no iri_free (i) is visible in the lines that follow,
   including the early return taken when url_parse fails -- confirm the
   state is released on every path.  */
struct iri *i = iri_new();
/* url was given in the command line, so use locale as encoding */
set_uri_encoding (i, opt.locale);
u= url_parse (url, NULL, i);
if (!u)
return false;
ret = getproxy (u) != NULL;

View File

@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);

View File

@ -42,6 +42,7 @@ as that of the covered work. */
#include "utils.h"
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
#include "iri.h"
#ifdef TESTING
#include "test.h"
@ -640,7 +641,7 @@ static const char *parse_errors[] = {
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
url_parse (const char *url, int *error)
url_parse (const char *url, int *error, struct iri *iri)
{
struct url *u;
const char *p;
@ -659,7 +660,7 @@ url_parse (const char *url, int *error)
int port;
char *user = NULL, *passwd = NULL;
char *url_encoded = NULL;
char *url_encoded = NULL, *new_url = NULL;
int error_code;
@ -670,9 +671,20 @@ url_parse (const char *url, int *error)
goto error;
}
url_encoded = reencode_escapes (url);
/* When UTF-8 encoding is requested, unescape URL and hand it to
   remote_to_utf8 (presumably a conversion to UTF-8); that call's result
   becomes the new value of iri->utf8_encode, and NEW_URL is discarded
   when it is false.
   NOTE(review): the cast drops const, so the caller's string appears to
   be unescaped in place -- confirm every caller passes a writable
   buffer.  */
if (iri && iri->utf8_encode)
{
url_unescape ((char *) url);
iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
if (!iri->utf8_encode)
new_url = NULL;
}
url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded;
/* Release the converted copy unless URL_ENCODED is that very pointer.  */
if (new_url && url_encoded != new_url)
xfree (new_url);
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
@ -842,6 +854,18 @@ url_parse (const char *url, int *error)
{
url_unescape (u->host);
host_modified = true;
/* Apply IDNA regardless of iri->utf8_encode status */
if (opt.enable_iri && iri)
{
char *new = idn_encode (iri, u->host);
/* A NULL result leaves u->host untouched; otherwise the old host string
   is freed and replaced by the encoded one.  */
if (new)
{
xfree (u->host);
u->host = new;
host_modified = true;
}
}
}
if (params_b)
@ -851,7 +875,7 @@ url_parse (const char *url, int *error)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
if (path_modified || u->fragment || host_modified || path_b == path_e)
if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild

View File

@ -84,7 +84,7 @@ struct url
char *url_escape (const char *);
struct url *url_parse (const char *, int *);
struct url *url_parse (const char *, int *, struct iri *iri);
const char *url_error (int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);

View File

@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
#include "quote.h"
#include "quotearg.h"
/* Likewise for struct iri definition */
#include "iri.h"
/* Useful macros used across the code: */
/* The number of elements in an array. For example: