mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Basic IDN/IRI support
This commit is contained in:
parent
6c6e838338
commit
9a2ea3938d
@ -716,7 +716,7 @@ lookup_host (const char *host, int flags)
|
|||||||
{
|
{
|
||||||
char *str = NULL, *name;
|
char *str = NULL, *name;
|
||||||
|
|
||||||
if (opt.enable_iri && (name = idn_decode (host)) != NULL)
|
if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
|
||||||
{
|
{
|
||||||
int len = strlen (host) + strlen (name) + 4;
|
int len = strlen (host) + strlen (name) + 4;
|
||||||
str = xmalloc (len);
|
str = xmalloc (len);
|
||||||
@ -725,7 +725,7 @@ lookup_host (const char *host, int flags)
|
|||||||
xfree (name);
|
xfree (name);
|
||||||
}
|
}
|
||||||
|
|
||||||
logprintf (LOG_VERBOSE, _("Resolving %s... "),
|
logprintf (LOG_VERBOSE, _("Resolving %s... "),
|
||||||
quotearg_style (escape_quoting_style, str ? str : host));
|
quotearg_style (escape_quoting_style, str ? str : host));
|
||||||
|
|
||||||
if (str)
|
if (str)
|
||||||
|
@ -274,6 +274,7 @@ append_url (const char *link_uri,
|
|||||||
struct urlpos *newel;
|
struct urlpos *newel;
|
||||||
const char *base = ctx->base ? ctx->base : ctx->parent_base;
|
const char *base = ctx->base ? ctx->base : ctx->parent_base;
|
||||||
struct url *url;
|
struct url *url;
|
||||||
|
bool utf8_encode = false;
|
||||||
|
|
||||||
if (!base)
|
if (!base)
|
||||||
{
|
{
|
||||||
@ -292,7 +293,7 @@ append_url (const char *link_uri,
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
url = url_parse (link_uri, NULL);
|
url = url_parse (link_uri, NULL, &utf8_encode);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
||||||
@ -311,7 +312,7 @@ append_url (const char *link_uri,
|
|||||||
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
||||||
ctx->document_file, base, link_uri, complete_uri));
|
ctx->document_file, base, link_uri, complete_uri));
|
||||||
|
|
||||||
url = url_parse (complete_uri, NULL);
|
url = url_parse (complete_uri, NULL, &utf8_encode);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
||||||
@ -549,9 +550,9 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
if (!mcharset)
|
if (!mcharset)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
|
/*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
|
||||||
|
|
||||||
/* sXXXav: Not used yet */
|
set_current_charset (mcharset);
|
||||||
xfree (mcharset);
|
xfree (mcharset);
|
||||||
}
|
}
|
||||||
else if (name && 0 == strcasecmp (name, "robots"))
|
else if (name && 0 == strcasecmp (name, "robots"))
|
||||||
@ -660,6 +661,7 @@ get_urls_file (const char *file)
|
|||||||
struct file_memory *fm;
|
struct file_memory *fm;
|
||||||
struct urlpos *head, *tail;
|
struct urlpos *head, *tail;
|
||||||
const char *text, *text_end;
|
const char *text, *text_end;
|
||||||
|
bool utf8_encode = false;
|
||||||
|
|
||||||
/* Load the file. */
|
/* Load the file. */
|
||||||
fm = read_file (file);
|
fm = read_file (file);
|
||||||
@ -711,7 +713,7 @@ get_urls_file (const char *file)
|
|||||||
url_text = merged;
|
url_text = merged;
|
||||||
}
|
}
|
||||||
|
|
||||||
url = url_parse (url_text, &up_error_code);
|
url = url_parse (url_text, &up_error_code, &utf8_encode);
|
||||||
if (!url)
|
if (!url)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
|
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
|
||||||
|
@ -1825,7 +1825,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
|
|||||||
hs->local_file = url_file_name (u);
|
hs->local_file = url_file_name (u);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TODO: perform this check only once. */
|
/* TODO: perform this check only once. */
|
||||||
if (!hs->existence_checked && file_exists_p (hs->local_file))
|
if (!hs->existence_checked && file_exists_p (hs->local_file))
|
||||||
{
|
{
|
||||||
@ -1894,7 +1894,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
|
|||||||
local_dot_orig_file_exists = true;
|
local_dot_orig_file_exists = true;
|
||||||
local_filename = filename_plus_orig_suffix;
|
local_filename = filename_plus_orig_suffix;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!local_dot_orig_file_exists)
|
if (!local_dot_orig_file_exists)
|
||||||
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
|
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
|
||||||
@ -2055,7 +2055,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
|
|||||||
|
|
||||||
/* Try to get remote encoding if needed */
|
/* Try to get remote encoding if needed */
|
||||||
if (opt.enable_iri && !opt.encoding_remote)
|
if (opt.enable_iri && !opt.encoding_remote)
|
||||||
/* xxx = */ parse_charset (tmp2);
|
set_current_charset (parse_charset (tmp2));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
hs->newloc = resp_header_strdup (resp, "Location");
|
hs->newloc = resp_header_strdup (resp, "Location");
|
||||||
|
108
src/iri.c
108
src/iri.c
@ -41,6 +41,8 @@ as that of the covered work. */
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "iri.h"
|
#include "iri.h"
|
||||||
|
|
||||||
|
char *remote;
|
||||||
|
char *current;
|
||||||
|
|
||||||
static iconv_t locale2utf8;
|
static iconv_t locale2utf8;
|
||||||
|
|
||||||
@ -80,7 +82,7 @@ parse_charset (char *str)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
|
/*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
|
||||||
|
|
||||||
return charset;
|
return charset;
|
||||||
}
|
}
|
||||||
@ -196,7 +198,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
|
|||||||
(*out)++;
|
(*out)++;
|
||||||
outlen--;
|
outlen--;
|
||||||
}
|
}
|
||||||
else if (errno == E2BIG) /* Output buffer full */
|
else if (errno == E2BIG) /* Output buffer full */
|
||||||
{
|
{
|
||||||
char *new;
|
char *new;
|
||||||
|
|
||||||
@ -222,15 +224,29 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
|
|||||||
|
|
||||||
/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
|
/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
|
||||||
on error. */
|
on error. */
|
||||||
char *idn_encode (char *host)
|
char *
|
||||||
|
idn_encode (char *host, bool utf8_encoded)
|
||||||
{
|
{
|
||||||
char *new;
|
char *new;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
/* Encode to UTF-8 if not done using current remote */
|
||||||
|
if (!utf8_encoded)
|
||||||
|
{
|
||||||
|
if (!remote_to_utf8 ((const char *) host, (const char **) &new))
|
||||||
|
{
|
||||||
|
/* Nothing to encode or an error occured */
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
host = new;
|
||||||
|
}
|
||||||
|
|
||||||
/* toASCII UTF-8 NULL terminated string */
|
/* toASCII UTF-8 NULL terminated string */
|
||||||
ret = idna_to_ascii_8z (host, &new, 0);
|
ret = idna_to_ascii_8z (host, &new, 0);
|
||||||
if (ret != IDNA_SUCCESS)
|
if (ret != IDNA_SUCCESS)
|
||||||
{
|
{
|
||||||
|
/* sXXXav : free new when needed ! */
|
||||||
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
|
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
|
||||||
quote (idna_strerror (ret)));
|
quote (idna_strerror (ret)));
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -241,7 +257,8 @@ char *idn_encode (char *host)
|
|||||||
|
|
||||||
/* Try to decode an ASCII encoded host. Return the new domain in the locale on
|
/* Try to decode an ASCII encoded host. Return the new domain in the locale on
|
||||||
success or NULL on error. */
|
success or NULL on error. */
|
||||||
char *idn_decode (char *host)
|
char *
|
||||||
|
idn_decode (char *host)
|
||||||
{
|
{
|
||||||
char *new;
|
char *new;
|
||||||
int ret;
|
int ret;
|
||||||
@ -257,4 +274,87 @@ char *idn_decode (char *host)
|
|||||||
return new;
|
return new;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return a new string */
|
||||||
|
bool
|
||||||
|
remote_to_utf8 (const char *str, const char **new)
|
||||||
|
{
|
||||||
|
char *remote;
|
||||||
|
iconv_t cd;
|
||||||
|
bool ret = false;
|
||||||
|
|
||||||
|
if (opt.encoding_remote)
|
||||||
|
remote = opt.encoding_remote;
|
||||||
|
else if (current)
|
||||||
|
remote = current;
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
|
||||||
|
cd = iconv_open ("UTF-8", remote);
|
||||||
|
if (cd == (iconv_t)(-1))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
|
||||||
|
ret = true;
|
||||||
|
|
||||||
|
iconv_close (cd);
|
||||||
|
|
||||||
|
/* Test if something was converted */
|
||||||
|
if (!strcmp (str, *new))
|
||||||
|
{
|
||||||
|
xfree ((char *) *new);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *get_remote_charset (void)
|
||||||
|
{
|
||||||
|
return remote;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *get_current_charset (void)
|
||||||
|
{
|
||||||
|
return current;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_current_charset (char *charset)
|
||||||
|
{
|
||||||
|
/*printf("[ current = `%s'\n", charset);*/
|
||||||
|
|
||||||
|
if (current)
|
||||||
|
xfree (current);
|
||||||
|
|
||||||
|
current = charset ? xstrdup (charset) : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_current_as_locale (void)
|
||||||
|
{
|
||||||
|
/*printf("[ current = locale = `%s'\n", opt.locale);*/
|
||||||
|
if (current)
|
||||||
|
xfree (current);
|
||||||
|
|
||||||
|
/* sXXXav : assert opt.locale NULL ? */
|
||||||
|
current = xstrdup (opt.locale);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
set_remote_charset (char *charset)
|
||||||
|
{
|
||||||
|
/*printf("[ remote = `%s'\n", charset);*/
|
||||||
|
if (remote)
|
||||||
|
xfree (remote);
|
||||||
|
|
||||||
|
remote = charset ? xstrdup (charset) : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
set_remote_as_current (void)
|
||||||
|
{
|
||||||
|
/*printf("[ remote = current = `%s'\n", current);*/
|
||||||
|
if (remote)
|
||||||
|
xfree (remote);
|
||||||
|
|
||||||
|
remote = current ? xstrdup (current) : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
20
src/iri.h
20
src/iri.h
@ -36,8 +36,16 @@ char *parse_charset (char *str);
|
|||||||
char *find_locale (void);
|
char *find_locale (void);
|
||||||
bool check_encoding_name (char *encoding);
|
bool check_encoding_name (char *encoding);
|
||||||
const char *locale_to_utf8 (const char *str);
|
const char *locale_to_utf8 (const char *str);
|
||||||
char *idn_encode (char *host);
|
char *idn_encode (char *host, bool utf8_encoded);
|
||||||
char *idn_decode (char *host);
|
char *idn_decode (char *host);
|
||||||
|
char *get_remote_charset (void);
|
||||||
|
char *get_current_charset (void);
|
||||||
|
void set_current_charset (char *charset);
|
||||||
|
void set_current_as_locale (void);
|
||||||
|
void set_current_charset (char *charset);
|
||||||
|
void set_remote_charset (char *charset);
|
||||||
|
void set_remote_as_current (void);
|
||||||
|
bool remote_to_utf8 (const char *str, const char **new);
|
||||||
|
|
||||||
#else /* ENABLE_IRI */
|
#else /* ENABLE_IRI */
|
||||||
|
|
||||||
@ -45,8 +53,16 @@ char *idn_decode (char *host);
|
|||||||
#define find_locale() NULL
|
#define find_locale() NULL
|
||||||
#define check_encoding_name(str) false
|
#define check_encoding_name(str) false
|
||||||
#define locale_to_utf8(str) (str)
|
#define locale_to_utf8(str) (str)
|
||||||
#define idn_encode(str) NULL
|
#define idn_encode(str,encoded) NULL
|
||||||
#define idn_decode(str) NULL
|
#define idn_decode(str) NULL
|
||||||
|
#define get_remote_charset() NULL
|
||||||
|
#define get_current_charset() NULL
|
||||||
|
#define set_current_charset(str)
|
||||||
|
#define set_current_as_locale()
|
||||||
|
#define set_current_charset(str)
|
||||||
|
#define set_remote_charset(str)
|
||||||
|
#define set_remote_as_current()
|
||||||
|
#define remote_to_utf8(a,b) false
|
||||||
|
|
||||||
#endif /* ENABLE_IRI */
|
#endif /* ENABLE_IRI */
|
||||||
#endif /* IRI_H */
|
#endif /* IRI_H */
|
||||||
|
17
src/main.c
17
src/main.c
@ -1067,16 +1067,16 @@ for details.\n\n"));
|
|||||||
#ifdef ENABLE_IRI
|
#ifdef ENABLE_IRI
|
||||||
if (opt.enable_iri)
|
if (opt.enable_iri)
|
||||||
{
|
{
|
||||||
if (opt.locale && !check_encoding_name(opt.locale))
|
if (opt.locale && !check_encoding_name (opt.locale))
|
||||||
opt.locale = NULL;
|
opt.locale = NULL;
|
||||||
|
|
||||||
if (!opt.locale)
|
if (!opt.locale)
|
||||||
opt.locale = find_locale ();
|
opt.locale = find_locale ();
|
||||||
|
|
||||||
if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote))
|
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
|
||||||
opt.encoding_remote = NULL;
|
opt.encoding_remote = NULL;
|
||||||
|
|
||||||
logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));
|
/*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (opt.enable_iri || opt.locale || opt.encoding_remote)
|
if (opt.enable_iri || opt.locale || opt.encoding_remote)
|
||||||
@ -1190,21 +1190,26 @@ WARNING: Can't reopen standard output in binary mode;\n\
|
|||||||
char *filename = NULL, *redirected_URL = NULL;
|
char *filename = NULL, *redirected_URL = NULL;
|
||||||
int dt;
|
int dt;
|
||||||
|
|
||||||
|
set_current_as_locale ();
|
||||||
|
|
||||||
if ((opt.recursive || opt.page_requisites)
|
if ((opt.recursive || opt.page_requisites)
|
||||||
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
|
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
|
||||||
{
|
{
|
||||||
int old_follow_ftp = opt.follow_ftp;
|
int old_follow_ftp = opt.follow_ftp;
|
||||||
|
|
||||||
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
|
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
|
||||||
if (url_scheme (*t) == SCHEME_FTP)
|
if (url_scheme (*t) == SCHEME_FTP)
|
||||||
opt.follow_ftp = 1;
|
opt.follow_ftp = 1;
|
||||||
|
|
||||||
status = retrieve_tree (*t);
|
status = retrieve_tree (*t);
|
||||||
|
|
||||||
opt.follow_ftp = old_follow_ftp;
|
opt.follow_ftp = old_follow_ftp;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
|
{
|
||||||
|
set_remote_as_current ();
|
||||||
|
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
|
||||||
|
}
|
||||||
|
|
||||||
if (opt.delete_after && file_exists_p(filename))
|
if (opt.delete_after && file_exists_p(filename))
|
||||||
{
|
{
|
||||||
|
35
src/recur.c
35
src/recur.c
@ -49,6 +49,7 @@ as that of the covered work. */
|
|||||||
#include "res.h"
|
#include "res.h"
|
||||||
#include "convert.h"
|
#include "convert.h"
|
||||||
#include "spider.h"
|
#include "spider.h"
|
||||||
|
#include "iri.h"
|
||||||
|
|
||||||
/* Functions for maintaining the URL queue. */
|
/* Functions for maintaining the URL queue. */
|
||||||
|
|
||||||
@ -58,7 +59,7 @@ struct queue_element {
|
|||||||
int depth; /* the depth */
|
int depth; /* the depth */
|
||||||
bool html_allowed; /* whether the document is allowed to
|
bool html_allowed; /* whether the document is allowed to
|
||||||
be treated as HTML. */
|
be treated as HTML. */
|
||||||
|
char *remote_encoding;
|
||||||
struct queue_element *next; /* next element in queue */
|
struct queue_element *next; /* next element in queue */
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -94,12 +95,18 @@ url_enqueue (struct url_queue *queue,
|
|||||||
const char *url, const char *referer, int depth, bool html_allowed)
|
const char *url, const char *referer, int depth, bool html_allowed)
|
||||||
{
|
{
|
||||||
struct queue_element *qel = xnew (struct queue_element);
|
struct queue_element *qel = xnew (struct queue_element);
|
||||||
|
char *charset = get_current_charset ();
|
||||||
qel->url = url;
|
qel->url = url;
|
||||||
qel->referer = referer;
|
qel->referer = referer;
|
||||||
qel->depth = depth;
|
qel->depth = depth;
|
||||||
qel->html_allowed = html_allowed;
|
qel->html_allowed = html_allowed;
|
||||||
qel->next = NULL;
|
qel->next = NULL;
|
||||||
|
|
||||||
|
if (charset)
|
||||||
|
qel->remote_encoding = xstrdup (charset);
|
||||||
|
else
|
||||||
|
qel->remote_encoding = NULL;
|
||||||
|
|
||||||
++queue->count;
|
++queue->count;
|
||||||
if (queue->count > queue->maxcount)
|
if (queue->count > queue->maxcount)
|
||||||
queue->maxcount = queue->count;
|
queue->maxcount = queue->count;
|
||||||
@ -107,6 +114,8 @@ url_enqueue (struct url_queue *queue,
|
|||||||
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
|
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
|
||||||
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
||||||
|
|
||||||
|
/*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
|
||||||
|
|
||||||
if (queue->tail)
|
if (queue->tail)
|
||||||
queue->tail->next = qel;
|
queue->tail->next = qel;
|
||||||
queue->tail = qel;
|
queue->tail = qel;
|
||||||
@ -132,6 +141,10 @@ url_dequeue (struct url_queue *queue,
|
|||||||
if (!queue->head)
|
if (!queue->head)
|
||||||
queue->tail = NULL;
|
queue->tail = NULL;
|
||||||
|
|
||||||
|
set_remote_charset (qel->remote_encoding);
|
||||||
|
if (qel->remote_encoding)
|
||||||
|
xfree (qel->remote_encoding);
|
||||||
|
|
||||||
*url = qel->url;
|
*url = qel->url;
|
||||||
*referer = qel->referer;
|
*referer = qel->referer;
|
||||||
*depth = qel->depth;
|
*depth = qel->depth;
|
||||||
@ -177,6 +190,7 @@ uerr_t
|
|||||||
retrieve_tree (const char *start_url)
|
retrieve_tree (const char *start_url)
|
||||||
{
|
{
|
||||||
uerr_t status = RETROK;
|
uerr_t status = RETROK;
|
||||||
|
bool utf8_encode = false;
|
||||||
|
|
||||||
/* The queue of URLs we need to load. */
|
/* The queue of URLs we need to load. */
|
||||||
struct url_queue *queue;
|
struct url_queue *queue;
|
||||||
@ -186,7 +200,7 @@ retrieve_tree (const char *start_url)
|
|||||||
struct hash_table *blacklist;
|
struct hash_table *blacklist;
|
||||||
|
|
||||||
int up_error_code;
|
int up_error_code;
|
||||||
struct url *start_url_parsed = url_parse (start_url, &up_error_code);
|
struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode);
|
||||||
|
|
||||||
if (!start_url_parsed)
|
if (!start_url_parsed)
|
||||||
{
|
{
|
||||||
@ -324,7 +338,7 @@ retrieve_tree (const char *start_url)
|
|||||||
if (children)
|
if (children)
|
||||||
{
|
{
|
||||||
struct urlpos *child = children;
|
struct urlpos *child = children;
|
||||||
struct url *url_parsed = url_parsed = url_parse (url, NULL);
|
struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode);
|
||||||
char *referer_url = url;
|
char *referer_url = url;
|
||||||
bool strip_auth = (url_parsed != NULL
|
bool strip_auth = (url_parsed != NULL
|
||||||
&& url_parsed->user != NULL);
|
&& url_parsed->user != NULL);
|
||||||
@ -360,18 +374,18 @@ retrieve_tree (const char *start_url)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (file
|
if (file
|
||||||
&& (opt.delete_after
|
&& (opt.delete_after
|
||||||
|| opt.spider /* opt.recursive is implicitely true */
|
|| opt.spider /* opt.recursive is implicitely true */
|
||||||
|| !acceptable (file)))
|
|| !acceptable (file)))
|
||||||
{
|
{
|
||||||
/* Either --delete-after was specified, or we loaded this
|
/* Either --delete-after was specified, or we loaded this
|
||||||
(otherwise unneeded because of --spider or rejected by -R)
|
(otherwise unneeded because of --spider or rejected by -R)
|
||||||
HTML file just to harvest its hyperlinks -- in either case,
|
HTML file just to harvest its hyperlinks -- in either case,
|
||||||
delete the local file. */
|
delete the local file. */
|
||||||
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
|
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
|
||||||
opt.delete_after ? "--delete-after" :
|
opt.delete_after ? "--delete-after" :
|
||||||
(opt.spider ? "--spider" :
|
(opt.spider ? "--spider" :
|
||||||
"recursive rejection criteria")));
|
"recursive rejection criteria")));
|
||||||
logprintf (LOG_VERBOSE,
|
logprintf (LOG_VERBOSE,
|
||||||
(opt.delete_after || opt.spider
|
(opt.delete_after || opt.spider
|
||||||
@ -627,11 +641,12 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
|
|||||||
struct url *orig_parsed, *new_parsed;
|
struct url *orig_parsed, *new_parsed;
|
||||||
struct urlpos *upos;
|
struct urlpos *upos;
|
||||||
bool success;
|
bool success;
|
||||||
|
bool utf8_encode = false;
|
||||||
|
|
||||||
orig_parsed = url_parse (original, NULL);
|
orig_parsed = url_parse (original, NULL, &utf8_encode);
|
||||||
assert (orig_parsed != NULL);
|
assert (orig_parsed != NULL);
|
||||||
|
|
||||||
new_parsed = url_parse (redirected, NULL);
|
new_parsed = url_parse (redirected, NULL, &utf8_encode);
|
||||||
assert (new_parsed != NULL);
|
assert (new_parsed != NULL);
|
||||||
|
|
||||||
upos = xnew0 (struct urlpos);
|
upos = xnew0 (struct urlpos);
|
||||||
|
47
src/retr.c
47
src/retr.c
@ -51,6 +51,7 @@ as that of the covered work. */
|
|||||||
#include "hash.h"
|
#include "hash.h"
|
||||||
#include "convert.h"
|
#include "convert.h"
|
||||||
#include "ptimer.h"
|
#include "ptimer.h"
|
||||||
|
#include "iri.h"
|
||||||
|
|
||||||
/* Total size of downloaded files. Used to enforce quota. */
|
/* Total size of downloaded files. Used to enforce quota. */
|
||||||
SUM_SIZE_INT total_downloaded_bytes;
|
SUM_SIZE_INT total_downloaded_bytes;
|
||||||
@ -612,6 +613,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
char *saved_post_data = NULL;
|
char *saved_post_data = NULL;
|
||||||
char *saved_post_file_name = NULL;
|
char *saved_post_file_name = NULL;
|
||||||
|
|
||||||
|
bool utf8_encoded = opt.enable_iri;
|
||||||
|
|
||||||
/* If dt is NULL, use local storage. */
|
/* If dt is NULL, use local storage. */
|
||||||
if (!dt)
|
if (!dt)
|
||||||
{
|
{
|
||||||
@ -624,7 +627,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
if (file)
|
if (file)
|
||||||
*file = NULL;
|
*file = NULL;
|
||||||
|
|
||||||
u = url_parse (url, &up_error_code);
|
second_try:
|
||||||
|
u = url_parse (url, &up_error_code, &utf8_encoded);
|
||||||
if (!u)
|
if (!u)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
|
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
|
||||||
@ -632,6 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
return URLERROR;
|
return URLERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
|
||||||
|
|
||||||
if (!refurl)
|
if (!refurl)
|
||||||
refurl = opt.referer;
|
refurl = opt.referer;
|
||||||
|
|
||||||
@ -645,8 +651,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
proxy = getproxy (u);
|
proxy = getproxy (u);
|
||||||
if (proxy)
|
if (proxy)
|
||||||
{
|
{
|
||||||
|
/* sXXXav : support IRI for proxy */
|
||||||
|
bool proxy_utf8_encode = false;
|
||||||
/* Parse the proxy URL. */
|
/* Parse the proxy URL. */
|
||||||
proxy_url = url_parse (proxy, &up_error_code);
|
proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode);
|
||||||
if (!proxy_url)
|
if (!proxy_url)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
|
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
|
||||||
@ -721,8 +729,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
xfree (mynewloc);
|
xfree (mynewloc);
|
||||||
mynewloc = construced_newloc;
|
mynewloc = construced_newloc;
|
||||||
|
|
||||||
|
utf8_encoded = opt.enable_iri;
|
||||||
|
|
||||||
/* Now, see if this new location makes sense. */
|
/* Now, see if this new location makes sense. */
|
||||||
newloc_parsed = url_parse (mynewloc, &up_error_code);
|
newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded);
|
||||||
if (!newloc_parsed)
|
if (!newloc_parsed)
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
|
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
|
||||||
@ -769,16 +779,21 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
goto redirected;
|
goto redirected;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (local_file)
|
/* Try to not encode in UTF-8 if fetching failed */
|
||||||
|
if (result != RETROK && utf8_encoded)
|
||||||
{
|
{
|
||||||
if (*dt & RETROKF)
|
utf8_encoded = false;
|
||||||
{
|
/*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
|
||||||
register_download (u->url, local_file);
|
goto second_try;
|
||||||
if (redirection_count && 0 != strcmp (origurl, u->url))
|
}
|
||||||
register_redirection (origurl, u->url);
|
|
||||||
if (*dt & TEXTHTML)
|
if (local_file && *dt & RETROKF)
|
||||||
register_html (u->url, local_file);
|
{
|
||||||
}
|
register_download (u->url, local_file);
|
||||||
|
if (redirection_count && 0 != strcmp (origurl, u->url))
|
||||||
|
register_redirection (origurl, u->url);
|
||||||
|
if (*dt & TEXTHTML)
|
||||||
|
register_html (u->url, local_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (file)
|
if (file)
|
||||||
@ -843,9 +858,9 @@ retrieve_from_file (const char *file, bool html, int *count)
|
|||||||
int old_follow_ftp = opt.follow_ftp;
|
int old_follow_ftp = opt.follow_ftp;
|
||||||
|
|
||||||
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
|
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
|
||||||
if (cur_url->url->scheme == SCHEME_FTP)
|
if (cur_url->url->scheme == SCHEME_FTP)
|
||||||
opt.follow_ftp = 1;
|
opt.follow_ftp = 1;
|
||||||
|
|
||||||
status = retrieve_tree (cur_url->url->url);
|
status = retrieve_tree (cur_url->url->url);
|
||||||
|
|
||||||
opt.follow_ftp = old_follow_ftp;
|
opt.follow_ftp = old_follow_ftp;
|
||||||
@ -1021,8 +1036,8 @@ getproxy (struct url *u)
|
|||||||
bool
|
bool
|
||||||
url_uses_proxy (const char *url)
|
url_uses_proxy (const char *url)
|
||||||
{
|
{
|
||||||
bool ret;
|
bool ret, utf8_encode = false;
|
||||||
struct url *u = url_parse (url, NULL);
|
struct url *u = url_parse (url, NULL, &utf8_encode);
|
||||||
if (!u)
|
if (!u)
|
||||||
return false;
|
return false;
|
||||||
ret = getproxy (u) != NULL;
|
ret = getproxy (u) != NULL;
|
||||||
|
11
src/url.c
11
src/url.c
@ -641,7 +641,7 @@ static const char *parse_errors[] = {
|
|||||||
error, and if ERROR is not NULL, also set *ERROR to the appropriate
|
error, and if ERROR is not NULL, also set *ERROR to the appropriate
|
||||||
error code. */
|
error code. */
|
||||||
struct url *
|
struct url *
|
||||||
url_parse (const char *url, int *error)
|
url_parse (const char *url, int *error, bool *utf8_encode)
|
||||||
{
|
{
|
||||||
struct url *u;
|
struct url *u;
|
||||||
const char *p;
|
const char *p;
|
||||||
@ -671,10 +671,13 @@ url_parse (const char *url, int *error)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opt.enable_iri)
|
if (opt.enable_iri && *utf8_encode)
|
||||||
{
|
{
|
||||||
|
const char *new;
|
||||||
url_unescape ((char *) url);
|
url_unescape ((char *) url);
|
||||||
url = locale_to_utf8(url);
|
*utf8_encode = remote_to_utf8 (url, &new);
|
||||||
|
if (*utf8_encode)
|
||||||
|
url = new;
|
||||||
}
|
}
|
||||||
|
|
||||||
url_encoded = reencode_escapes (url);
|
url_encoded = reencode_escapes (url);
|
||||||
@ -853,7 +856,7 @@ url_parse (const char *url, int *error)
|
|||||||
|
|
||||||
if (opt.enable_iri)
|
if (opt.enable_iri)
|
||||||
{
|
{
|
||||||
char *new = idn_encode (u->host);
|
char *new = idn_encode (u->host, *utf8_encode);
|
||||||
if (new)
|
if (new)
|
||||||
{
|
{
|
||||||
xfree (u->host);
|
xfree (u->host);
|
||||||
|
@ -84,7 +84,7 @@ struct url
|
|||||||
|
|
||||||
char *url_escape (const char *);
|
char *url_escape (const char *);
|
||||||
|
|
||||||
struct url *url_parse (const char *, int *);
|
struct url *url_parse (const char *, int *, bool *);
|
||||||
const char *url_error (int);
|
const char *url_error (int);
|
||||||
char *url_full_path (const struct url *);
|
char *url_full_path (const struct url *);
|
||||||
void url_set_dir (struct url *, const char *);
|
void url_set_dir (struct url *, const char *);
|
||||||
|
Loading…
Reference in New Issue
Block a user