1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

Basic IDN/IRI support

This commit is contained in:
Saint Xavier 2008-07-20 13:10:02 +02:00
parent 6c6e838338
commit 9a2ea3938d
10 changed files with 209 additions and 53 deletions

View File

@ -716,7 +716,7 @@ lookup_host (const char *host, int flags)
{ {
char *str = NULL, *name; char *str = NULL, *name;
if (opt.enable_iri && (name = idn_decode (host)) != NULL) if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
{ {
int len = strlen (host) + strlen (name) + 4; int len = strlen (host) + strlen (name) + 4;
str = xmalloc (len); str = xmalloc (len);

View File

@ -274,6 +274,7 @@ append_url (const char *link_uri,
struct urlpos *newel; struct urlpos *newel;
const char *base = ctx->base ? ctx->base : ctx->parent_base; const char *base = ctx->base ? ctx->base : ctx->parent_base;
struct url *url; struct url *url;
bool utf8_encode = false;
if (!base) if (!base)
{ {
@ -292,7 +293,7 @@ append_url (const char *link_uri,
return NULL; return NULL;
} }
url = url_parse (link_uri, NULL); url = url_parse (link_uri, NULL, &utf8_encode);
if (!url) if (!url)
{ {
DEBUGP (("%s: link \"%s\" doesn't parse.\n", DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -311,7 +312,7 @@ append_url (const char *link_uri,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri)); ctx->document_file, base, link_uri, complete_uri));
url = url_parse (complete_uri, NULL); url = url_parse (complete_uri, NULL, &utf8_encode);
if (!url) if (!url)
{ {
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -549,9 +550,9 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
if (!mcharset) if (!mcharset)
return; return;
logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset)); /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
/* sXXXav: Not used yet */ set_current_charset (mcharset);
xfree (mcharset); xfree (mcharset);
} }
else if (name && 0 == strcasecmp (name, "robots")) else if (name && 0 == strcasecmp (name, "robots"))
@ -660,6 +661,7 @@ get_urls_file (const char *file)
struct file_memory *fm; struct file_memory *fm;
struct urlpos *head, *tail; struct urlpos *head, *tail;
const char *text, *text_end; const char *text, *text_end;
bool utf8_encode = false;
/* Load the file. */ /* Load the file. */
fm = read_file (file); fm = read_file (file);
@ -711,7 +713,7 @@ get_urls_file (const char *file)
url_text = merged; url_text = merged;
} }
url = url_parse (url_text, &up_error_code); url = url_parse (url_text, &up_error_code, &utf8_encode);
if (!url) if (!url)
{ {
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),

View File

@ -2055,7 +2055,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
/* Try to get remote encoding if needed */ /* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote) if (opt.enable_iri && !opt.encoding_remote)
/* xxx = */ parse_charset (tmp2); set_current_charset (parse_charset (tmp2));
} }
} }
hs->newloc = resp_header_strdup (resp, "Location"); hs->newloc = resp_header_strdup (resp, "Location");

106
src/iri.c
View File

@ -41,6 +41,8 @@ as that of the covered work. */
#include "utils.h" #include "utils.h"
#include "iri.h" #include "iri.h"
char *remote;
char *current;
static iconv_t locale2utf8; static iconv_t locale2utf8;
@ -80,7 +82,7 @@ parse_charset (char *str)
return NULL; return NULL;
} }
logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset)); /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
return charset; return charset;
} }
@ -222,15 +224,29 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL /* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
on error. */ on error. */
char *idn_encode (char *host) char *
idn_encode (char *host, bool utf8_encoded)
{ {
char *new; char *new;
int ret; int ret;
/* Encode to UTF-8 if not done using current remote */
if (!utf8_encoded)
{
if (!remote_to_utf8 ((const char *) host, (const char **) &new))
{
/* Nothing to encode or an error occured */
return NULL;
}
host = new;
}
/* toASCII UTF-8 NULL terminated string */ /* toASCII UTF-8 NULL terminated string */
ret = idna_to_ascii_8z (host, &new, 0); ret = idna_to_ascii_8z (host, &new, 0);
if (ret != IDNA_SUCCESS) if (ret != IDNA_SUCCESS)
{ {
/* sXXXav : free new when needed ! */
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
quote (idna_strerror (ret))); quote (idna_strerror (ret)));
return NULL; return NULL;
@ -241,7 +257,8 @@ char *idn_encode (char *host)
/* Try to decode an ASCII encoded host. Return the new domain in the locale on /* Try to decode an ASCII encoded host. Return the new domain in the locale on
success or NULL on error. */ success or NULL on error. */
char *idn_decode (char *host) char *
idn_decode (char *host)
{ {
char *new; char *new;
int ret; int ret;
@ -257,4 +274,87 @@ char *idn_decode (char *host)
return new; return new;
} }
/* Return a new string */
bool
remote_to_utf8 (const char *str, const char **new)
{
char *remote;
iconv_t cd;
bool ret = false;
if (opt.encoding_remote)
remote = opt.encoding_remote;
else if (current)
remote = current;
else
return false;
cd = iconv_open ("UTF-8", remote);
if (cd == (iconv_t)(-1))
return false;
if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
ret = true;
iconv_close (cd);
/* Test if something was converted */
if (!strcmp (str, *new))
{
xfree ((char *) *new);
return false;
}
return ret;
}
char *get_remote_charset (void)
{
return remote;
}
char *get_current_charset (void)
{
return current;
}
void set_current_charset (char *charset)
{
/*printf("[ current = `%s'\n", charset);*/
if (current)
xfree (current);
current = charset ? xstrdup (charset) : NULL;
}
void set_current_as_locale (void)
{
/*printf("[ current = locale = `%s'\n", opt.locale);*/
if (current)
xfree (current);
/* sXXXav : assert opt.locale NULL ? */
current = xstrdup (opt.locale);
}
void
set_remote_charset (char *charset)
{
/*printf("[ remote = `%s'\n", charset);*/
if (remote)
xfree (remote);
remote = charset ? xstrdup (charset) : NULL;
}
void
set_remote_as_current (void)
{
/*printf("[ remote = current = `%s'\n", current);*/
if (remote)
xfree (remote);
remote = current ? xstrdup (current) : NULL;
}

View File

@ -36,8 +36,16 @@ char *parse_charset (char *str);
char *find_locale (void); char *find_locale (void);
bool check_encoding_name (char *encoding); bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str); const char *locale_to_utf8 (const char *str);
char *idn_encode (char *host); char *idn_encode (char *host, bool utf8_encoded);
char *idn_decode (char *host); char *idn_decode (char *host);
char *get_remote_charset (void);
char *get_current_charset (void);
void set_current_charset (char *charset);
void set_current_as_locale (void);
void set_current_charset (char *charset);
void set_remote_charset (char *charset);
void set_remote_as_current (void);
bool remote_to_utf8 (const char *str, const char **new);
#else /* ENABLE_IRI */ #else /* ENABLE_IRI */
@ -45,8 +53,16 @@ char *idn_decode (char *host);
#define find_locale() NULL #define find_locale() NULL
#define check_encoding_name(str) false #define check_encoding_name(str) false
#define locale_to_utf8(str) (str) #define locale_to_utf8(str) (str)
#define idn_encode(str) NULL #define idn_encode(str,encoded) NULL
#define idn_decode(str) NULL #define idn_decode(str) NULL
#define get_remote_charset() NULL
#define get_current_charset() NULL
#define set_current_charset(str)
#define set_current_as_locale()
#define set_current_charset(str)
#define set_remote_charset(str)
#define set_remote_as_current()
#define remote_to_utf8(a,b) false
#endif /* ENABLE_IRI */ #endif /* ENABLE_IRI */
#endif /* IRI_H */ #endif /* IRI_H */

View File

@ -1076,7 +1076,7 @@ for details.\n\n"));
if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote)) if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
opt.encoding_remote = NULL; opt.encoding_remote = NULL;
logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale)); /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
} }
#else #else
if (opt.enable_iri || opt.locale || opt.encoding_remote) if (opt.enable_iri || opt.locale || opt.encoding_remote)
@ -1190,6 +1190,8 @@ WARNING: Can't reopen standard output in binary mode;\n\
char *filename = NULL, *redirected_URL = NULL; char *filename = NULL, *redirected_URL = NULL;
int dt; int dt;
set_current_as_locale ();
if ((opt.recursive || opt.page_requisites) if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
{ {
@ -1204,7 +1206,10 @@ WARNING: Can't reopen standard output in binary mode;\n\
opt.follow_ftp = old_follow_ftp; opt.follow_ftp = old_follow_ftp;
} }
else else
{
set_remote_as_current ();
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
}
if (opt.delete_after && file_exists_p(filename)) if (opt.delete_after && file_exists_p(filename))
{ {

View File

@ -49,6 +49,7 @@ as that of the covered work. */
#include "res.h" #include "res.h"
#include "convert.h" #include "convert.h"
#include "spider.h" #include "spider.h"
#include "iri.h"
/* Functions for maintaining the URL queue. */ /* Functions for maintaining the URL queue. */
@ -58,7 +59,7 @@ struct queue_element {
int depth; /* the depth */ int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to bool html_allowed; /* whether the document is allowed to
be treated as HTML. */ be treated as HTML. */
char *remote_encoding;
struct queue_element *next; /* next element in queue */ struct queue_element *next; /* next element in queue */
}; };
@ -94,12 +95,18 @@ url_enqueue (struct url_queue *queue,
const char *url, const char *referer, int depth, bool html_allowed) const char *url, const char *referer, int depth, bool html_allowed)
{ {
struct queue_element *qel = xnew (struct queue_element); struct queue_element *qel = xnew (struct queue_element);
char *charset = get_current_charset ();
qel->url = url; qel->url = url;
qel->referer = referer; qel->referer = referer;
qel->depth = depth; qel->depth = depth;
qel->html_allowed = html_allowed; qel->html_allowed = html_allowed;
qel->next = NULL; qel->next = NULL;
if (charset)
qel->remote_encoding = xstrdup (charset);
else
qel->remote_encoding = NULL;
++queue->count; ++queue->count;
if (queue->count > queue->maxcount) if (queue->count > queue->maxcount)
queue->maxcount = queue->count; queue->maxcount = queue->count;
@ -107,6 +114,8 @@ url_enqueue (struct url_queue *queue,
DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
/*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
if (queue->tail) if (queue->tail)
queue->tail->next = qel; queue->tail->next = qel;
queue->tail = qel; queue->tail = qel;
@ -132,6 +141,10 @@ url_dequeue (struct url_queue *queue,
if (!queue->head) if (!queue->head)
queue->tail = NULL; queue->tail = NULL;
set_remote_charset (qel->remote_encoding);
if (qel->remote_encoding)
xfree (qel->remote_encoding);
*url = qel->url; *url = qel->url;
*referer = qel->referer; *referer = qel->referer;
*depth = qel->depth; *depth = qel->depth;
@ -177,6 +190,7 @@ uerr_t
retrieve_tree (const char *start_url) retrieve_tree (const char *start_url)
{ {
uerr_t status = RETROK; uerr_t status = RETROK;
bool utf8_encode = false;
/* The queue of URLs we need to load. */ /* The queue of URLs we need to load. */
struct url_queue *queue; struct url_queue *queue;
@ -186,7 +200,7 @@ retrieve_tree (const char *start_url)
struct hash_table *blacklist; struct hash_table *blacklist;
int up_error_code; int up_error_code;
struct url *start_url_parsed = url_parse (start_url, &up_error_code); struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode);
if (!start_url_parsed) if (!start_url_parsed)
{ {
@ -324,7 +338,7 @@ retrieve_tree (const char *start_url)
if (children) if (children)
{ {
struct urlpos *child = children; struct urlpos *child = children;
struct url *url_parsed = url_parsed = url_parse (url, NULL); struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode);
char *referer_url = url; char *referer_url = url;
bool strip_auth = (url_parsed != NULL bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL); && url_parsed->user != NULL);
@ -627,11 +641,12 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
struct url *orig_parsed, *new_parsed; struct url *orig_parsed, *new_parsed;
struct urlpos *upos; struct urlpos *upos;
bool success; bool success;
bool utf8_encode = false;
orig_parsed = url_parse (original, NULL); orig_parsed = url_parse (original, NULL, &utf8_encode);
assert (orig_parsed != NULL); assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL); new_parsed = url_parse (redirected, NULL, &utf8_encode);
assert (new_parsed != NULL); assert (new_parsed != NULL);
upos = xnew0 (struct urlpos); upos = xnew0 (struct urlpos);

View File

@ -51,6 +51,7 @@ as that of the covered work. */
#include "hash.h" #include "hash.h"
#include "convert.h" #include "convert.h"
#include "ptimer.h" #include "ptimer.h"
#include "iri.h"
/* Total size of downloaded files. Used to enforce quota. */ /* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes; SUM_SIZE_INT total_downloaded_bytes;
@ -612,6 +613,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
char *saved_post_data = NULL; char *saved_post_data = NULL;
char *saved_post_file_name = NULL; char *saved_post_file_name = NULL;
bool utf8_encoded = opt.enable_iri;
/* If dt is NULL, use local storage. */ /* If dt is NULL, use local storage. */
if (!dt) if (!dt)
{ {
@ -624,7 +627,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file) if (file)
*file = NULL; *file = NULL;
u = url_parse (url, &up_error_code); second_try:
u = url_parse (url, &up_error_code, &utf8_encoded);
if (!u) if (!u)
{ {
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@ -632,6 +636,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
return URLERROR; return URLERROR;
} }
/*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
if (!refurl) if (!refurl)
refurl = opt.referer; refurl = opt.referer;
@ -645,8 +651,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
proxy = getproxy (u); proxy = getproxy (u);
if (proxy) if (proxy)
{ {
/* sXXXav : support IRI for proxy */
bool proxy_utf8_encode = false;
/* Parse the proxy URL. */ /* Parse the proxy URL. */
proxy_url = url_parse (proxy, &up_error_code); proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode);
if (!proxy_url) if (!proxy_url)
{ {
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@ -721,8 +729,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
xfree (mynewloc); xfree (mynewloc);
mynewloc = construced_newloc; mynewloc = construced_newloc;
utf8_encoded = opt.enable_iri;
/* Now, see if this new location makes sense. */ /* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code); newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded);
if (!newloc_parsed) if (!newloc_parsed)
{ {
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@ -769,9 +779,15 @@ retrieve_url (const char *origurl, char **file, char **newloc,
goto redirected; goto redirected;
} }
if (local_file) /* Try to not encode in UTF-8 if fetching failed */
if (result != RETROK && utf8_encoded)
{ {
if (*dt & RETROKF) utf8_encoded = false;
/*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
goto second_try;
}
if (local_file && *dt & RETROKF)
{ {
register_download (u->url, local_file); register_download (u->url, local_file);
if (redirection_count && 0 != strcmp (origurl, u->url)) if (redirection_count && 0 != strcmp (origurl, u->url))
@ -779,7 +795,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (*dt & TEXTHTML) if (*dt & TEXTHTML)
register_html (u->url, local_file); register_html (u->url, local_file);
} }
}
if (file) if (file)
*file = local_file ? local_file : NULL; *file = local_file ? local_file : NULL;
@ -1021,8 +1036,8 @@ getproxy (struct url *u)
bool bool
url_uses_proxy (const char *url) url_uses_proxy (const char *url)
{ {
bool ret; bool ret, utf8_encode = false;
struct url *u = url_parse (url, NULL); struct url *u = url_parse (url, NULL, &utf8_encode);
if (!u) if (!u)
return false; return false;
ret = getproxy (u) != NULL; ret = getproxy (u) != NULL;

View File

@ -641,7 +641,7 @@ static const char *parse_errors[] = {
error, and if ERROR is not NULL, also set *ERROR to the appropriate error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */ error code. */
struct url * struct url *
url_parse (const char *url, int *error) url_parse (const char *url, int *error, bool *utf8_encode)
{ {
struct url *u; struct url *u;
const char *p; const char *p;
@ -671,10 +671,13 @@ url_parse (const char *url, int *error)
goto error; goto error;
} }
if (opt.enable_iri) if (opt.enable_iri && *utf8_encode)
{ {
const char *new;
url_unescape ((char *) url); url_unescape ((char *) url);
url = locale_to_utf8(url); *utf8_encode = remote_to_utf8 (url, &new);
if (*utf8_encode)
url = new;
} }
url_encoded = reencode_escapes (url); url_encoded = reencode_escapes (url);
@ -853,7 +856,7 @@ url_parse (const char *url, int *error)
if (opt.enable_iri) if (opt.enable_iri)
{ {
char *new = idn_encode (u->host); char *new = idn_encode (u->host, *utf8_encode);
if (new) if (new)
{ {
xfree (u->host); xfree (u->host);

View File

@ -84,7 +84,7 @@ struct url
char *url_escape (const char *); char *url_escape (const char *);
struct url *url_parse (const char *, int *); struct url *url_parse (const char *, int *, bool *);
const char *url_error (int); const char *url_error (int);
char *url_full_path (const struct url *); char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *); void url_set_dir (struct url *, const char *);