Change global variable model to a state object

Saint Xavier 2008-07-24 00:56:29 +02:00
parent c31e00b52d
commit d82f80ecab
16 changed files with 197 additions and 231 deletions
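
The change replaces the per-process globals in iri.c (remote, current, utf8_encode, ugly_no_encode) with a heap-allocated struct iri that is created for each fetch and passed explicitly through url_parse (), http_loop (), retrieve_url () and the recursive-retrieval queue. Below is a minimal sketch of the resulting lifecycle, based only on declarations visible in the hunks that follow; the helper fetch_one () is illustrative and not part of the commit.

/* Sketch of the new per-fetch encoding state.  Intended to compile inside
   the wget tree, where wget.h now pulls in iri.h (see the final hunk). */
#include "wget.h"
#include "retr.h"

static uerr_t
fetch_one (const char *url)
{
  uerr_t status;
  struct iri *i = iri_new ();          /* fresh state instead of globals   */
  set_uri_encoding (i, opt.locale);    /* command-line URL: assume locale  */

  /* url_parse (), http_loop () and friends now read and update
     i->uri_encoding, i->content_encoding and i->utf8_encode.  */
  status = retrieve_url (url, NULL, NULL, NULL, NULL, false, i);

  iri_free (i);                        /* state is owned by the caller     */
  return status;
}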

View File

@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
/* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, NULL);
get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the

View File

@ -44,7 +44,6 @@ as that of the covered work. */
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
#include "iri.h"
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
@ -175,6 +174,10 @@ static const char *additional_attributes[] = {
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
/* Will contains the (last) charset found in 'http-equiv=content-type'
meta tags */
static char *meta_charset;
static void
init_interesting (void)
{
@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size,
return NULL;
}
set_ugly_no_encode (true);
url = url_parse (link_uri, NULL);
set_ugly_no_encode (false);
url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -306,9 +307,7 @@ append_url (const char *link_uri, int position, int size,
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
set_ugly_no_encode (true);
url = url_parse (complete_uri, NULL);
set_ugly_no_encode (false);
url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -573,9 +572,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
return;
/*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
set_current_charset (mcharset);
xfree (mcharset);
xfree_null (meta_charset);
meta_charset = mcharset;
}
else if (name && 0 == strcasecmp (name, "robots"))
{
@ -641,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
<base href=...> and does the right thing. */
struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
@ -681,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
/* If meta charset isn't null, override content encoding */
if (iri && meta_charset)
set_content_encoding (iri, meta_charset);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
@ -750,9 +753,7 @@ get_urls_file (const char *file)
url_text = merged;
}
set_ugly_no_encode (true);
url = url_parse (url_text, &up_error_code);
set_ugly_no_encode (false);
url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),

View File

@ -44,7 +44,7 @@ struct map_context {
};
struct urlpos *get_urls_file (const char *);
struct urlpos *get_urls_html (const char *, const char *, bool *);
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *);

View File

@ -49,7 +49,6 @@ as that of the covered work. */
#include "retr.h"
#include "connect.h"
#include "netrc.h"
#include "iri.h"
#ifdef HAVE_SSL
# include "ssl.h"
#endif
@ -1365,7 +1364,8 @@ free_hstat (struct http_stat *hs)
If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */
static uerr_t
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
struct iri *iri)
{
struct request *req;
@ -2058,7 +2058,11 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
set_current_charset (parse_charset (tmp2));
{
tmp = parse_charset (tmp2);
if (tmp)
set_content_encoding (iri, tmp);
}
}
}
hs->newloc = resp_header_strdup (resp, "Location");
@ -2333,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
retried, and retried, and retried, and... */
uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
int *dt, struct url *proxy)
int *dt, struct url *proxy, struct iri *iri)
{
int count;
bool got_head = false; /* used for time-stamping and filename detection */
@ -2497,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n"));
*dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */
err = gethttp (u, &hstat, dt, proxy);
err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */
tms = datetime_str (time (NULL));
@ -2576,9 +2580,9 @@ Spider mode enabled. Check if remote file exists.\n"));
}
/* Maybe we should always keep track of broken links, not just in
* spider mode.
* Don't log error if it was utf8 encoded because we will try
* one unencoded. */
else if (opt.spider && !get_utf8_encode ())
* Don't log error if it was UTF-8 encoded because we will try
* once unencoded. */
else if (opt.spider && !iri->utf8_encode)
{
/* #### Again: ugly ugly ugly! */
if (!hurl)

View File

@ -33,7 +33,7 @@ as that of the covered work. */
struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
struct url *);
struct url *, struct iri *);
void save_cookies (void);
void http_cleanup (void);
time_t http_atotm (const char *);

src/iri.c (126 changed lines)
View File

@ -46,18 +46,6 @@ as that of the covered work. */
/* Note: locale encoding is kept in options struct (opt.locale) */
/* Hold the encoding used for the current fetch */
char *remote;
/* Hold the encoding for the future found links */
char *current;
/* Will/Is the current URL encoded in utf8 ? */
bool utf8_encode;
/* Force no utf8 encoding for url_parse () */
bool ugly_no_encode;
static iconv_t locale2utf8;
static bool open_locale_to_utf8 (void);
@ -239,15 +227,15 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
on error. */
char *
idn_encode (char *host, bool utf8_encoded)
idn_encode (struct iri *i, char *host)
{
char *new;
int ret;
/* Encode to UTF-8 if not done using current remote */
if (!utf8_encoded)
/* Encode to UTF-8 if not done */
if (!i->utf8_encode)
{
if (!remote_to_utf8 ((const char *) host, (const char **) &new))
if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
{
/* Nothing to encode or an error occured */
return NULL;
@ -291,7 +279,7 @@ idn_decode (char *host)
/* Try to transcode string str from remote encoding to UTF-8. On success, *new
contains the transcoded string. *new content is unspecified otherwise. */
bool
remote_to_utf8 (const char *str, const char **new)
remote_to_utf8 (struct iri *i, const char *str, const char **new)
{
char *r;
iconv_t cd;
@ -299,8 +287,8 @@ remote_to_utf8 (const char *str, const char **new)
if (opt.encoding_remote)
r = opt.encoding_remote;
else if (current)
r = current;
else if (i->uri_encoding)
r = i->uri_encoding;
else
return false;
@ -323,90 +311,52 @@ remote_to_utf8 (const char *str, const char **new)
return ret;
}
char *get_remote_charset (void)
struct iri *
iri_new (void)
{
return remote;
}
char *get_current_charset (void)
{
return current;
}
void set_current_charset (char *charset)
{
/*printf("[ current = `%s'\n", charset);*/
if (current)
{
/* Do nothing if already equal */
if (!strcasecmp (current, charset))
return;
xfree (current);
}
current = charset ? xstrdup (charset) : NULL;
}
void set_current_as_locale (void)
{
/* sXXXav : assert opt.locale NULL ? */
/*printf("[ current = locale = `%s'\n", opt.locale);*/
if (current)
{
if (!strcasecmp (current, opt.locale))
return;
xfree (current);
}
current = xstrdup (opt.locale);
struct iri *i = xmalloc (sizeof (struct iri));
i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
i->content_encoding = NULL;
i->utf8_encode = opt.enable_iri;
}
void
set_remote_charset (char *charset)
iri_free (struct iri *i)
{
/*printf("[ remote = `%s'\n", charset);*/
if (remote)
{
/* Do nothing if already equal */
if (!strcasecmp (remote, charset))
return;
xfree (remote);
}
remote = charset ? xstrdup (charset) : NULL;
xfree_null (i->uri_encoding);
xfree_null (i->content_encoding);
xfree (i);
}
void
set_remote_as_current (void)
set_uri_encoding (struct iri *i, char *charset)
{
/*printf("[ remote = current = `%s'\n", current);*/
if (remote)
logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset);
if (opt.encoding_remote)
return;
if (i->uri_encoding)
{
/* Do nothing if already equal */
if (current && !strcasecmp (remote, current))
if (!strcasecmp (i->uri_encoding, charset))
return;
xfree (remote);
xfree (i->uri_encoding);
}
remote = current ? xstrdup (current) : NULL;
i->uri_encoding = charset ? xstrdup (charset) : NULL;
}
void reset_utf8_encode (void)
void
set_content_encoding (struct iri *i, char *charset)
{
set_utf8_encode (opt.enable_iri);
}
void set_utf8_encode (bool encode)
{
utf8_encode = encode;
}
bool get_utf8_encode (void)
{
return (!ugly_no_encode && utf8_encode);
}
void set_ugly_no_encode (bool ugly)
{
ugly_no_encode = ugly;
logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset);
if (opt.encoding_remote)
return;
if (i->content_encoding)
{
if (!strcasecmp (i->content_encoding, charset))
return;
xfree (i->content_encoding);
}
i->content_encoding = charset ? xstrdup (charset) : NULL;
}

View File

@ -30,49 +30,41 @@ as that of the covered work. */
#ifndef IRI_H
#define IRI_H
struct iri {
char *uri_encoding; /* Encoding of the uri to fetch */
char *content_encoding; /* Encoding of links inside the fetched file */
bool utf8_encode; /* Will/Is the current url encoded in utf8 */
};
#ifdef ENABLE_IRI
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
char *idn_encode (char *host, bool utf8_encoded);
char *idn_encode (struct iri *i, char *host);
char *idn_decode (char *host);
char *get_remote_charset (void);
char *get_current_charset (void);
void set_current_charset (char *charset);
void set_current_as_locale (void);
void set_current_charset (char *charset);
void set_remote_charset (char *charset);
void set_remote_as_current (void);
bool remote_to_utf8 (const char *str, const char **new);
void reset_utf8_encode (void);
void set_utf8_encode (bool encode);
bool get_utf8_encode (void);
/* ugly ugly ugly */
void set_ugly_no_encode (bool ugly);
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
struct iri *iri_new (void);
void iri_free (struct iri *i);
void set_uri_encoding (struct iri *i, char *charset);
void set_content_encoding (struct iri *i, char *charset);
#else /* ENABLE_IRI */
struct iri dummy_iri;
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
#define locale_to_utf8(str) (str)
#define idn_encode(str,encoded) NULL
#define idn_encode(a,b,c) NULL
#define idn_decode(str) NULL
#define get_remote_charset() NULL
#define get_current_charset() NULL
#define set_current_charset(str)
#define set_current_as_locale()
#define set_current_charset(str)
#define set_remote_charset(str)
#define set_remote_as_current()
#define remote_to_utf8(a,b) false
#define reset_utf8_encode()
#define set_utf8_encode(a)
#define get_utf8_encode() false
#define set_ugly_no_encode(a)
#define remote_to_utf8(a,b,c) false
#define iri_new() (&dummy_iri)
#define iri_free(a)
#define set_uri_encoding(a,b)
#define set_content_encoding(a,b)
#endif /* ENABLE_IRI */
#endif /* IRI_H */
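
When wget is built without ENABLE_IRI, the header above falls back to no-op macros and a single shared dummy_iri, so call sites compile unchanged and need no #ifdefs of their own. A hedged illustration follows; the helper function is made up, and only the macro behaviour comes from the header above.

/* With ENABLE_IRI:    allocates a struct iri and records the charset.
   Without ENABLE_IRI: iri_new () expands to (&dummy_iri) and
   set_uri_encoding () expands to nothing, so the same code still builds. */
static struct iri *
iri_for_cmdline_url (char *charset)
{
  struct iri *i = iri_new ();
  set_uri_encoding (i, charset);   /* charset argument is illustrative */
  return i;
}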

View File

@ -57,7 +57,6 @@ as that of the covered work. */
#include "convert.h"
#include "spider.h"
#include "http.h" /* for save_cookies */
#include "iri.h"
#include <getopt.h>
#include <getpass.h>
@ -1191,9 +1190,6 @@ WARNING: Can't reopen standard output in binary mode;\n\
char *filename = NULL, *redirected_URL = NULL;
int dt;
set_current_as_locale ();
set_ugly_no_encode (false);
if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
{
@ -1209,8 +1205,11 @@ WARNING: Can't reopen standard output in binary mode;\n\
}
else
{
set_remote_as_current ();
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale);
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
opt.recursive, i);
iri_free (i);
}
if (opt.delete_after && file_exists_p(filename))

View File

@ -61,7 +61,7 @@ struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
char *remote_encoding;
struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
@ -95,12 +95,12 @@ url_queue_delete (struct url_queue *queue)
into it. */
static void
url_enqueue (struct url_queue *queue,
url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
char *charset = get_current_charset ();
qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
@ -108,11 +108,6 @@ url_enqueue (struct url_queue *queue,
qel->css_allowed = css_allowed;
qel->next = NULL;
if (charset)
qel->remote_encoding = xstrdup (charset);
else
qel->remote_encoding = NULL;
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
@ -120,7 +115,8 @@ url_enqueue (struct url_queue *queue,
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
/*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
if (i)
printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
if (queue->tail)
queue->tail->next = qel;
@ -134,7 +130,7 @@ url_enqueue (struct url_queue *queue,
succeeded, or false if the queue is empty. */
static bool
url_dequeue (struct url_queue *queue,
url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
@ -147,10 +143,7 @@ url_dequeue (struct url_queue *queue,
if (!queue->head)
queue->tail = NULL;
set_remote_charset (qel->remote_encoding);
if (qel->remote_encoding)
xfree (qel->remote_encoding);
*i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
@ -167,9 +160,9 @@ url_dequeue (struct url_queue *queue,
}
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int,
struct url *, struct hash_table *);
struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
@ -207,10 +200,10 @@ retrieve_tree (const char *start_url)
int up_error_code;
struct url *start_url_parsed;
struct iri *i = iri_new ();
set_uri_encoding (i, opt.locale);
set_ugly_no_encode (true);
start_url_parsed= url_parse (start_url, &up_error_code);
set_ugly_no_encode (false);
start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
@ -223,7 +216,8 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
@ -242,7 +236,7 @@ retrieve_tree (const char *start_url)
/* Get the next URL from the queue... */
if (!url_dequeue (queue,
if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
@ -283,7 +277,8 @@ retrieve_tree (const char *start_url)
int dt = 0;
char *redirected = NULL;
status = retrieve_url (url, &file, &redirected, referer, &dt, false);
status = retrieve_url (url, &file, &redirected, referer, &dt,
false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
@ -311,7 +306,7 @@ retrieve_tree (const char *start_url)
if (descend)
{
if (!descend_redirect_p (redirected, url, depth,
start_url_parsed, blacklist))
start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
@ -363,7 +358,7 @@ retrieve_tree (const char *start_url)
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
get_urls_html (file, url, &meta_disallow_follow);
get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
@ -374,9 +369,8 @@ retrieve_tree (const char *start_url)
if (children)
{
struct urlpos *child = children;
set_ugly_no_encode (true);
struct url *url_parsed = url_parse (url, NULL);
set_ugly_no_encode (false);
struct url *url_parsed = url_parse (url, NULL, i);
struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
@ -393,9 +387,11 @@ retrieve_tree (const char *start_url)
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
blacklist))
blacklist, i))
{
url_enqueue (queue, xstrdup (child->url->url),
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding);
url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
@ -440,6 +436,7 @@ retrieve_tree (const char *start_url)
xfree (url);
xfree_null (referer);
xfree_null (file);
iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
@ -448,9 +445,11 @@ retrieve_tree (const char *start_url)
char *d1, *d2;
int d3;
bool d4, d5;
while (url_dequeue (queue,
struct iri *d6;
while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
iri_free (d6);
xfree (d1);
xfree_null (d2);
}
@ -479,7 +478,8 @@ retrieve_tree (const char *start_url)
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
@ -620,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!specs)
{
char *rfile;
if (res_retrieve_file (url, &rfile))
if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
@ -675,25 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
set_ugly_no_encode (true);
orig_parsed = url_parse (original, NULL);
orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL);
set_ugly_no_encode (false);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist);
start_url_parsed, blacklist, iri);
url_free (orig_parsed);
url_free (new_parsed);

View File

@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
Return true if robots were retrieved OK, false otherwise. */
bool
res_retrieve_file (const char *url, char **file)
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider;
/* Copy server URI encoding for a possible IDNA transformation, no need to
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
set_uri_encoding (i, iri->uri_encoding);
i->utf8_encode = false;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val;
opt.spider = saved_sp_val;
xfree (robots_url);
iri_free (i);
if (err != RETROK && *file != NULL)
{

View File

@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int);
bool res_retrieve_file (const char *, char **);
bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *);

View File

@ -598,7 +598,7 @@ static char *getproxy (struct url *);
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
const char *refurl, int *dt, bool recursive)
const char *refurl, int *dt, bool recursive, struct iri *iri)
{
uerr_t result;
char *url;
@ -626,10 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
if (file)
*file = NULL;
reset_utf8_encode ();
second_try:
u = url_parse (url, &up_error_code);
u = url_parse (url, &up_error_code, iri);
if (!u)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@ -637,7 +635,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
return URLERROR;
}
/*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
if (!refurl)
refurl = opt.referer;
@ -652,11 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
proxy = getproxy (u);
if (proxy)
{
/* sXXXav : support IRI for proxy */
/* sXXXav : could a proxy include a path ??? */
struct iri *pi = iri_new ();
set_uri_encoding (pi, opt.locale);
pi->utf8_encode = false;
/* Parse the proxy URL. */
set_ugly_no_encode (true);
proxy_url = url_parse (proxy, &up_error_code);
set_ugly_no_encode (false);
proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@ -681,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
@ -731,10 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
xfree (mynewloc);
mynewloc = construced_newloc;
reset_utf8_encode ();
/* Reset UTF-8 encoding state, keep the URI encoding and reset
the content encoding. */
iri->utf8_encode = opt.enable_iri;
set_content_encoding (iri, NULL);
/* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code);
newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@ -782,10 +785,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
}
/* Try to not encode in UTF-8 if fetching failed */
if (!(*dt & RETROKF) && get_utf8_encode ())
if (!(*dt & RETROKF) && iri->utf8_encode)
{
set_utf8_encode (false);
/*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
iri->utf8_encode = false;
printf ("[Fallbacking to non-utf8 for `%s'\n", url);
goto second_try;
}
@ -845,24 +848,28 @@ retrieve_from_file (const char *file, bool html, int *count)
{
uerr_t status;
struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_content_encoding (iri, opt.locale);
if (url_has_scheme (url))
{
uerr_t status;
status = retrieve_url (url, &input_file, NULL, NULL, NULL, false);
status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri);
if (status != RETROK)
return status;
}
else
input_file = (char *) file;
url_list = (html ? get_urls_html (input_file, NULL, NULL)
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@ -892,7 +899,8 @@ retrieve_from_file (const char *file, bool html, int *count)
opt.follow_ftp = old_follow_ftp;
}
else
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
&dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
@ -1064,9 +1072,10 @@ url_uses_proxy (const char *url)
{
bool ret;
struct url *u;
set_ugly_no_encode(true);
u= url_parse (url, NULL);
set_ugly_no_encode(false);
struct iri *i = iri_new();
/* url was given in the command line, so use locale as encoding */
set_uri_encoding (i, opt.locale);
u= url_parse (url, NULL, i);
if (!u)
return false;
ret = getproxy (u) != NULL;

View File

@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);

View File

@ -641,7 +641,7 @@ static const char *parse_errors[] = {
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
url_parse (const char *url, int *error)
url_parse (const char *url, int *error, struct iri *iri)
{
struct url *u;
const char *p;
@ -660,7 +660,7 @@ url_parse (const char *url, int *error)
int port;
char *user = NULL, *passwd = NULL;
char *url_encoded = NULL;
char *url_encoded = NULL, *new_url = NULL;
int error_code;
@ -671,20 +671,20 @@ url_parse (const char *url, int *error)
goto error;
}
if (opt.enable_iri && get_utf8_encode ())
if (iri && iri->utf8_encode)
{
const char *new;
bool utf8_encode;
url_unescape ((char *) url);
utf8_encode = remote_to_utf8 (url, &new);
set_utf8_encode (utf8_encode);
if (utf8_encode)
url = new;
iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
if (!iri->utf8_encode)
new_url = NULL;
}
url_encoded = reencode_escapes (url);
url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded;
if (new_url && url_encoded != new_url)
xfree (new_url);
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
@ -854,16 +854,17 @@ url_parse (const char *url, int *error)
{
url_unescape (u->host);
host_modified = true;
}
if (opt.enable_iri)
{
char *new = idn_encode (u->host, get_utf8_encode ());
if (new)
/* Apply IDNA regardless of iri->utf8_encode status */
if (opt.enable_iri && iri)
{
xfree (u->host);
u->host = new;
host_modified = true;
char *new = idn_encode (iri, u->host);
if (new)
{
xfree (u->host);
u->host = new;
host_modified = true;
}
}
}

View File

@ -84,7 +84,7 @@ struct url
char *url_escape (const char *);
struct url *url_parse (const char *, int *);
struct url *url_parse (const char *, int *, struct iri *iri);
const char *url_error (int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);

View File

@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
#include "quote.h"
#include "quotearg.h"
/* Likewise for struct iri definition */
#include "iri.h"
/* Useful macros used across the code: */
/* The number of elements in an array. For example: