mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Change global variable model for state-object
This commit is contained in:
parent
c31e00b52d
commit
d82f80ecab
@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
|
||||
|
||||
/* Parse the file... */
|
||||
urls = is_css ? get_urls_css_file (file, url) :
|
||||
get_urls_html (file, url, NULL);
|
||||
get_urls_html (file, url, NULL, NULL);
|
||||
|
||||
/* We don't respect meta_disallow_follow here because, even if
|
||||
the file is not followed, we might still want to convert the
|
||||
|
@ -44,7 +44,6 @@ as that of the covered work. */
|
||||
#include "recur.h"
|
||||
#include "html-url.h"
|
||||
#include "css-url.h"
|
||||
#include "iri.h"
|
||||
|
||||
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
|
||||
|
||||
@ -175,6 +174,10 @@ static const char *additional_attributes[] = {
|
||||
static struct hash_table *interesting_tags;
|
||||
static struct hash_table *interesting_attributes;
|
||||
|
||||
/* Will contains the (last) charset found in 'http-equiv=content-type'
|
||||
meta tags */
|
||||
static char *meta_charset;
|
||||
|
||||
static void
|
||||
init_interesting (void)
|
||||
{
|
||||
@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
set_ugly_no_encode (true);
|
||||
url = url_parse (link_uri, NULL);
|
||||
set_ugly_no_encode (false);
|
||||
url = url_parse (link_uri, NULL, NULL);
|
||||
if (!url)
|
||||
{
|
||||
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
||||
@ -306,9 +307,7 @@ append_url (const char *link_uri, int position, int size,
|
||||
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
|
||||
ctx->document_file, base, link_uri, complete_uri));
|
||||
|
||||
set_ugly_no_encode (true);
|
||||
url = url_parse (complete_uri, NULL);
|
||||
set_ugly_no_encode (false);
|
||||
url = url_parse (complete_uri, NULL, NULL);
|
||||
if (!url)
|
||||
{
|
||||
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
||||
@ -573,9 +572,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
return;
|
||||
|
||||
/*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
|
||||
|
||||
set_current_charset (mcharset);
|
||||
xfree (mcharset);
|
||||
xfree_null (meta_charset);
|
||||
meta_charset = mcharset;
|
||||
}
|
||||
else if (name && 0 == strcasecmp (name, "robots"))
|
||||
{
|
||||
@ -641,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
<base href=...> and does the right thing. */
|
||||
|
||||
struct urlpos *
|
||||
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
|
||||
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
|
||||
struct iri *iri)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
struct map_context ctx;
|
||||
@ -681,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
|
||||
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
|
||||
NULL, interesting_attributes);
|
||||
|
||||
/* If meta charset isn't null, override content encoding */
|
||||
if (iri && meta_charset)
|
||||
set_content_encoding (iri, meta_charset);
|
||||
|
||||
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
|
||||
if (meta_disallow_follow)
|
||||
*meta_disallow_follow = ctx.nofollow;
|
||||
@ -750,9 +753,7 @@ get_urls_file (const char *file)
|
||||
url_text = merged;
|
||||
}
|
||||
|
||||
set_ugly_no_encode (true);
|
||||
url = url_parse (url_text, &up_error_code);
|
||||
set_ugly_no_encode (false);
|
||||
url = url_parse (url_text, &up_error_code, NULL);
|
||||
if (!url)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
|
||||
|
@ -44,7 +44,7 @@ struct map_context {
|
||||
};
|
||||
|
||||
struct urlpos *get_urls_file (const char *);
|
||||
struct urlpos *get_urls_html (const char *, const char *, bool *);
|
||||
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
|
||||
struct urlpos *append_url (const char *, int, int, struct map_context *);
|
||||
void free_urlpos (struct urlpos *);
|
||||
|
||||
|
20
src/http.c
20
src/http.c
@ -49,7 +49,6 @@ as that of the covered work. */
|
||||
#include "retr.h"
|
||||
#include "connect.h"
|
||||
#include "netrc.h"
|
||||
#include "iri.h"
|
||||
#ifdef HAVE_SSL
|
||||
# include "ssl.h"
|
||||
#endif
|
||||
@ -1365,7 +1364,8 @@ free_hstat (struct http_stat *hs)
|
||||
If PROXY is non-NULL, the connection will be made to the proxy
|
||||
server, and u->url will be requested. */
|
||||
static uerr_t
|
||||
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
|
||||
gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
struct iri *iri)
|
||||
{
|
||||
struct request *req;
|
||||
|
||||
@ -2058,7 +2058,11 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
|
||||
|
||||
/* Try to get remote encoding if needed */
|
||||
if (opt.enable_iri && !opt.encoding_remote)
|
||||
set_current_charset (parse_charset (tmp2));
|
||||
{
|
||||
tmp = parse_charset (tmp2);
|
||||
if (tmp)
|
||||
set_content_encoding (iri, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
hs->newloc = resp_header_strdup (resp, "Location");
|
||||
@ -2333,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
|
||||
retried, and retried, and retried, and... */
|
||||
uerr_t
|
||||
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
||||
int *dt, struct url *proxy)
|
||||
int *dt, struct url *proxy, struct iri *iri)
|
||||
{
|
||||
int count;
|
||||
bool got_head = false; /* used for time-stamping and filename detection */
|
||||
@ -2497,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n"));
|
||||
*dt &= ~SEND_NOCACHE;
|
||||
|
||||
/* Try fetching the document, or at least its head. */
|
||||
err = gethttp (u, &hstat, dt, proxy);
|
||||
err = gethttp (u, &hstat, dt, proxy, iri);
|
||||
|
||||
/* Time? */
|
||||
tms = datetime_str (time (NULL));
|
||||
@ -2576,9 +2580,9 @@ Spider mode enabled. Check if remote file exists.\n"));
|
||||
}
|
||||
/* Maybe we should always keep track of broken links, not just in
|
||||
* spider mode.
|
||||
* Don't log error if it was utf8 encoded because we will try
|
||||
* one unencoded. */
|
||||
else if (opt.spider && !get_utf8_encode ())
|
||||
* Don't log error if it was UTF-8 encoded because we will try
|
||||
* once unencoded. */
|
||||
else if (opt.spider && !iri->utf8_encode)
|
||||
{
|
||||
/* #### Again: ugly ugly ugly! */
|
||||
if (!hurl)
|
||||
|
@ -33,7 +33,7 @@ as that of the covered work. */
|
||||
struct url;
|
||||
|
||||
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
|
||||
struct url *);
|
||||
struct url *, struct iri *);
|
||||
void save_cookies (void);
|
||||
void http_cleanup (void);
|
||||
time_t http_atotm (const char *);
|
||||
|
126
src/iri.c
126
src/iri.c
@ -46,18 +46,6 @@ as that of the covered work. */
|
||||
|
||||
/* Note: locale encoding is kept in options struct (opt.locale) */
|
||||
|
||||
/* Hold the encoding used for the current fetch */
|
||||
char *remote;
|
||||
|
||||
/* Hold the encoding for the future found links */
|
||||
char *current;
|
||||
|
||||
/* Will/Is the current URL encoded in utf8 ? */
|
||||
bool utf8_encode;
|
||||
|
||||
/* Force no utf8 encoding for url_parse () */
|
||||
bool ugly_no_encode;
|
||||
|
||||
static iconv_t locale2utf8;
|
||||
|
||||
static bool open_locale_to_utf8 (void);
|
||||
@ -239,15 +227,15 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
|
||||
/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
|
||||
on error. */
|
||||
char *
|
||||
idn_encode (char *host, bool utf8_encoded)
|
||||
idn_encode (struct iri *i, char *host)
|
||||
{
|
||||
char *new;
|
||||
int ret;
|
||||
|
||||
/* Encode to UTF-8 if not done using current remote */
|
||||
if (!utf8_encoded)
|
||||
/* Encode to UTF-8 if not done */
|
||||
if (!i->utf8_encode)
|
||||
{
|
||||
if (!remote_to_utf8 ((const char *) host, (const char **) &new))
|
||||
if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
|
||||
{
|
||||
/* Nothing to encode or an error occured */
|
||||
return NULL;
|
||||
@ -291,7 +279,7 @@ idn_decode (char *host)
|
||||
/* Try to transcode string str from remote encoding to UTF-8. On success, *new
|
||||
contains the transcoded string. *new content is unspecified otherwise. */
|
||||
bool
|
||||
remote_to_utf8 (const char *str, const char **new)
|
||||
remote_to_utf8 (struct iri *i, const char *str, const char **new)
|
||||
{
|
||||
char *r;
|
||||
iconv_t cd;
|
||||
@ -299,8 +287,8 @@ remote_to_utf8 (const char *str, const char **new)
|
||||
|
||||
if (opt.encoding_remote)
|
||||
r = opt.encoding_remote;
|
||||
else if (current)
|
||||
r = current;
|
||||
else if (i->uri_encoding)
|
||||
r = i->uri_encoding;
|
||||
else
|
||||
return false;
|
||||
|
||||
@ -323,90 +311,52 @@ remote_to_utf8 (const char *str, const char **new)
|
||||
return ret;
|
||||
}
|
||||
|
||||
char *get_remote_charset (void)
|
||||
struct iri *
|
||||
iri_new (void)
|
||||
{
|
||||
return remote;
|
||||
}
|
||||
|
||||
char *get_current_charset (void)
|
||||
{
|
||||
return current;
|
||||
}
|
||||
|
||||
void set_current_charset (char *charset)
|
||||
{
|
||||
/*printf("[ current = `%s'\n", charset);*/
|
||||
if (current)
|
||||
{
|
||||
/* Do nothing if already equal */
|
||||
if (!strcasecmp (current, charset))
|
||||
return;
|
||||
xfree (current);
|
||||
}
|
||||
|
||||
current = charset ? xstrdup (charset) : NULL;
|
||||
}
|
||||
|
||||
void set_current_as_locale (void)
|
||||
{
|
||||
/* sXXXav : assert opt.locale NULL ? */
|
||||
/*printf("[ current = locale = `%s'\n", opt.locale);*/
|
||||
if (current)
|
||||
{
|
||||
if (!strcasecmp (current, opt.locale))
|
||||
return;
|
||||
xfree (current);
|
||||
}
|
||||
|
||||
current = xstrdup (opt.locale);
|
||||
struct iri *i = xmalloc (sizeof (struct iri));
|
||||
i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
|
||||
i->content_encoding = NULL;
|
||||
i->utf8_encode = opt.enable_iri;
|
||||
}
|
||||
|
||||
void
|
||||
set_remote_charset (char *charset)
|
||||
iri_free (struct iri *i)
|
||||
{
|
||||
/*printf("[ remote = `%s'\n", charset);*/
|
||||
if (remote)
|
||||
{
|
||||
/* Do nothing if already equal */
|
||||
if (!strcasecmp (remote, charset))
|
||||
return;
|
||||
xfree (remote);
|
||||
}
|
||||
remote = charset ? xstrdup (charset) : NULL;
|
||||
xfree_null (i->uri_encoding);
|
||||
xfree_null (i->content_encoding);
|
||||
xfree (i);
|
||||
}
|
||||
|
||||
void
|
||||
set_remote_as_current (void)
|
||||
set_uri_encoding (struct iri *i, char *charset)
|
||||
{
|
||||
/*printf("[ remote = current = `%s'\n", current);*/
|
||||
if (remote)
|
||||
logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset);
|
||||
if (opt.encoding_remote)
|
||||
return;
|
||||
if (i->uri_encoding)
|
||||
{
|
||||
/* Do nothing if already equal */
|
||||
if (current && !strcasecmp (remote, current))
|
||||
if (!strcasecmp (i->uri_encoding, charset))
|
||||
return;
|
||||
xfree (remote);
|
||||
xfree (i->uri_encoding);
|
||||
}
|
||||
|
||||
remote = current ? xstrdup (current) : NULL;
|
||||
i->uri_encoding = charset ? xstrdup (charset) : NULL;
|
||||
}
|
||||
|
||||
void reset_utf8_encode (void)
|
||||
void
|
||||
set_content_encoding (struct iri *i, char *charset)
|
||||
{
|
||||
set_utf8_encode (opt.enable_iri);
|
||||
}
|
||||
|
||||
void set_utf8_encode (bool encode)
|
||||
{
|
||||
utf8_encode = encode;
|
||||
}
|
||||
|
||||
bool get_utf8_encode (void)
|
||||
{
|
||||
return (!ugly_no_encode && utf8_encode);
|
||||
}
|
||||
|
||||
void set_ugly_no_encode (bool ugly)
|
||||
{
|
||||
ugly_no_encode = ugly;
|
||||
logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset);
|
||||
if (opt.encoding_remote)
|
||||
return;
|
||||
if (i->content_encoding)
|
||||
{
|
||||
if (!strcasecmp (i->content_encoding, charset))
|
||||
return;
|
||||
xfree (i->content_encoding);
|
||||
}
|
||||
|
||||
i->content_encoding = charset ? xstrdup (charset) : NULL;
|
||||
}
|
||||
|
||||
|
48
src/iri.h
48
src/iri.h
@ -30,49 +30,41 @@ as that of the covered work. */
|
||||
#ifndef IRI_H
|
||||
#define IRI_H
|
||||
|
||||
struct iri {
|
||||
char *uri_encoding; /* Encoding of the uri to fetch */
|
||||
char *content_encoding; /* Encoding of links inside the fetched file */
|
||||
bool utf8_encode; /* Will/Is the current url encoded in utf8 */
|
||||
};
|
||||
|
||||
#ifdef ENABLE_IRI
|
||||
|
||||
char *parse_charset (char *str);
|
||||
char *find_locale (void);
|
||||
bool check_encoding_name (char *encoding);
|
||||
const char *locale_to_utf8 (const char *str);
|
||||
char *idn_encode (char *host, bool utf8_encoded);
|
||||
char *idn_encode (struct iri *i, char *host);
|
||||
char *idn_decode (char *host);
|
||||
char *get_remote_charset (void);
|
||||
char *get_current_charset (void);
|
||||
void set_current_charset (char *charset);
|
||||
void set_current_as_locale (void);
|
||||
void set_current_charset (char *charset);
|
||||
void set_remote_charset (char *charset);
|
||||
void set_remote_as_current (void);
|
||||
bool remote_to_utf8 (const char *str, const char **new);
|
||||
void reset_utf8_encode (void);
|
||||
void set_utf8_encode (bool encode);
|
||||
bool get_utf8_encode (void);
|
||||
|
||||
/* ugly ugly ugly */
|
||||
void set_ugly_no_encode (bool ugly);
|
||||
bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
|
||||
struct iri *iri_new (void);
|
||||
void iri_free (struct iri *i);
|
||||
void set_uri_encoding (struct iri *i, char *charset);
|
||||
void set_content_encoding (struct iri *i, char *charset);
|
||||
|
||||
#else /* ENABLE_IRI */
|
||||
|
||||
struct iri dummy_iri;
|
||||
|
||||
#define parse_charset(str) NULL
|
||||
#define find_locale() NULL
|
||||
#define check_encoding_name(str) false
|
||||
#define locale_to_utf8(str) (str)
|
||||
#define idn_encode(str,encoded) NULL
|
||||
#define idn_encode(a,b,c) NULL
|
||||
#define idn_decode(str) NULL
|
||||
#define get_remote_charset() NULL
|
||||
#define get_current_charset() NULL
|
||||
#define set_current_charset(str)
|
||||
#define set_current_as_locale()
|
||||
#define set_current_charset(str)
|
||||
#define set_remote_charset(str)
|
||||
#define set_remote_as_current()
|
||||
#define remote_to_utf8(a,b) false
|
||||
#define reset_utf8_encode()
|
||||
#define set_utf8_encode(a)
|
||||
#define get_utf8_encode() false
|
||||
#define set_ugly_no_encode(a)
|
||||
#define remote_to_utf8(a,b,c) false
|
||||
#define iri_new() (&dummy_iri)
|
||||
#define iri_free(a)
|
||||
#define set_uri_encoding(a,b)
|
||||
#define set_content_encoding(a,b)
|
||||
|
||||
#endif /* ENABLE_IRI */
|
||||
#endif /* IRI_H */
|
||||
|
11
src/main.c
11
src/main.c
@ -57,7 +57,6 @@ as that of the covered work. */
|
||||
#include "convert.h"
|
||||
#include "spider.h"
|
||||
#include "http.h" /* for save_cookies */
|
||||
#include "iri.h"
|
||||
|
||||
#include <getopt.h>
|
||||
#include <getpass.h>
|
||||
@ -1191,9 +1190,6 @@ WARNING: Can't reopen standard output in binary mode;\n\
|
||||
char *filename = NULL, *redirected_URL = NULL;
|
||||
int dt;
|
||||
|
||||
set_current_as_locale ();
|
||||
set_ugly_no_encode (false);
|
||||
|
||||
if ((opt.recursive || opt.page_requisites)
|
||||
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
|
||||
{
|
||||
@ -1209,8 +1205,11 @@ WARNING: Can't reopen standard output in binary mode;\n\
|
||||
}
|
||||
else
|
||||
{
|
||||
set_remote_as_current ();
|
||||
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
|
||||
struct iri *i = iri_new ();
|
||||
set_uri_encoding (i, opt.locale);
|
||||
status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
|
||||
opt.recursive, i);
|
||||
iri_free (i);
|
||||
}
|
||||
|
||||
if (opt.delete_after && file_exists_p(filename))
|
||||
|
75
src/recur.c
75
src/recur.c
@ -61,7 +61,7 @@ struct queue_element {
|
||||
int depth; /* the depth */
|
||||
bool html_allowed; /* whether the document is allowed to
|
||||
be treated as HTML. */
|
||||
char *remote_encoding;
|
||||
struct iri *iri; /* sXXXav */
|
||||
bool css_allowed; /* whether the document is allowed to
|
||||
be treated as CSS. */
|
||||
struct queue_element *next; /* next element in queue */
|
||||
@ -95,12 +95,12 @@ url_queue_delete (struct url_queue *queue)
|
||||
into it. */
|
||||
|
||||
static void
|
||||
url_enqueue (struct url_queue *queue,
|
||||
url_enqueue (struct url_queue *queue, struct iri *i,
|
||||
const char *url, const char *referer, int depth,
|
||||
bool html_allowed, bool css_allowed)
|
||||
{
|
||||
struct queue_element *qel = xnew (struct queue_element);
|
||||
char *charset = get_current_charset ();
|
||||
qel->iri = i;
|
||||
qel->url = url;
|
||||
qel->referer = referer;
|
||||
qel->depth = depth;
|
||||
@ -108,11 +108,6 @@ url_enqueue (struct url_queue *queue,
|
||||
qel->css_allowed = css_allowed;
|
||||
qel->next = NULL;
|
||||
|
||||
if (charset)
|
||||
qel->remote_encoding = xstrdup (charset);
|
||||
else
|
||||
qel->remote_encoding = NULL;
|
||||
|
||||
++queue->count;
|
||||
if (queue->count > queue->maxcount)
|
||||
queue->maxcount = queue->count;
|
||||
@ -120,7 +115,8 @@ url_enqueue (struct url_queue *queue,
|
||||
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
|
||||
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
|
||||
|
||||
/*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
|
||||
if (i)
|
||||
printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
|
||||
|
||||
if (queue->tail)
|
||||
queue->tail->next = qel;
|
||||
@ -134,7 +130,7 @@ url_enqueue (struct url_queue *queue,
|
||||
succeeded, or false if the queue is empty. */
|
||||
|
||||
static bool
|
||||
url_dequeue (struct url_queue *queue,
|
||||
url_dequeue (struct url_queue *queue, struct iri **i,
|
||||
const char **url, const char **referer, int *depth,
|
||||
bool *html_allowed, bool *css_allowed)
|
||||
{
|
||||
@ -147,10 +143,7 @@ url_dequeue (struct url_queue *queue,
|
||||
if (!queue->head)
|
||||
queue->tail = NULL;
|
||||
|
||||
set_remote_charset (qel->remote_encoding);
|
||||
if (qel->remote_encoding)
|
||||
xfree (qel->remote_encoding);
|
||||
|
||||
*i = qel->iri;
|
||||
*url = qel->url;
|
||||
*referer = qel->referer;
|
||||
*depth = qel->depth;
|
||||
@ -167,9 +160,9 @@ url_dequeue (struct url_queue *queue,
|
||||
}
|
||||
|
||||
static bool download_child_p (const struct urlpos *, struct url *, int,
|
||||
struct url *, struct hash_table *);
|
||||
struct url *, struct hash_table *, struct iri *);
|
||||
static bool descend_redirect_p (const char *, const char *, int,
|
||||
struct url *, struct hash_table *);
|
||||
struct url *, struct hash_table *, struct iri *);
|
||||
|
||||
|
||||
/* Retrieve a part of the web beginning with START_URL. This used to
|
||||
@ -207,10 +200,10 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
int up_error_code;
|
||||
struct url *start_url_parsed;
|
||||
struct iri *i = iri_new ();
|
||||
set_uri_encoding (i, opt.locale);
|
||||
|
||||
set_ugly_no_encode (true);
|
||||
start_url_parsed= url_parse (start_url, &up_error_code);
|
||||
set_ugly_no_encode (false);
|
||||
start_url_parsed = url_parse (start_url, &up_error_code, i);
|
||||
if (!start_url_parsed)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
|
||||
@ -223,7 +216,8 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
||||
just URL so we enqueue the canonical form of the URL. */
|
||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
|
||||
url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
|
||||
false);
|
||||
string_set_add (blacklist, start_url_parsed->url);
|
||||
|
||||
while (1)
|
||||
@ -242,7 +236,7 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
/* Get the next URL from the queue... */
|
||||
|
||||
if (!url_dequeue (queue,
|
||||
if (!url_dequeue (queue, (struct iri **) &i,
|
||||
(const char **)&url, (const char **)&referer,
|
||||
&depth, &html_allowed, &css_allowed))
|
||||
break;
|
||||
@ -283,7 +277,8 @@ retrieve_tree (const char *start_url)
|
||||
int dt = 0;
|
||||
char *redirected = NULL;
|
||||
|
||||
status = retrieve_url (url, &file, &redirected, referer, &dt, false);
|
||||
status = retrieve_url (url, &file, &redirected, referer, &dt,
|
||||
false, i);
|
||||
|
||||
if (html_allowed && file && status == RETROK
|
||||
&& (dt & RETROKF) && (dt & TEXTHTML))
|
||||
@ -311,7 +306,7 @@ retrieve_tree (const char *start_url)
|
||||
if (descend)
|
||||
{
|
||||
if (!descend_redirect_p (redirected, url, depth,
|
||||
start_url_parsed, blacklist))
|
||||
start_url_parsed, blacklist, i))
|
||||
descend = false;
|
||||
else
|
||||
/* Make sure that the old pre-redirect form gets
|
||||
@ -363,7 +358,7 @@ retrieve_tree (const char *start_url)
|
||||
bool meta_disallow_follow = false;
|
||||
struct urlpos *children
|
||||
= is_css ? get_urls_css_file (file, url) :
|
||||
get_urls_html (file, url, &meta_disallow_follow);
|
||||
get_urls_html (file, url, &meta_disallow_follow, i);
|
||||
|
||||
if (opt.use_robots && meta_disallow_follow)
|
||||
{
|
||||
@ -374,9 +369,8 @@ retrieve_tree (const char *start_url)
|
||||
if (children)
|
||||
{
|
||||
struct urlpos *child = children;
|
||||
set_ugly_no_encode (true);
|
||||
struct url *url_parsed = url_parse (url, NULL);
|
||||
set_ugly_no_encode (false);
|
||||
struct url *url_parsed = url_parse (url, NULL, i);
|
||||
struct iri *ci;
|
||||
char *referer_url = url;
|
||||
bool strip_auth = (url_parsed != NULL
|
||||
&& url_parsed->user != NULL);
|
||||
@ -393,9 +387,11 @@ retrieve_tree (const char *start_url)
|
||||
if (dash_p_leaf_HTML && !child->link_inline_p)
|
||||
continue;
|
||||
if (download_child_p (child, url_parsed, depth, start_url_parsed,
|
||||
blacklist))
|
||||
blacklist, i))
|
||||
{
|
||||
url_enqueue (queue, xstrdup (child->url->url),
|
||||
ci = iri_new ();
|
||||
set_uri_encoding (ci, i->content_encoding);
|
||||
url_enqueue (queue, ci, xstrdup (child->url->url),
|
||||
xstrdup (referer_url), depth + 1,
|
||||
child->link_expect_html,
|
||||
child->link_expect_css);
|
||||
@ -440,6 +436,7 @@ retrieve_tree (const char *start_url)
|
||||
xfree (url);
|
||||
xfree_null (referer);
|
||||
xfree_null (file);
|
||||
iri_free (i);
|
||||
}
|
||||
|
||||
/* If anything is left of the queue due to a premature exit, free it
|
||||
@ -448,9 +445,11 @@ retrieve_tree (const char *start_url)
|
||||
char *d1, *d2;
|
||||
int d3;
|
||||
bool d4, d5;
|
||||
while (url_dequeue (queue,
|
||||
struct iri *d6;
|
||||
while (url_dequeue (queue, (struct iri **)&d6,
|
||||
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
|
||||
{
|
||||
iri_free (d6);
|
||||
xfree (d1);
|
||||
xfree_null (d2);
|
||||
}
|
||||
@ -479,7 +478,8 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
static bool
|
||||
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
struct url *start_url_parsed, struct hash_table *blacklist)
|
||||
struct url *start_url_parsed, struct hash_table *blacklist,
|
||||
struct iri *iri)
|
||||
{
|
||||
struct url *u = upos->url;
|
||||
const char *url = u->url;
|
||||
@ -620,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (!specs)
|
||||
{
|
||||
char *rfile;
|
||||
if (res_retrieve_file (url, &rfile))
|
||||
if (res_retrieve_file (url, &rfile, iri))
|
||||
{
|
||||
specs = res_parse_from_file (rfile);
|
||||
|
||||
@ -675,25 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
|
||||
static bool
|
||||
descend_redirect_p (const char *redirected, const char *original, int depth,
|
||||
struct url *start_url_parsed, struct hash_table *blacklist)
|
||||
struct url *start_url_parsed, struct hash_table *blacklist,
|
||||
struct iri *iri)
|
||||
{
|
||||
struct url *orig_parsed, *new_parsed;
|
||||
struct urlpos *upos;
|
||||
bool success;
|
||||
|
||||
set_ugly_no_encode (true);
|
||||
orig_parsed = url_parse (original, NULL);
|
||||
orig_parsed = url_parse (original, NULL, NULL);
|
||||
assert (orig_parsed != NULL);
|
||||
|
||||
new_parsed = url_parse (redirected, NULL);
|
||||
new_parsed = url_parse (redirected, NULL, NULL);
|
||||
assert (new_parsed != NULL);
|
||||
set_ugly_no_encode (false);
|
||||
|
||||
upos = xnew0 (struct urlpos);
|
||||
upos->url = new_parsed;
|
||||
|
||||
success = download_child_p (upos, orig_parsed, depth,
|
||||
start_url_parsed, blacklist);
|
||||
start_url_parsed, blacklist, iri);
|
||||
|
||||
url_free (orig_parsed);
|
||||
url_free (new_parsed);
|
||||
|
13
src/res.c
13
src/res.c
@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
|
||||
Return true if robots were retrieved OK, false otherwise. */
|
||||
|
||||
bool
|
||||
res_retrieve_file (const char *url, char **file)
|
||||
res_retrieve_file (const char *url, char **file, struct iri *iri)
|
||||
{
|
||||
struct iri *i = iri_new ();
|
||||
uerr_t err;
|
||||
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
||||
int saved_ts_val = opt.timestamping;
|
||||
int saved_sp_val = opt.spider;
|
||||
|
||||
/* Copy server URI encoding for a possible IDNA transformation, no need to
|
||||
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
|
||||
set_uri_encoding (i, iri->uri_encoding);
|
||||
i->utf8_encode = false;
|
||||
|
||||
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
|
||||
*file = NULL;
|
||||
opt.timestamping = false;
|
||||
opt.spider = false;
|
||||
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
|
||||
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
|
||||
opt.timestamping = saved_ts_val;
|
||||
opt.spider = saved_sp_val;
|
||||
opt.spider = saved_sp_val;
|
||||
xfree (robots_url);
|
||||
iri_free (i);
|
||||
|
||||
if (err != RETROK && *file != NULL)
|
||||
{
|
||||
|
@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
|
||||
void res_register_specs (const char *, int, struct robot_specs *);
|
||||
struct robot_specs *res_get_specs (const char *, int);
|
||||
|
||||
bool res_retrieve_file (const char *, char **);
|
||||
bool res_retrieve_file (const char *, char **, struct iri *);
|
||||
|
||||
bool is_robots_txt_url (const char *);
|
||||
|
||||
|
53
src/retr.c
53
src/retr.c
@ -598,7 +598,7 @@ static char *getproxy (struct url *);
|
||||
|
||||
uerr_t
|
||||
retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
const char *refurl, int *dt, bool recursive)
|
||||
const char *refurl, int *dt, bool recursive, struct iri *iri)
|
||||
{
|
||||
uerr_t result;
|
||||
char *url;
|
||||
@ -626,10 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
if (file)
|
||||
*file = NULL;
|
||||
|
||||
reset_utf8_encode ();
|
||||
|
||||
second_try:
|
||||
u = url_parse (url, &up_error_code);
|
||||
u = url_parse (url, &up_error_code, iri);
|
||||
if (!u)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
|
||||
@ -637,7 +635,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
return URLERROR;
|
||||
}
|
||||
|
||||
/*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
|
||||
printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
|
||||
|
||||
if (!refurl)
|
||||
refurl = opt.referer;
|
||||
@ -652,11 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
proxy = getproxy (u);
|
||||
if (proxy)
|
||||
{
|
||||
/* sXXXav : support IRI for proxy */
|
||||
/* sXXXav : could a proxy include a path ??? */
|
||||
struct iri *pi = iri_new ();
|
||||
set_uri_encoding (pi, opt.locale);
|
||||
pi->utf8_encode = false;
|
||||
|
||||
/* Parse the proxy URL. */
|
||||
set_ugly_no_encode (true);
|
||||
proxy_url = url_parse (proxy, &up_error_code);
|
||||
set_ugly_no_encode (false);
|
||||
proxy_url = url_parse (proxy, &up_error_code, NULL);
|
||||
if (!proxy_url)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
|
||||
@ -681,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
#endif
|
||||
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
|
||||
{
|
||||
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
|
||||
result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
|
||||
}
|
||||
else if (u->scheme == SCHEME_FTP)
|
||||
{
|
||||
@ -731,10 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
xfree (mynewloc);
|
||||
mynewloc = construced_newloc;
|
||||
|
||||
reset_utf8_encode ();
|
||||
/* Reset UTF-8 encoding state, keep the URI encoding and reset
|
||||
the content encoding. */
|
||||
iri->utf8_encode = opt.enable_iri;
|
||||
set_content_encoding (iri, NULL);
|
||||
|
||||
/* Now, see if this new location makes sense. */
|
||||
newloc_parsed = url_parse (mynewloc, &up_error_code);
|
||||
newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
|
||||
if (!newloc_parsed)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
|
||||
@ -782,10 +785,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
}
|
||||
|
||||
/* Try to not encode in UTF-8 if fetching failed */
|
||||
if (!(*dt & RETROKF) && get_utf8_encode ())
|
||||
if (!(*dt & RETROKF) && iri->utf8_encode)
|
||||
{
|
||||
set_utf8_encode (false);
|
||||
/*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
|
||||
iri->utf8_encode = false;
|
||||
printf ("[Fallbacking to non-utf8 for `%s'\n", url);
|
||||
goto second_try;
|
||||
}
|
||||
|
||||
@ -845,24 +848,28 @@ retrieve_from_file (const char *file, bool html, int *count)
|
||||
{
|
||||
uerr_t status;
|
||||
struct urlpos *url_list, *cur_url;
|
||||
struct iri *iri = iri_new();
|
||||
|
||||
char *input_file = NULL;
|
||||
const char *url = file;
|
||||
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
||||
|
||||
/* sXXXav : Assume filename and links in the file are in the locale */
|
||||
set_content_encoding (iri, opt.locale);
|
||||
|
||||
if (url_has_scheme (url))
|
||||
{
|
||||
uerr_t status;
|
||||
status = retrieve_url (url, &input_file, NULL, NULL, NULL, false);
|
||||
status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri);
|
||||
if (status != RETROK)
|
||||
return status;
|
||||
}
|
||||
else
|
||||
input_file = (char *) file;
|
||||
|
||||
url_list = (html ? get_urls_html (input_file, NULL, NULL)
|
||||
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
|
||||
: get_urls_file (input_file));
|
||||
|
||||
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
|
||||
@ -892,7 +899,8 @@ retrieve_from_file (const char *file, bool html, int *count)
|
||||
opt.follow_ftp = old_follow_ftp;
|
||||
}
|
||||
else
|
||||
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
|
||||
status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
|
||||
&dt, opt.recursive, iri);
|
||||
|
||||
if (filename && opt.delete_after && file_exists_p (filename))
|
||||
{
|
||||
@ -1064,9 +1072,10 @@ url_uses_proxy (const char *url)
|
||||
{
|
||||
bool ret;
|
||||
struct url *u;
|
||||
set_ugly_no_encode(true);
|
||||
u= url_parse (url, NULL);
|
||||
set_ugly_no_encode(false);
|
||||
struct iri *i = iri_new();
|
||||
/* url was given in the command line, so use locale as encoding */
|
||||
set_uri_encoding (i, opt.locale);
|
||||
u= url_parse (url, NULL, i);
|
||||
if (!u)
|
||||
return false;
|
||||
ret = getproxy (u) != NULL;
|
||||
|
@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
|
||||
char *fd_read_hunk (int, hunk_terminator_t, long, long);
|
||||
char *fd_read_line (int);
|
||||
|
||||
uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
|
||||
uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
|
||||
bool, struct iri *);
|
||||
uerr_t retrieve_from_file (const char *, bool, int *);
|
||||
|
||||
const char *retr_rate (wgint, double);
|
||||
|
37
src/url.c
37
src/url.c
@ -641,7 +641,7 @@ static const char *parse_errors[] = {
|
||||
error, and if ERROR is not NULL, also set *ERROR to the appropriate
|
||||
error code. */
|
||||
struct url *
|
||||
url_parse (const char *url, int *error)
|
||||
url_parse (const char *url, int *error, struct iri *iri)
|
||||
{
|
||||
struct url *u;
|
||||
const char *p;
|
||||
@ -660,7 +660,7 @@ url_parse (const char *url, int *error)
|
||||
int port;
|
||||
char *user = NULL, *passwd = NULL;
|
||||
|
||||
char *url_encoded = NULL;
|
||||
char *url_encoded = NULL, *new_url = NULL;
|
||||
|
||||
int error_code;
|
||||
|
||||
@ -671,20 +671,20 @@ url_parse (const char *url, int *error)
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (opt.enable_iri && get_utf8_encode ())
|
||||
if (iri && iri->utf8_encode)
|
||||
{
|
||||
const char *new;
|
||||
bool utf8_encode;
|
||||
url_unescape ((char *) url);
|
||||
utf8_encode = remote_to_utf8 (url, &new);
|
||||
set_utf8_encode (utf8_encode);
|
||||
if (utf8_encode)
|
||||
url = new;
|
||||
iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
|
||||
if (!iri->utf8_encode)
|
||||
new_url = NULL;
|
||||
}
|
||||
|
||||
url_encoded = reencode_escapes (url);
|
||||
url_encoded = reencode_escapes (new_url ? new_url : url);
|
||||
p = url_encoded;
|
||||
|
||||
if (new_url && url_encoded != new_url)
|
||||
xfree (new_url);
|
||||
|
||||
p += strlen (supported_schemes[scheme].leading_string);
|
||||
uname_b = p;
|
||||
p = url_skip_credentials (p);
|
||||
@ -854,16 +854,17 @@ url_parse (const char *url, int *error)
|
||||
{
|
||||
url_unescape (u->host);
|
||||
host_modified = true;
|
||||
}
|
||||
|
||||
if (opt.enable_iri)
|
||||
{
|
||||
char *new = idn_encode (u->host, get_utf8_encode ());
|
||||
if (new)
|
||||
/* Apply IDNA regardless of iri->utf8_encode status */
|
||||
if (opt.enable_iri && iri)
|
||||
{
|
||||
xfree (u->host);
|
||||
u->host = new;
|
||||
host_modified = true;
|
||||
char *new = idn_encode (iri, u->host);
|
||||
if (new)
|
||||
{
|
||||
xfree (u->host);
|
||||
u->host = new;
|
||||
host_modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -84,7 +84,7 @@ struct url
|
||||
|
||||
char *url_escape (const char *);
|
||||
|
||||
struct url *url_parse (const char *, int *);
|
||||
struct url *url_parse (const char *, int *, struct iri *iri);
|
||||
const char *url_error (int);
|
||||
char *url_full_path (const struct url *);
|
||||
void url_set_dir (struct url *, const char *);
|
||||
|
@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
|
||||
#include "quote.h"
|
||||
#include "quotearg.h"
|
||||
|
||||
/* Likewise for struct iri definition */
|
||||
#include "iri.h"
|
||||
|
||||
/* Useful macros used across the code: */
|
||||
|
||||
/* The number of elements in an array. For example:
|
||||
|
Loading…
Reference in New Issue
Block a user