mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Don't descend into HTML that was downloaded by following <img src=...>
or other links that are not expected to yield HTML.
This commit is contained in:
parent
37e70109a4
commit
1b3cdef574
@ -1,3 +1,13 @@
|
||||
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* recur.c (retrieve_tree): Don't descend into documents that are
|
||||
not expected to contain HTML, regardless of their content-type.
|
||||
|
||||
* html-url.c (tag_url_attributes): Record which attributes are
|
||||
supposed to yield HTML links that can be followed.
|
||||
(tag_find_urls): Propagate that information to the caller through
|
||||
struct urlpos.
|
||||
|
||||
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* hash.c (find_mapping): Return the next available mapping when
|
||||
|
@ -56,11 +56,11 @@ struct urlpos {
|
||||
|
||||
/* Information about the original link: */
|
||||
|
||||
unsigned int link_relative_p :1; /* was the link relative? */
|
||||
unsigned int link_complete_p :1; /* was the link complete (with the
|
||||
host name, etc.) */
|
||||
unsigned int link_base_p :1; /* was the link <base href=...> */
|
||||
unsigned int link_inline_p :1; /* needed to render the page. */
|
||||
unsigned int link_relative_p :1; /* the link was relative */
|
||||
unsigned int link_complete_p :1; /* the link was complete (had host name) */
|
||||
unsigned int link_base_p :1; /* the url came from <base href=...> */
|
||||
unsigned int link_inline_p :1; /* needed to render the page */
|
||||
unsigned int link_expect_html :1; /* expected to contain HTML */
|
||||
|
||||
unsigned int link_refresh_p :1; /* link was received from
|
||||
<meta http-equiv=refresh content=...> */
|
||||
|
@ -121,11 +121,19 @@ static struct known_tag {
|
||||
/* tag_url_attributes documents which attributes of which tags contain
|
||||
URLs to harvest. It is used by tag_find_urls. */
|
||||
|
||||
/* Defines for the FLAGS field; currently only one flag is defined. */
|
||||
/* Flag values for the FLAGS field of tag_url_attributes entries. */
|
||||
|
||||
/* This tag points to an external document not necessary for rendering this
|
||||
document (i.e. it's not an inlined image, stylesheet, etc.). */
|
||||
#define TUA_EXTERNAL 1
|
||||
/* The link is "inline", i.e. needs to be retrieved for this document
|
||||
to be correctly rendered. Inline links include inlined images,
|
||||
stylesheets, children frames, etc. */
|
||||
#define ATTR_INLINE 1
|
||||
|
||||
/* The link is expected to yield HTML contents. It's important not to
|
||||
try to follow HTML obtained by following e.g. <img src="...">
|
||||
regardless of content-type. Doing this causes infinite loops for
|
||||
"images" that return non-404 error pages with links to the same
|
||||
image. */
|
||||
#define ATTR_HTML 2
|
||||
|
||||
/* For tags handled by tag_find_urls: attributes that contain URLs to
|
||||
download. */
|
||||
@ -134,26 +142,26 @@ static struct {
|
||||
const char *attr_name;
|
||||
int flags;
|
||||
} tag_url_attributes[] = {
|
||||
{ TAG_A, "href", TUA_EXTERNAL },
|
||||
{ TAG_APPLET, "code", 0 },
|
||||
{ TAG_AREA, "href", TUA_EXTERNAL },
|
||||
{ TAG_BGSOUND, "src", 0 },
|
||||
{ TAG_BODY, "background", 0 },
|
||||
{ TAG_EMBED, "href", TUA_EXTERNAL },
|
||||
{ TAG_EMBED, "src", 0 },
|
||||
{ TAG_FIG, "src", 0 },
|
||||
{ TAG_FRAME, "src", 0 },
|
||||
{ TAG_IFRAME, "src", 0 },
|
||||
{ TAG_IMG, "href", 0 },
|
||||
{ TAG_IMG, "lowsrc", 0 },
|
||||
{ TAG_IMG, "src", 0 },
|
||||
{ TAG_INPUT, "src", 0 },
|
||||
{ TAG_LAYER, "src", 0 },
|
||||
{ TAG_OVERLAY, "src", 0 },
|
||||
{ TAG_SCRIPT, "src", 0 },
|
||||
{ TAG_TABLE, "background", 0 },
|
||||
{ TAG_TD, "background", 0 },
|
||||
{ TAG_TH, "background", 0 }
|
||||
{ TAG_A, "href", ATTR_HTML },
|
||||
{ TAG_APPLET, "code", ATTR_INLINE },
|
||||
{ TAG_AREA, "href", ATTR_HTML },
|
||||
{ TAG_BGSOUND, "src", ATTR_INLINE },
|
||||
{ TAG_BODY, "background", ATTR_INLINE },
|
||||
{ TAG_EMBED, "href", ATTR_HTML },
|
||||
{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
|
||||
{ TAG_FIG, "src", ATTR_INLINE },
|
||||
{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
|
||||
{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
|
||||
{ TAG_IMG, "href", ATTR_INLINE },
|
||||
{ TAG_IMG, "lowsrc", ATTR_INLINE },
|
||||
{ TAG_IMG, "src", ATTR_INLINE },
|
||||
{ TAG_INPUT, "src", ATTR_INLINE },
|
||||
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
|
||||
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
|
||||
{ TAG_SCRIPT, "src", ATTR_INLINE },
|
||||
{ TAG_TABLE, "background", ATTR_INLINE },
|
||||
{ TAG_TD, "background", ATTR_INLINE },
|
||||
{ TAG_TH, "background", ATTR_INLINE }
|
||||
};
|
||||
|
||||
/* The lists of interesting tags and attributes are built dynamically,
|
||||
@ -262,7 +270,7 @@ struct map_context {
|
||||
size. */
|
||||
|
||||
static struct urlpos *
|
||||
append_one_url (const char *link_uri, int inlinep,
|
||||
append_one_url (const char *link_uri,
|
||||
struct taginfo *tag, int attrind, struct map_context *ctx)
|
||||
{
|
||||
int link_has_scheme = url_has_scheme (link_uri);
|
||||
@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
|
||||
newel->url = url;
|
||||
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
|
||||
newel->size = tag->attrs[attrind].value_raw_size;
|
||||
newel->link_inline_p = inlinep;
|
||||
|
||||
/* A URL is relative if the host is not named, and the name does not
|
||||
start with `/'. */
|
||||
@ -392,9 +399,16 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
{
|
||||
if (0 == strcasecmp (tag->attrs[attrind].name,
|
||||
tag_url_attributes[i].attr_name))
|
||||
{
|
||||
struct urlpos *up = append_one_url (link, tag, attrind, ctx);
|
||||
if (up)
|
||||
{
|
||||
int flags = tag_url_attributes[i].flags;
|
||||
append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
|
||||
if (flags & ATTR_INLINE)
|
||||
up->link_inline_p = 1;
|
||||
if (flags & ATTR_HTML)
|
||||
up->link_expect_html = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
if (!newbase)
|
||||
return;
|
||||
|
||||
base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
|
||||
base_urlpos = append_one_url (newbase, tag, attrind, ctx);
|
||||
if (!base_urlpos)
|
||||
return;
|
||||
base_urlpos->ignore_when_downloading = 1;
|
||||
@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
char *action = find_attr (tag, "action", &attrind);
|
||||
if (action)
|
||||
{
|
||||
struct urlpos *action_urlpos = append_one_url (action, 0, tag,
|
||||
attrind, ctx);
|
||||
if (action_urlpos)
|
||||
action_urlpos->ignore_when_downloading = 1;
|
||||
struct urlpos *up = append_one_url (action, tag, attrind, ctx);
|
||||
if (up)
|
||||
up->ignore_when_downloading = 1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -457,12 +470,16 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
<link rel="shortcut icon" href="...">
|
||||
*/
|
||||
if (href)
|
||||
{
|
||||
struct urlpos *up = append_one_url (href, tag, attrind, ctx);
|
||||
if (up)
|
||||
{
|
||||
char *rel = find_attr (tag, "rel", NULL);
|
||||
int inlinep = (rel
|
||||
if (rel
|
||||
&& (0 == strcasecmp (rel, "stylesheet")
|
||||
|| 0 == strcasecmp (rel, "shortcut icon")));
|
||||
append_one_url (href, inlinep, tag, attrind, ctx);
|
||||
|| 0 == strcasecmp (rel, "shortcut icon")))
|
||||
up->link_inline_p = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
while (ISSPACE (*p))
|
||||
++p;
|
||||
|
||||
entry = append_one_url (p, 0, tag, attrind, ctx);
|
||||
entry = append_one_url (p, tag, attrind, ctx);
|
||||
if (entry)
|
||||
{
|
||||
entry->link_refresh_p = 1;
|
||||
|
39
src/recur.c
39
src/recur.c
@ -6,7 +6,7 @@ This file is part of GNU Wget.
|
||||
GNU Wget is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
(at your option) any later version.
|
||||
|
||||
GNU Wget is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
|
||||
/* Functions for maintaining the URL queue. */
|
||||
|
||||
struct queue_element {
|
||||
const char *url;
|
||||
const char *referer;
|
||||
int depth;
|
||||
struct queue_element *next;
|
||||
const char *url; /* the URL to download */
|
||||
const char *referer; /* the referring document */
|
||||
int depth; /* the depth */
|
||||
unsigned int html_allowed :1; /* whether the document is allowed to
|
||||
be treated as HTML. */
|
||||
|
||||
struct queue_element *next; /* next element in queue */
|
||||
};
|
||||
|
||||
struct url_queue {
|
||||
@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
|
||||
|
||||
static void
|
||||
url_enqueue (struct url_queue *queue,
|
||||
const char *url, const char *referer, int depth)
|
||||
const char *url, const char *referer, int depth, int html_allowed)
|
||||
{
|
||||
struct queue_element *qel = xmalloc (sizeof (*qel));
|
||||
qel->url = url;
|
||||
qel->referer = referer;
|
||||
qel->depth = depth;
|
||||
qel->html_allowed = html_allowed;
|
||||
qel->next = NULL;
|
||||
|
||||
++queue->count;
|
||||
@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
|
||||
|
||||
static int
|
||||
url_dequeue (struct url_queue *queue,
|
||||
const char **url, const char **referer, int *depth)
|
||||
const char **url, const char **referer, int *depth,
|
||||
int *html_allowed)
|
||||
{
|
||||
struct queue_element *qel = queue->head;
|
||||
|
||||
@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
|
||||
*url = qel->url;
|
||||
*referer = qel->referer;
|
||||
*depth = qel->depth;
|
||||
*html_allowed = qel->html_allowed;
|
||||
|
||||
--queue->count;
|
||||
|
||||
@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
||||
just URL so we enqueue the canonical form of the URL. */
|
||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
|
||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
|
||||
string_set_add (blacklist, start_url_parsed->url);
|
||||
|
||||
while (1)
|
||||
{
|
||||
int descend = 0;
|
||||
char *url, *referer, *file = NULL;
|
||||
int depth;
|
||||
int depth, html_allowed;
|
||||
boolean dash_p_leaf_HTML = FALSE;
|
||||
|
||||
if (downloaded_exceeds_quota ())
|
||||
@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
if (!url_dequeue (queue,
|
||||
(const char **)&url, (const char **)&referer,
|
||||
&depth))
|
||||
&depth, &html_allowed))
|
||||
break;
|
||||
|
||||
/* ...and download it. Note that this download is in most cases
|
||||
@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
|
||||
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
|
||||
url, file));
|
||||
|
||||
if (downloaded_html_set
|
||||
if (html_allowed
|
||||
&& downloaded_html_set
|
||||
&& string_set_contains (downloaded_html_set, file))
|
||||
descend = 1;
|
||||
}
|
||||
@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
|
||||
status = retrieve_url (url, &file, &redirected, referer, &dt);
|
||||
opt.recursive = oldrec;
|
||||
|
||||
if (file && status == RETROK
|
||||
if (html_allowed && file && status == RETROK
|
||||
&& (dt & RETROKF) && (dt & TEXTHTML))
|
||||
descend = 1;
|
||||
|
||||
@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
|
||||
blacklist))
|
||||
{
|
||||
url_enqueue (queue, xstrdup (child->url->url),
|
||||
xstrdup (url), depth + 1);
|
||||
xstrdup (url), depth + 1,
|
||||
child->link_expect_html);
|
||||
/* We blacklist the URL we have enqueued, because we
|
||||
don't want to enqueue (and hence download) the
|
||||
same URL twice. */
|
||||
@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
|
||||
now. */
|
||||
{
|
||||
char *d1, *d2;
|
||||
int d3;
|
||||
while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
|
||||
int d3, d4;
|
||||
while (url_dequeue (queue,
|
||||
(const char **)&d1, (const char **)&d2, &d3, &d4))
|
||||
{
|
||||
xfree (d1);
|
||||
FREE_MAYBE (d2);
|
||||
|
Loading…
Reference in New Issue
Block a user