1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Don't descend into HTML that was downloaded by following <img src=...> and such.
This commit is contained in:
hniksic 2003-10-10 07:25:10 -07:00
parent 37e70109a4
commit 1b3cdef574
4 changed files with 95 additions and 59 deletions

View File

@ -1,3 +1,13 @@
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
* recur.c (retrieve_tree): Don't descend into documents that are
not expected to contain HTML, regardless of their content-type.
* html-url.c (tag_url_attributes): Record which attributes are
supposed to yield HTML links that can be followed.
(tag_find_urls): Propagate that information to the caller through
struct urlpos.
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
* hash.c (find_mapping): Return the next available mapping when

View File

@ -56,11 +56,11 @@ struct urlpos {
/* Information about the original link: */
unsigned int link_relative_p :1; /* was the link relative? */
unsigned int link_complete_p :1; /* was the link complete (with the
host name, etc.) */
unsigned int link_base_p :1; /* was the link <base href=...> */
unsigned int link_inline_p :1; /* needed to render the page. */
unsigned int link_relative_p :1; /* the link was relative */
unsigned int link_complete_p :1; /* the link was complete (had host name) */
unsigned int link_base_p :1; /* the url came from <base href=...> */
unsigned int link_inline_p :1; /* needed to render the page */
unsigned int link_expect_html :1; /* expected to contain HTML */
unsigned int link_refresh_p :1; /* link was received from
<meta http-equiv=refresh content=...> */

View File

@ -121,11 +121,19 @@ static struct known_tag {
/* tag_url_attributes documents which attributes of which tags contain
URLs to harvest. It is used by tag_find_urls. */
/* Defines for the FLAGS field; currently only one flag is defined. */
/* Defines for the FLAGS. */
/* This tag points to an external document not necessary for rendering this
document (i.e. it's not an inlined image, stylesheet, etc.). */
#define TUA_EXTERNAL 1
/* The link is "inline", i.e. needs to be retrieved for this document
to be correctly rendered. Inline links include inlined images,
stylesheets, children frames, etc. */
#define ATTR_INLINE 1
/* The link is expected to yield HTML contents. It's important not to
try to follow HTML obtained by following e.g. <img src="...">
regardless of content-type. Doing this causes infinite loops for
"images" that return non-404 error pages with links to the same
image. */
#define ATTR_HTML 2
/* For tags handled by tag_find_urls: attributes that contain URLs to
download. */
@ -134,26 +142,26 @@ static struct {
const char *attr_name;
int flags;
} tag_url_attributes[] = {
{ TAG_A, "href", TUA_EXTERNAL },
{ TAG_APPLET, "code", 0 },
{ TAG_AREA, "href", TUA_EXTERNAL },
{ TAG_BGSOUND, "src", 0 },
{ TAG_BODY, "background", 0 },
{ TAG_EMBED, "href", TUA_EXTERNAL },
{ TAG_EMBED, "src", 0 },
{ TAG_FIG, "src", 0 },
{ TAG_FRAME, "src", 0 },
{ TAG_IFRAME, "src", 0 },
{ TAG_IMG, "href", 0 },
{ TAG_IMG, "lowsrc", 0 },
{ TAG_IMG, "src", 0 },
{ TAG_INPUT, "src", 0 },
{ TAG_LAYER, "src", 0 },
{ TAG_OVERLAY, "src", 0 },
{ TAG_SCRIPT, "src", 0 },
{ TAG_TABLE, "background", 0 },
{ TAG_TD, "background", 0 },
{ TAG_TH, "background", 0 }
{ TAG_A, "href", ATTR_HTML },
{ TAG_APPLET, "code", ATTR_INLINE },
{ TAG_AREA, "href", ATTR_HTML },
{ TAG_BGSOUND, "src", ATTR_INLINE },
{ TAG_BODY, "background", ATTR_INLINE },
{ TAG_EMBED, "href", ATTR_HTML },
{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_FIG, "src", ATTR_INLINE },
{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_IMG, "href", ATTR_INLINE },
{ TAG_IMG, "lowsrc", ATTR_INLINE },
{ TAG_IMG, "src", ATTR_INLINE },
{ TAG_INPUT, "src", ATTR_INLINE },
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_SCRIPT, "src", ATTR_INLINE },
{ TAG_TABLE, "background", ATTR_INLINE },
{ TAG_TD, "background", ATTR_INLINE },
{ TAG_TH, "background", ATTR_INLINE }
};
/* The lists of interesting tags and attributes are built dynamically,
@ -262,7 +270,7 @@ struct map_context {
size. */
static struct urlpos *
append_one_url (const char *link_uri, int inlinep,
append_one_url (const char *link_uri,
struct taginfo *tag, int attrind, struct map_context *ctx)
{
int link_has_scheme = url_has_scheme (link_uri);
@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
newel->url = url;
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
newel->size = tag->attrs[attrind].value_raw_size;
newel->link_inline_p = inlinep;
/* A URL is relative if the host is not named, and the name does not
start with `/'. */
@ -392,9 +399,16 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
{
if (0 == strcasecmp (tag->attrs[attrind].name,
tag_url_attributes[i].attr_name))
{
struct urlpos *up = append_one_url (link, tag, attrind, ctx);
if (up)
{
int flags = tag_url_attributes[i].flags;
append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
if (flags & ATTR_INLINE)
up->link_inline_p = 1;
if (flags & ATTR_HTML)
up->link_expect_html = 1;
}
}
}
}
@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
if (!newbase)
return;
base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
base_urlpos = append_one_url (newbase, tag, attrind, ctx);
if (!base_urlpos)
return;
base_urlpos->ignore_when_downloading = 1;
@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
char *action = find_attr (tag, "action", &attrind);
if (action)
{
struct urlpos *action_urlpos = append_one_url (action, 0, tag,
attrind, ctx);
if (action_urlpos)
action_urlpos->ignore_when_downloading = 1;
struct urlpos *up = append_one_url (action, tag, attrind, ctx);
if (up)
up->ignore_when_downloading = 1;
}
}
@ -457,12 +470,16 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
<link rel="shortcut icon" href="...">
*/
if (href)
{
struct urlpos *up = append_one_url (href, tag, attrind, ctx);
if (up)
{
char *rel = find_attr (tag, "rel", NULL);
int inlinep = (rel
if (rel
&& (0 == strcasecmp (rel, "stylesheet")
|| 0 == strcasecmp (rel, "shortcut icon")));
append_one_url (href, inlinep, tag, attrind, ctx);
|| 0 == strcasecmp (rel, "shortcut icon")))
up->link_inline_p = 1;
}
}
}
@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
while (ISSPACE (*p))
++p;
entry = append_one_url (p, 0, tag, attrind, ctx);
entry = append_one_url (p, tag, attrind, ctx);
if (entry)
{
entry->link_refresh_p = 1;

View File

@ -6,7 +6,7 @@ This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
/* Functions for maintaining the URL queue. */
struct queue_element {
const char *url;
const char *referer;
int depth;
struct queue_element *next;
const char *url; /* the URL to download */
const char *referer; /* the referring document */
int depth; /* the depth */
unsigned int html_allowed :1; /* whether the document is allowed to
be treated as HTML. */
struct queue_element *next; /* next element in queue */
};
struct url_queue {
@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
static void
url_enqueue (struct url_queue *queue,
const char *url, const char *referer, int depth)
const char *url, const char *referer, int depth, int html_allowed)
{
struct queue_element *qel = xmalloc (sizeof (*qel));
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
qel->next = NULL;
++queue->count;
@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
static int
url_dequeue (struct url_queue *queue,
const char **url, const char **referer, int *depth)
const char **url, const char **referer, int *depth,
int *html_allowed)
{
struct queue_element *qel = queue->head;
@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
*html_allowed = qel->html_allowed;
--queue->count;
@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
string_set_add (blacklist, start_url_parsed->url);
while (1)
{
int descend = 0;
char *url, *referer, *file = NULL;
int depth;
int depth, html_allowed;
boolean dash_p_leaf_HTML = FALSE;
if (downloaded_exceeds_quota ())
@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
if (!url_dequeue (queue,
(const char **)&url, (const char **)&referer,
&depth))
&depth, &html_allowed))
break;
/* ...and download it. Note that this download is in most cases
@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
url, file));
if (downloaded_html_set
if (html_allowed
&& downloaded_html_set
&& string_set_contains (downloaded_html_set, file))
descend = 1;
}
@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
status = retrieve_url (url, &file, &redirected, referer, &dt);
opt.recursive = oldrec;
if (file && status == RETROK
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
descend = 1;
@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
blacklist))
{
url_enqueue (queue, xstrdup (child->url->url),
xstrdup (url), depth + 1);
xstrdup (url), depth + 1,
child->link_expect_html);
/* We blacklist the URL we have enqueued, because we
don't want to enqueue (and hence download) the
same URL twice. */
@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
now. */
{
char *d1, *d2;
int d3;
while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
int d3, d4;
while (url_dequeue (queue,
(const char **)&d1, (const char **)&d2, &d3, &d4))
{
xfree (d1);
FREE_MAYBE (d2);