1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Don't descend into HTML that was downloaded by following <img src=...>

and such.
This commit is contained in:
hniksic 2003-10-10 07:25:10 -07:00
parent 37e70109a4
commit 1b3cdef574
4 changed files with 95 additions and 59 deletions

View File

@@ -1,3 +1,13 @@
+2003-10-10  Hrvoje Niksic  <hniksic@xemacs.org>
+
+	* recur.c (retrieve_tree): Don't descend into documents that are
+	not expected to contain HTML, regardless of their content-type.
+
+	* html-url.c (tag_url_attributes): Record which attributes are
+	supposed to yield HTML links that can be followed.
+	(tag_find_urls): Propagate that information to the caller through
+	struct urlpos.
+
 2003-10-10  Hrvoje Niksic  <hniksic@xemacs.org>
 
 	* hash.c (find_mapping): Return the next available mapping when

View File

@@ -56,11 +56,11 @@ struct urlpos {
   /* Information about the original link: */
 
-  unsigned int link_relative_p	:1; /* was the link relative? */
-  unsigned int link_complete_p	:1; /* was the link complete (with the
-				       host name, etc.) */
-  unsigned int link_base_p	:1; /* was the link <base href=...> */
-  unsigned int link_inline_p	:1; /* needed to render the page. */
+  unsigned int link_relative_p	:1; /* the link was relative */
+  unsigned int link_complete_p	:1; /* the link was complete (had host name) */
+  unsigned int link_base_p	:1; /* the url came from <base href=...> */
+  unsigned int link_inline_p	:1; /* needed to render the page */
+  unsigned int link_expect_html	:1; /* expected to contain HTML */
 
   unsigned int link_refresh_p	:1; /* link was received from
 				       <meta http-equiv=refresh content=...> */

View File

@@ -121,11 +121,19 @@ static struct known_tag {
 /* tag_url_attributes documents which attributes of which tags contain
    URLs to harvest.  It is used by tag_find_urls.  */
 
-/* Defines for the FLAGS field; currently only one flag is defined.  */
-/* This tag points to an external document not necessary for rendering this
-   document (i.e. it's not an inlined image, stylesheet, etc.).  */
-#define TUA_EXTERNAL 1
+/* Defines for the FLAGS.  */
+
+/* The link is "inline", i.e. needs to be retrieved for this document
+   to be correctly rendered.  Inline links include inlined images,
+   stylesheets, children frames, etc.  */
+#define ATTR_INLINE 1
+
+/* The link is expected to yield HTML contents.  It's important not to
+   try to follow HTML obtained by following e.g. <img src="...">
+   regardless of content-type.  Doing this causes infinite loops for
+   "images" that return non-404 error pages with links to the same
+   image.  */
+#define ATTR_HTML 2
 
 /* For tags handled by tag_find_urls: attributes that contain URLs to
    download.  */
@@ -134,26 +142,26 @@ static struct {
   const char *attr_name;
   int flags;
 } tag_url_attributes[] = {
-  { TAG_A,		"href",		TUA_EXTERNAL },
-  { TAG_APPLET,		"code",		0 },
-  { TAG_AREA,		"href",		TUA_EXTERNAL },
-  { TAG_BGSOUND,	"src",		0 },
-  { TAG_BODY,		"background",	0 },
-  { TAG_EMBED,		"href",		TUA_EXTERNAL },
-  { TAG_EMBED,		"src",		0 },
-  { TAG_FIG,		"src",		0 },
-  { TAG_FRAME,		"src",		0 },
-  { TAG_IFRAME,		"src",		0 },
-  { TAG_IMG,		"href",		0 },
-  { TAG_IMG,		"lowsrc",	0 },
-  { TAG_IMG,		"src",		0 },
-  { TAG_INPUT,		"src",		0 },
-  { TAG_LAYER,		"src",		0 },
-  { TAG_OVERLAY,	"src",		0 },
-  { TAG_SCRIPT,		"src",		0 },
-  { TAG_TABLE,		"background",	0 },
-  { TAG_TD,		"background",	0 },
-  { TAG_TH,		"background",	0 }
+  { TAG_A,		"href",		ATTR_HTML },
+  { TAG_APPLET,		"code",		ATTR_INLINE },
+  { TAG_AREA,		"href",		ATTR_HTML },
+  { TAG_BGSOUND,	"src",		ATTR_INLINE },
+  { TAG_BODY,		"background",	ATTR_INLINE },
+  { TAG_EMBED,		"href",		ATTR_HTML },
+  { TAG_EMBED,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_FIG,		"src",		ATTR_INLINE },
+  { TAG_FRAME,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_IFRAME,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_IMG,		"href",		ATTR_INLINE },
+  { TAG_IMG,		"lowsrc",	ATTR_INLINE },
+  { TAG_IMG,		"src",		ATTR_INLINE },
+  { TAG_INPUT,		"src",		ATTR_INLINE },
+  { TAG_LAYER,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_OVERLAY,	"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_SCRIPT,		"src",		ATTR_INLINE },
+  { TAG_TABLE,		"background",	ATTR_INLINE },
+  { TAG_TD,		"background",	ATTR_INLINE },
+  { TAG_TH,		"background",	ATTR_INLINE }
 };
 
 /* The lists of interesting tags and attributes are built dynamically,
@@ -262,7 +270,7 @@ struct map_context {
    size.  */
 
 static struct urlpos *
-append_one_url (const char *link_uri, int inlinep,
+append_one_url (const char *link_uri,
 		struct taginfo *tag, int attrind, struct map_context *ctx)
 {
   int link_has_scheme = url_has_scheme (link_uri);
@@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
   newel->url = url;
   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
   newel->size = tag->attrs[attrind].value_raw_size;
-  newel->link_inline_p = inlinep;
 
   /* A URL is relative if the host is not named, and the name does not
      start with `/'.  */
@@ -393,8 +400,15 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 	  if (0 == strcasecmp (tag->attrs[attrind].name,
 			       tag_url_attributes[i].attr_name))
 	    {
-	      int flags = tag_url_attributes[i].flags;
-	      append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
+	      struct urlpos *up = append_one_url (link, tag, attrind, ctx);
+	      if (up)
+		{
+		  int flags = tag_url_attributes[i].flags;
+		  if (flags & ATTR_INLINE)
+		    up->link_inline_p = 1;
+		  if (flags & ATTR_HTML)
+		    up->link_expect_html = 1;
+		}
 	    }
 	}
     }
@@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
   if (!newbase)
     return;
 
-  base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
+  base_urlpos = append_one_url (newbase, tag, attrind, ctx);
   if (!base_urlpos)
     return;
   base_urlpos->ignore_when_downloading = 1;
@@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
   char *action = find_attr (tag, "action", &attrind);
   if (action)
     {
-      struct urlpos *action_urlpos = append_one_url (action, 0, tag,
-						     attrind, ctx);
-      if (action_urlpos)
-	action_urlpos->ignore_when_downloading = 1;
+      struct urlpos *up = append_one_url (action, tag, attrind, ctx);
+      if (up)
+	up->ignore_when_downloading = 1;
     }
 }
@@ -458,11 +471,15 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
    */
   if (href)
     {
-      char *rel = find_attr (tag, "rel", NULL);
-      int inlinep = (rel
-		     && (0 == strcasecmp (rel, "stylesheet")
-			 || 0 == strcasecmp (rel, "shortcut icon")));
-      append_one_url (href, inlinep, tag, attrind, ctx);
+      struct urlpos *up = append_one_url (href, tag, attrind, ctx);
+      if (up)
+	{
+	  char *rel = find_attr (tag, "rel", NULL);
+	  if (rel
+	      && (0 == strcasecmp (rel, "stylesheet")
+		  || 0 == strcasecmp (rel, "shortcut icon")))
+	    up->link_inline_p = 1;
+	}
     }
 }
@@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
       while (ISSPACE (*p))
 	++p;
 
-      entry = append_one_url (p, 0, tag, attrind, ctx);
+      entry = append_one_url (p, tag, attrind, ctx);
       if (entry)
 	{
 	  entry->link_refresh_p = 1;

View File

@@ -6,7 +6,7 @@ This file is part of GNU Wget.
 GNU Wget is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 
 GNU Wget is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
 /* Functions for maintaining the URL queue.  */
 
 struct queue_element {
-  const char *url;
-  const char *referer;
-  int depth;
-  struct queue_element *next;
+  const char *url;		/* the URL to download */
+  const char *referer;		/* the referring document */
+  int depth;			/* the depth */
+  unsigned int html_allowed :1;	/* whether the document is allowed to
+				   be treated as HTML. */
+  struct queue_element *next;	/* next element in queue */
 };
 
 struct url_queue {
@@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
 
 static void
 url_enqueue (struct url_queue *queue,
-	     const char *url, const char *referer, int depth)
+	     const char *url, const char *referer, int depth, int html_allowed)
 {
   struct queue_element *qel = xmalloc (sizeof (*qel));
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
+  qel->html_allowed = html_allowed;
   qel->next = NULL;
 
   ++queue->count;
@@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
 
 static int
 url_dequeue (struct url_queue *queue,
-	     const char **url, const char **referer, int *depth)
+	     const char **url, const char **referer, int *depth,
+	     int *html_allowed)
 {
   struct queue_element *qel = queue->head;
@@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
+  *html_allowed = qel->html_allowed;
 
   --queue->count;
@@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
      just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
   string_set_add (blacklist, start_url_parsed->url);
 
   while (1)
     {
       int descend = 0;
       char *url, *referer, *file = NULL;
-      int depth;
+      int depth, html_allowed;
       boolean dash_p_leaf_HTML = FALSE;
 
       if (downloaded_exceeds_quota ())
@@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
       if (!url_dequeue (queue,
 			(const char **)&url, (const char **)&referer,
-			&depth))
+			&depth, &html_allowed))
 	break;
 
       /* ...and download it.  Note that this download is in most cases
@@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
 	  DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 		   url, file));
-	  if (downloaded_html_set
+	  if (html_allowed
+	      && downloaded_html_set
 	      && string_set_contains (downloaded_html_set, file))
 	    descend = 1;
 	}
@@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
 	  status = retrieve_url (url, &file, &redirected, referer, &dt);
 	  opt.recursive = oldrec;
 
-	  if (file && status == RETROK
+	  if (html_allowed && file && status == RETROK
 	      && (dt & RETROKF) && (dt & TEXTHTML))
 	    descend = 1;
@@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
 				  blacklist))
 		  {
 		    url_enqueue (queue, xstrdup (child->url->url),
-				 xstrdup (url), depth + 1);
+				 xstrdup (url), depth + 1,
+				 child->link_expect_html);
 		    /* We blacklist the URL we have enqueued, because we
 		       don't want to enqueue (and hence download) the
 		       same URL twice.  */
@@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
      now.  */
   {
     char *d1, *d2;
-    int d3;
-    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
+    int d3, d4;
+    while (url_dequeue (queue,
+			(const char **)&d1, (const char **)&d2, &d3, &d4))
       {
 	xfree (d1);
 	FREE_MAYBE (d2);