mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Don't descend into HTML that was downloaded by following <img src=...>
and such.
This commit is contained in:
parent
37e70109a4
commit
1b3cdef574
@ -1,3 +1,13 @@
|
|||||||
|
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
|
* recur.c (retrieve_tree): Don't descend into documents that are
|
||||||
|
not expected to contain HTML, regardless of their content-type.
|
||||||
|
|
||||||
|
* html-url.c (tag_url_attributes): Record which attributes are
|
||||||
|
supposed to yield HTML links that can be followed.
|
||||||
|
(tag_find_urls): Propagate that information to the caller through
|
||||||
|
struct urlpos.
|
||||||
|
|
||||||
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
|
2003-10-10 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
* hash.c (find_mapping): Return the next available mapping when
|
* hash.c (find_mapping): Return the next available mapping when
|
||||||
|
@ -56,11 +56,11 @@ struct urlpos {
|
|||||||
|
|
||||||
/* Information about the original link: */
|
/* Information about the original link: */
|
||||||
|
|
||||||
unsigned int link_relative_p :1; /* was the link relative? */
|
unsigned int link_relative_p :1; /* the link was relative */
|
||||||
unsigned int link_complete_p :1; /* was the link complete (with the
|
unsigned int link_complete_p :1; /* the link was complete (had host name) */
|
||||||
host name, etc.) */
|
unsigned int link_base_p :1; /* the url came from <base href=...> */
|
||||||
unsigned int link_base_p :1; /* was the link <base href=...> */
|
unsigned int link_inline_p :1; /* needed to render the page */
|
||||||
unsigned int link_inline_p :1; /* needed to render the page. */
|
unsigned int link_expect_html :1; /* expected to contain HTML */
|
||||||
|
|
||||||
unsigned int link_refresh_p :1; /* link was received from
|
unsigned int link_refresh_p :1; /* link was received from
|
||||||
<meta http-equiv=refresh content=...> */
|
<meta http-equiv=refresh content=...> */
|
||||||
|
@ -121,11 +121,19 @@ static struct known_tag {
|
|||||||
/* tag_url_attributes documents which attributes of which tags contain
|
/* tag_url_attributes documents which attributes of which tags contain
|
||||||
URLs to harvest. It is used by tag_find_urls. */
|
URLs to harvest. It is used by tag_find_urls. */
|
||||||
|
|
||||||
/* Defines for the FLAGS field; currently only one flag is defined. */
|
/* Defines for the FLAGS field. */
|
||||||
|
|
||||||
/* This tag points to an external document not necessary for rendering this
|
/* The link is "inline", i.e. needs to be retrieved for this document
|
||||||
document (i.e. it's not an inlined image, stylesheet, etc.). */
|
to be correctly rendered. Inline links include inlined images,
|
||||||
#define TUA_EXTERNAL 1
|
stylesheets, child frames, etc. */
|
||||||
|
#define ATTR_INLINE 1
|
||||||
|
|
||||||
|
/* The link is expected to yield HTML contents. It's important not to
|
||||||
|
try to follow HTML obtained by following e.g. <img src="...">
|
||||||
|
regardless of content-type. Doing this causes infinite loops for
|
||||||
|
"images" that return non-404 error pages with links to the same
|
||||||
|
image. */
|
||||||
|
#define ATTR_HTML 2
|
||||||
|
|
||||||
/* For tags handled by tag_find_urls: attributes that contain URLs to
|
/* For tags handled by tag_find_urls: attributes that contain URLs to
|
||||||
download. */
|
download. */
|
||||||
@ -134,26 +142,26 @@ static struct {
|
|||||||
const char *attr_name;
|
const char *attr_name;
|
||||||
int flags;
|
int flags;
|
||||||
} tag_url_attributes[] = {
|
} tag_url_attributes[] = {
|
||||||
{ TAG_A, "href", TUA_EXTERNAL },
|
{ TAG_A, "href", ATTR_HTML },
|
||||||
{ TAG_APPLET, "code", 0 },
|
{ TAG_APPLET, "code", ATTR_INLINE },
|
||||||
{ TAG_AREA, "href", TUA_EXTERNAL },
|
{ TAG_AREA, "href", ATTR_HTML },
|
||||||
{ TAG_BGSOUND, "src", 0 },
|
{ TAG_BGSOUND, "src", ATTR_INLINE },
|
||||||
{ TAG_BODY, "background", 0 },
|
{ TAG_BODY, "background", ATTR_INLINE },
|
||||||
{ TAG_EMBED, "href", TUA_EXTERNAL },
|
{ TAG_EMBED, "href", ATTR_HTML },
|
||||||
{ TAG_EMBED, "src", 0 },
|
{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
|
||||||
{ TAG_FIG, "src", 0 },
|
{ TAG_FIG, "src", ATTR_INLINE },
|
||||||
{ TAG_FRAME, "src", 0 },
|
{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
|
||||||
{ TAG_IFRAME, "src", 0 },
|
{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
|
||||||
{ TAG_IMG, "href", 0 },
|
{ TAG_IMG, "href", ATTR_INLINE },
|
||||||
{ TAG_IMG, "lowsrc", 0 },
|
{ TAG_IMG, "lowsrc", ATTR_INLINE },
|
||||||
{ TAG_IMG, "src", 0 },
|
{ TAG_IMG, "src", ATTR_INLINE },
|
||||||
{ TAG_INPUT, "src", 0 },
|
{ TAG_INPUT, "src", ATTR_INLINE },
|
||||||
{ TAG_LAYER, "src", 0 },
|
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
|
||||||
{ TAG_OVERLAY, "src", 0 },
|
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
|
||||||
{ TAG_SCRIPT, "src", 0 },
|
{ TAG_SCRIPT, "src", ATTR_INLINE },
|
||||||
{ TAG_TABLE, "background", 0 },
|
{ TAG_TABLE, "background", ATTR_INLINE },
|
||||||
{ TAG_TD, "background", 0 },
|
{ TAG_TD, "background", ATTR_INLINE },
|
||||||
{ TAG_TH, "background", 0 }
|
{ TAG_TH, "background", ATTR_INLINE }
|
||||||
};
|
};
|
||||||
|
|
||||||
/* The lists of interesting tags and attributes are built dynamically,
|
/* The lists of interesting tags and attributes are built dynamically,
|
||||||
@ -262,7 +270,7 @@ struct map_context {
|
|||||||
size. */
|
size. */
|
||||||
|
|
||||||
static struct urlpos *
|
static struct urlpos *
|
||||||
append_one_url (const char *link_uri, int inlinep,
|
append_one_url (const char *link_uri,
|
||||||
struct taginfo *tag, int attrind, struct map_context *ctx)
|
struct taginfo *tag, int attrind, struct map_context *ctx)
|
||||||
{
|
{
|
||||||
int link_has_scheme = url_has_scheme (link_uri);
|
int link_has_scheme = url_has_scheme (link_uri);
|
||||||
@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
|
|||||||
newel->url = url;
|
newel->url = url;
|
||||||
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
|
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
|
||||||
newel->size = tag->attrs[attrind].value_raw_size;
|
newel->size = tag->attrs[attrind].value_raw_size;
|
||||||
newel->link_inline_p = inlinep;
|
|
||||||
|
|
||||||
/* A URL is relative if the host is not named, and the name does not
|
/* A URL is relative if the host is not named, and the name does not
|
||||||
start with `/'. */
|
start with `/'. */
|
||||||
@ -392,9 +399,16 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
{
|
{
|
||||||
if (0 == strcasecmp (tag->attrs[attrind].name,
|
if (0 == strcasecmp (tag->attrs[attrind].name,
|
||||||
tag_url_attributes[i].attr_name))
|
tag_url_attributes[i].attr_name))
|
||||||
|
{
|
||||||
|
struct urlpos *up = append_one_url (link, tag, attrind, ctx);
|
||||||
|
if (up)
|
||||||
{
|
{
|
||||||
int flags = tag_url_attributes[i].flags;
|
int flags = tag_url_attributes[i].flags;
|
||||||
append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
|
if (flags & ATTR_INLINE)
|
||||||
|
up->link_inline_p = 1;
|
||||||
|
if (flags & ATTR_HTML)
|
||||||
|
up->link_expect_html = 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
if (!newbase)
|
if (!newbase)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
|
base_urlpos = append_one_url (newbase, tag, attrind, ctx);
|
||||||
if (!base_urlpos)
|
if (!base_urlpos)
|
||||||
return;
|
return;
|
||||||
base_urlpos->ignore_when_downloading = 1;
|
base_urlpos->ignore_when_downloading = 1;
|
||||||
@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
char *action = find_attr (tag, "action", &attrind);
|
char *action = find_attr (tag, "action", &attrind);
|
||||||
if (action)
|
if (action)
|
||||||
{
|
{
|
||||||
struct urlpos *action_urlpos = append_one_url (action, 0, tag,
|
struct urlpos *up = append_one_url (action, tag, attrind, ctx);
|
||||||
attrind, ctx);
|
if (up)
|
||||||
if (action_urlpos)
|
up->ignore_when_downloading = 1;
|
||||||
action_urlpos->ignore_when_downloading = 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -457,12 +470,16 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
<link rel="shortcut icon" href="...">
|
<link rel="shortcut icon" href="...">
|
||||||
*/
|
*/
|
||||||
if (href)
|
if (href)
|
||||||
|
{
|
||||||
|
struct urlpos *up = append_one_url (href, tag, attrind, ctx);
|
||||||
|
if (up)
|
||||||
{
|
{
|
||||||
char *rel = find_attr (tag, "rel", NULL);
|
char *rel = find_attr (tag, "rel", NULL);
|
||||||
int inlinep = (rel
|
if (rel
|
||||||
&& (0 == strcasecmp (rel, "stylesheet")
|
&& (0 == strcasecmp (rel, "stylesheet")
|
||||||
|| 0 == strcasecmp (rel, "shortcut icon")));
|
|| 0 == strcasecmp (rel, "shortcut icon")))
|
||||||
append_one_url (href, inlinep, tag, attrind, ctx);
|
up->link_inline_p = 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
|||||||
while (ISSPACE (*p))
|
while (ISSPACE (*p))
|
||||||
++p;
|
++p;
|
||||||
|
|
||||||
entry = append_one_url (p, 0, tag, attrind, ctx);
|
entry = append_one_url (p, tag, attrind, ctx);
|
||||||
if (entry)
|
if (entry)
|
||||||
{
|
{
|
||||||
entry->link_refresh_p = 1;
|
entry->link_refresh_p = 1;
|
||||||
|
39
src/recur.c
39
src/recur.c
@ -6,7 +6,7 @@ This file is part of GNU Wget.
|
|||||||
GNU Wget is free software; you can redistribute it and/or modify
|
GNU Wget is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
(at your option) any later version.
|
(at your option) any later version.
|
||||||
|
|
||||||
GNU Wget is distributed in the hope that it will be useful,
|
GNU Wget is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
|
|||||||
/* Functions for maintaining the URL queue. */
|
/* Functions for maintaining the URL queue. */
|
||||||
|
|
||||||
struct queue_element {
|
struct queue_element {
|
||||||
const char *url;
|
const char *url; /* the URL to download */
|
||||||
const char *referer;
|
const char *referer; /* the referring document */
|
||||||
int depth;
|
int depth; /* the depth */
|
||||||
struct queue_element *next;
|
unsigned int html_allowed :1; /* whether the document is allowed to
|
||||||
|
be treated as HTML. */
|
||||||
|
|
||||||
|
struct queue_element *next; /* next element in queue */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct url_queue {
|
struct url_queue {
|
||||||
@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
|
|||||||
|
|
||||||
static void
|
static void
|
||||||
url_enqueue (struct url_queue *queue,
|
url_enqueue (struct url_queue *queue,
|
||||||
const char *url, const char *referer, int depth)
|
const char *url, const char *referer, int depth, int html_allowed)
|
||||||
{
|
{
|
||||||
struct queue_element *qel = xmalloc (sizeof (*qel));
|
struct queue_element *qel = xmalloc (sizeof (*qel));
|
||||||
qel->url = url;
|
qel->url = url;
|
||||||
qel->referer = referer;
|
qel->referer = referer;
|
||||||
qel->depth = depth;
|
qel->depth = depth;
|
||||||
|
qel->html_allowed = html_allowed;
|
||||||
qel->next = NULL;
|
qel->next = NULL;
|
||||||
|
|
||||||
++queue->count;
|
++queue->count;
|
||||||
@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
|
|||||||
|
|
||||||
static int
|
static int
|
||||||
url_dequeue (struct url_queue *queue,
|
url_dequeue (struct url_queue *queue,
|
||||||
const char **url, const char **referer, int *depth)
|
const char **url, const char **referer, int *depth,
|
||||||
|
int *html_allowed)
|
||||||
{
|
{
|
||||||
struct queue_element *qel = queue->head;
|
struct queue_element *qel = queue->head;
|
||||||
|
|
||||||
@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
|
|||||||
*url = qel->url;
|
*url = qel->url;
|
||||||
*referer = qel->referer;
|
*referer = qel->referer;
|
||||||
*depth = qel->depth;
|
*depth = qel->depth;
|
||||||
|
*html_allowed = qel->html_allowed;
|
||||||
|
|
||||||
--queue->count;
|
--queue->count;
|
||||||
|
|
||||||
@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
|
|||||||
|
|
||||||
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
||||||
just URL so we enqueue the canonical form of the URL. */
|
just URL so we enqueue the canonical form of the URL. */
|
||||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
|
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
|
||||||
string_set_add (blacklist, start_url_parsed->url);
|
string_set_add (blacklist, start_url_parsed->url);
|
||||||
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
int descend = 0;
|
int descend = 0;
|
||||||
char *url, *referer, *file = NULL;
|
char *url, *referer, *file = NULL;
|
||||||
int depth;
|
int depth, html_allowed;
|
||||||
boolean dash_p_leaf_HTML = FALSE;
|
boolean dash_p_leaf_HTML = FALSE;
|
||||||
|
|
||||||
if (downloaded_exceeds_quota ())
|
if (downloaded_exceeds_quota ())
|
||||||
@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
|
|||||||
|
|
||||||
if (!url_dequeue (queue,
|
if (!url_dequeue (queue,
|
||||||
(const char **)&url, (const char **)&referer,
|
(const char **)&url, (const char **)&referer,
|
||||||
&depth))
|
&depth, &html_allowed))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* ...and download it. Note that this download is in most cases
|
/* ...and download it. Note that this download is in most cases
|
||||||
@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
|
|||||||
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
|
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
|
||||||
url, file));
|
url, file));
|
||||||
|
|
||||||
if (downloaded_html_set
|
if (html_allowed
|
||||||
|
&& downloaded_html_set
|
||||||
&& string_set_contains (downloaded_html_set, file))
|
&& string_set_contains (downloaded_html_set, file))
|
||||||
descend = 1;
|
descend = 1;
|
||||||
}
|
}
|
||||||
@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
|
|||||||
status = retrieve_url (url, &file, &redirected, referer, &dt);
|
status = retrieve_url (url, &file, &redirected, referer, &dt);
|
||||||
opt.recursive = oldrec;
|
opt.recursive = oldrec;
|
||||||
|
|
||||||
if (file && status == RETROK
|
if (html_allowed && file && status == RETROK
|
||||||
&& (dt & RETROKF) && (dt & TEXTHTML))
|
&& (dt & RETROKF) && (dt & TEXTHTML))
|
||||||
descend = 1;
|
descend = 1;
|
||||||
|
|
||||||
@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
|
|||||||
blacklist))
|
blacklist))
|
||||||
{
|
{
|
||||||
url_enqueue (queue, xstrdup (child->url->url),
|
url_enqueue (queue, xstrdup (child->url->url),
|
||||||
xstrdup (url), depth + 1);
|
xstrdup (url), depth + 1,
|
||||||
|
child->link_expect_html);
|
||||||
/* We blacklist the URL we have enqueued, because we
|
/* We blacklist the URL we have enqueued, because we
|
||||||
don't want to enqueue (and hence download) the
|
don't want to enqueue (and hence download) the
|
||||||
same URL twice. */
|
same URL twice. */
|
||||||
@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
|
|||||||
now. */
|
now. */
|
||||||
{
|
{
|
||||||
char *d1, *d2;
|
char *d1, *d2;
|
||||||
int d3;
|
int d3, d4;
|
||||||
while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
|
while (url_dequeue (queue,
|
||||||
|
(const char **)&d1, (const char **)&d2, &d3, &d4))
|
||||||
{
|
{
|
||||||
xfree (d1);
|
xfree (d1);
|
||||||
FREE_MAYBE (d2);
|
FREE_MAYBE (d2);
|
||||||
|
Loading…
Reference in New Issue
Block a user