1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Don't descend into HTML that was downloaded by following <img src=...>

and such.
This commit is contained in:
hniksic 2003-10-10 07:25:10 -07:00
parent 37e70109a4
commit 1b3cdef574
4 changed files with 95 additions and 59 deletions

View File

@@ -1,3 +1,13 @@
+2003-10-10  Hrvoje Niksic  <hniksic@xemacs.org>
+
+	* recur.c (retrieve_tree): Don't descend into documents that are
+	not expected to contain HTML, regardless of their content-type.
+
+	* html-url.c (tag_url_attributes): Record which attributes are
+	supposed to yield HTML links that can be followed.
+	(tag_find_urls): Propagate that information to the caller through
+	struct urlpos.
+
 2003-10-10  Hrvoje Niksic  <hniksic@xemacs.org>
 
 	* hash.c (find_mapping): Return the next available mapping when

View File

@@ -56,11 +56,11 @@ struct urlpos {
   /* Information about the original link: */
 
-  unsigned int link_relative_p	:1; /* was the link relative? */
-  unsigned int link_complete_p	:1; /* was the link complete (with the
-				       host name, etc.) */
-  unsigned int link_base_p	:1; /* was the link <base href=...> */
-  unsigned int link_inline_p	:1; /* needed to render the page. */
+  unsigned int link_relative_p	:1; /* the link was relative */
+  unsigned int link_complete_p	:1; /* the link was complete (had host name) */
+  unsigned int link_base_p	:1; /* the url came from <base href=...> */
+  unsigned int link_inline_p	:1; /* needed to render the page */
+  unsigned int link_expect_html	:1; /* expected to contain HTML */
 
   unsigned int link_refresh_p	:1; /* link was received from
 				       <meta http-equiv=refresh content=...> */

View File

@@ -121,11 +121,19 @@ static struct known_tag {
 /* tag_url_attributes documents which attributes of which tags contain
    URLs to harvest.  It is used by tag_find_urls.  */
 
-/* Defines for the FLAGS field; currently only one flag is defined.  */
-/* This tag points to an external document not necessary for rendering this
-   document (i.e. it's not an inlined image, stylesheet, etc.).  */
-#define TUA_EXTERNAL 1
+/* Defines for the FLAGS.  */
+
+/* The link is "inline", i.e. needs to be retrieved for this document
+   to be correctly rendered.  Inline links include inlined images,
+   stylesheets, children frames, etc.  */
+#define ATTR_INLINE 1
+
+/* The link is expected to yield HTML contents.  It's important not to
+   try to follow HTML obtained by following e.g. <img src="...">
+   regardless of content-type.  Doing this causes infinite loops for
+   "images" that return non-404 error pages with links to the same
+   image.  */
+#define ATTR_HTML 2
 
 /* For tags handled by tag_find_urls: attributes that contain URLs to
    download.  */
@@ -134,26 +142,26 @@ static struct {
   const char *attr_name;
   int flags;
 } tag_url_attributes[] = {
-  { TAG_A,		"href",		TUA_EXTERNAL },
-  { TAG_APPLET,		"code",		0 },
-  { TAG_AREA,		"href",		TUA_EXTERNAL },
-  { TAG_BGSOUND,	"src",		0 },
-  { TAG_BODY,		"background",	0 },
-  { TAG_EMBED,		"href",		TUA_EXTERNAL },
-  { TAG_EMBED,		"src",		0 },
-  { TAG_FIG,		"src",		0 },
-  { TAG_FRAME,		"src",		0 },
-  { TAG_IFRAME,		"src",		0 },
-  { TAG_IMG,		"href",		0 },
-  { TAG_IMG,		"lowsrc",	0 },
-  { TAG_IMG,		"src",		0 },
-  { TAG_INPUT,		"src",		0 },
-  { TAG_LAYER,		"src",		0 },
-  { TAG_OVERLAY,	"src",		0 },
-  { TAG_SCRIPT,		"src",		0 },
-  { TAG_TABLE,		"background",	0 },
-  { TAG_TD,		"background",	0 },
-  { TAG_TH,		"background",	0 }
+  { TAG_A,		"href",		ATTR_HTML },
+  { TAG_APPLET,		"code",		ATTR_INLINE },
+  { TAG_AREA,		"href",		ATTR_HTML },
+  { TAG_BGSOUND,	"src",		ATTR_INLINE },
+  { TAG_BODY,		"background",	ATTR_INLINE },
+  { TAG_EMBED,		"href",		ATTR_HTML },
+  { TAG_EMBED,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_FIG,		"src",		ATTR_INLINE },
+  { TAG_FRAME,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_IFRAME,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_IMG,		"href",		ATTR_INLINE },
+  { TAG_IMG,		"lowsrc",	ATTR_INLINE },
+  { TAG_IMG,		"src",		ATTR_INLINE },
+  { TAG_INPUT,		"src",		ATTR_INLINE },
+  { TAG_LAYER,		"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_OVERLAY,	"src",		ATTR_INLINE | ATTR_HTML },
+  { TAG_SCRIPT,		"src",		ATTR_INLINE },
+  { TAG_TABLE,		"background",	ATTR_INLINE },
+  { TAG_TD,		"background",	ATTR_INLINE },
+  { TAG_TH,		"background",	ATTR_INLINE }
 };
 
 /* The lists of interesting tags and attributes are built dynamically,
@@ -262,7 +270,7 @@ struct map_context {
    size.  */
 
 static struct urlpos *
-append_one_url (const char *link_uri, int inlinep,
+append_one_url (const char *link_uri,
 		struct taginfo *tag, int attrind, struct map_context *ctx)
 {
   int link_has_scheme = url_has_scheme (link_uri);
@@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
   newel->url = url;
   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
   newel->size = tag->attrs[attrind].value_raw_size;
-  newel->link_inline_p = inlinep;
 
   /* A URL is relative if the host is not named, and the name does not
      start with `/'.  */
@@ -393,8 +400,15 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
 	  if (0 == strcasecmp (tag->attrs[attrind].name,
 			       tag_url_attributes[i].attr_name))
 	    {
-	      int flags = tag_url_attributes[i].flags;
-	      append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
+	      struct urlpos *up = append_one_url (link, tag, attrind, ctx);
+	      if (up)
+		{
+		  int flags = tag_url_attributes[i].flags;
+		  if (flags & ATTR_INLINE)
+		    up->link_inline_p = 1;
+		  if (flags & ATTR_HTML)
+		    up->link_expect_html = 1;
+		}
 	    }
 	}
     }
@@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
   if (!newbase)
     return;
 
-  base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
+  base_urlpos = append_one_url (newbase, tag, attrind, ctx);
   if (!base_urlpos)
     return;
   base_urlpos->ignore_when_downloading = 1;
@@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
   char *action = find_attr (tag, "action", &attrind);
   if (action)
     {
-      struct urlpos *action_urlpos = append_one_url (action, 0, tag,
-						     attrind, ctx);
-      if (action_urlpos)
-	action_urlpos->ignore_when_downloading = 1;
+      struct urlpos *up = append_one_url (action, tag, attrind, ctx);
+      if (up)
+	up->ignore_when_downloading = 1;
     }
 }
@@ -458,11 +471,15 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
    */
   if (href)
     {
-      char *rel = find_attr (tag, "rel", NULL);
-      int inlinep = (rel
-		     && (0 == strcasecmp (rel, "stylesheet")
-			 || 0 == strcasecmp (rel, "shortcut icon")));
-      append_one_url (href, inlinep, tag, attrind, ctx);
+      struct urlpos *up = append_one_url (href, tag, attrind, ctx);
+      if (up)
+	{
+	  char *rel = find_attr (tag, "rel", NULL);
+	  if (rel
+	      && (0 == strcasecmp (rel, "stylesheet")
+		  || 0 == strcasecmp (rel, "shortcut icon")))
+	    up->link_inline_p = 1;
+	}
     }
 }
@@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
       while (ISSPACE (*p))
 	++p;
 
-      entry = append_one_url (p, 0, tag, attrind, ctx);
+      entry = append_one_url (p, tag, attrind, ctx);
       if (entry)
 	{
 	  entry->link_refresh_p = 1;

View File

@@ -6,7 +6,7 @@ This file is part of GNU Wget.
 GNU Wget is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 
 GNU Wget is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
 /* Functions for maintaining the URL queue.  */
 
 struct queue_element {
-  const char *url;
-  const char *referer;
-  int depth;
-  struct queue_element *next;
+  const char *url;		/* the URL to download */
+  const char *referer;		/* the referring document */
+  int depth;			/* the depth */
+  unsigned int html_allowed :1;	/* whether the document is allowed to
+				   be treated as HTML. */
+  struct queue_element *next;	/* next element in queue */
 };
 
 struct url_queue {
@@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
 
 static void
 url_enqueue (struct url_queue *queue,
-	     const char *url, const char *referer, int depth)
+	     const char *url, const char *referer, int depth, int html_allowed)
 {
   struct queue_element *qel = xmalloc (sizeof (*qel));
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
+  qel->html_allowed = html_allowed;
   qel->next = NULL;
 
   ++queue->count;
@@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
 
 static int
 url_dequeue (struct url_queue *queue,
-	     const char **url, const char **referer, int *depth)
+	     const char **url, const char **referer, int *depth,
+	     int *html_allowed)
 {
   struct queue_element *qel = queue->head;
@@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
+  *html_allowed = qel->html_allowed;
 
   --queue->count;
@@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
      just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
   string_set_add (blacklist, start_url_parsed->url);
 
   while (1)
     {
       int descend = 0;
       char *url, *referer, *file = NULL;
-      int depth;
+      int depth, html_allowed;
       boolean dash_p_leaf_HTML = FALSE;
 
       if (downloaded_exceeds_quota ())
@@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
       if (!url_dequeue (queue,
 			(const char **)&url, (const char **)&referer,
-			&depth))
+			&depth, &html_allowed))
 	break;
 
       /* ...and download it.  Note that this download is in most cases
@@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
 	  DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 		   url, file));
-	  if (downloaded_html_set
+	  if (html_allowed
+	      && downloaded_html_set
 	      && string_set_contains (downloaded_html_set, file))
 	    descend = 1;
 	}
@@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
 	  status = retrieve_url (url, &file, &redirected, referer, &dt);
 	  opt.recursive = oldrec;
 
-	  if (file && status == RETROK
+	  if (html_allowed && file && status == RETROK
 	      && (dt & RETROKF) && (dt & TEXTHTML))
 	    descend = 1;
@@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
 				  blacklist))
 		  {
 		    url_enqueue (queue, xstrdup (child->url->url),
-				 xstrdup (url), depth + 1);
+				 xstrdup (url), depth + 1,
+				 child->link_expect_html);
 		    /* We blacklist the URL we have enqueued, because we
 		       don't want to enqueue (and hence download) the
 		       same URL twice.  */
@@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
      now.  */
   {
     char *d1, *d2;
-    int d3;
-    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
+    int d3, d4;
+    while (url_dequeue (queue,
+			(const char **)&d1, (const char **)&d2, &d3, &d4))
       {
 	xfree (d1);
 	FREE_MAYBE (d2);