From 1b3cdef574ed696bad1d0e94f54e0963771e3239 Mon Sep 17 00:00:00 2001
From: hniksic
Date: Fri, 10 Oct 2003 07:25:10 -0700
Subject: [PATCH] [svn] Don't descend into HTML that was downloaded by
 following <img src=...> and such.

---
 src/ChangeLog  | 10 ++++++
 src/convert.h  | 10 +++---
 src/html-url.c | 95 +++++++++++++++++++++++++++++---------------------
 src/recur.c    | 39 +++++++++++++--------
 4 files changed, 95 insertions(+), 59 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index e31b1989..a554f150 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,13 @@
+2003-10-10  Hrvoje Niksic
+
+	* recur.c (retrieve_tree): Don't descend into documents that are
+	not expected to contain HTML, regardless of their content-type.
+
+	* html-url.c (tag_url_attributes): Record which attributes are
+	supposed to yield HTML links that can be followed.
+	(tag_find_urls): Propagate that information to the caller through
+	struct urlpos.
+
 2003-10-10  Hrvoje Niksic
 
 	* hash.c (find_mapping): Return the next available mapping when
diff --git a/src/convert.h b/src/convert.h
index 66b7dfc7..fff8410f 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -56,11 +56,11 @@ struct urlpos {
 
   /* Information about the original link: */
 
-  unsigned int link_relative_p :1; /* was the link relative? */
-  unsigned int link_complete_p :1; /* was the link complete (with the
-                                      host name, etc.) */
-  unsigned int link_base_p     :1; /* was the link <base href=...> */
-  unsigned int link_inline_p   :1; /* needed to render the page. */
+  unsigned int link_relative_p :1; /* the link was relative */
+  unsigned int link_complete_p :1; /* the link was complete (had host name) */
+  unsigned int link_base_p     :1; /* the url came from <base href=...> */
+  unsigned int link_inline_p   :1; /* needed to render the page */
+  unsigned int link_expect_html :1; /* expected to contain HTML */
 
   unsigned int link_refresh_p  :1; /* link was received from
                                       <meta http-equiv=refresh content=...> */
diff --git a/src/html-url.c b/src/html-url.c
index 80f5b96c..c2ed2c58 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -121,11 +121,19 @@ static struct known_tag {
 /* tag_url_attributes documents which attributes of which tags contain
    URLs to harvest.  It is used by tag_find_urls.  */
 
-/* Defines for the FLAGS field; currently only one flag is defined. */
-
-/* This tag points to an external document not necessary for rendering this
-   document (i.e. it's not an inlined image, stylesheet, etc.). */
-#define TUA_EXTERNAL 1
+/* Defines for the FLAGS. */
+
+/* The link is "inline", i.e. needs to be retrieved for this document
+   to be correctly rendered.  Inline links include inlined images,
+   stylesheets, children frames, etc.  */
+#define ATTR_INLINE 1
+
+/* The link is expected to yield HTML contents.  It's important not to
+   try to follow HTML obtained by following e.g. <img src="...">
+   regardless of content-type.  Doing this causes infinite loops for
+   "images" that return non-404 error pages with links to the same
+   image.  */
+#define ATTR_HTML 2
 
 /* For tags handled by tag_find_urls: attributes that contain URLs to
    download.  */
@@ -134,26 +142,26 @@ static struct {
   const char *attr_name;
   int flags;
 } tag_url_attributes[] = {
-  { TAG_A,        "href",        TUA_EXTERNAL },
-  { TAG_APPLET,   "code",        0 },
-  { TAG_AREA,     "href",        TUA_EXTERNAL },
-  { TAG_BGSOUND,  "src",         0 },
-  { TAG_BODY,     "background",  0 },
-  { TAG_EMBED,    "href",        TUA_EXTERNAL },
-  { TAG_EMBED,    "src",         0 },
-  { TAG_FIG,      "src",         0 },
-  { TAG_FRAME,    "src",         0 },
-  { TAG_IFRAME,   "src",         0 },
-  { TAG_IMG,      "href",        0 },
-  { TAG_IMG,      "lowsrc",      0 },
-  { TAG_IMG,      "src",         0 },
-  { TAG_INPUT,    "src",         0 },
-  { TAG_LAYER,    "src",         0 },
-  { TAG_OVERLAY,  "src",         0 },
-  { TAG_SCRIPT,   "src",         0 },
-  { TAG_TABLE,    "background",  0 },
-  { TAG_TD,       "background",  0 },
-  { TAG_TH,       "background",  0 }
+  { TAG_A,        "href",        ATTR_HTML },
+  { TAG_APPLET,   "code",        ATTR_INLINE },
+  { TAG_AREA,     "href",        ATTR_HTML },
+  { TAG_BGSOUND,  "src",         ATTR_INLINE },
+  { TAG_BODY,     "background",  ATTR_INLINE },
+  { TAG_EMBED,    "href",        ATTR_HTML },
+  { TAG_EMBED,    "src",         ATTR_INLINE | ATTR_HTML },
+  { TAG_FIG,      "src",         ATTR_INLINE },
+  { TAG_FRAME,    "src",         ATTR_INLINE | ATTR_HTML },
+  { TAG_IFRAME,   "src",         ATTR_INLINE | ATTR_HTML },
+  { TAG_IMG,      "href",        ATTR_INLINE },
+  { TAG_IMG,      "lowsrc",      ATTR_INLINE },
+  { TAG_IMG,      "src",         ATTR_INLINE },
+  { TAG_INPUT,    "src",         ATTR_INLINE },
+  { TAG_LAYER,    "src",         ATTR_INLINE | ATTR_HTML },
+  { TAG_OVERLAY,  "src",         ATTR_INLINE | ATTR_HTML },
+  { TAG_SCRIPT,   "src",         ATTR_INLINE },
+  { TAG_TABLE,    "background",  ATTR_INLINE },
+  { TAG_TD,       "background",  ATTR_INLINE },
+  { TAG_TH,       "background",  ATTR_INLINE }
 };
 
 /* The lists of interesting tags and attributes are built dynamically,
@@ -262,7 +270,7 @@ struct map_context {
    size.  */
 
 static struct urlpos *
-append_one_url (const char *link_uri, int inlinep,
+append_one_url (const char *link_uri,
                 struct taginfo *tag, int attrind, struct map_context *ctx)
 {
   int link_has_scheme = url_has_scheme (link_uri);
@@ -326,7 +334,6 @@ append_one_url (const char *link_uri, int inlinep,
   newel->url = url;
   newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
   newel->size = tag->attrs[attrind].value_raw_size;
-  newel->link_inline_p = inlinep;
 
   /* A URL is relative if the host is not named, and the name does not
      start with `/'.  */
@@ -393,8 +400,15 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
           if (0 == strcasecmp (tag->attrs[attrind].name,
                                tag_url_attributes[i].attr_name))
             {
-              int flags = tag_url_attributes[i].flags;
-              append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
+              struct urlpos *up = append_one_url (link, tag, attrind, ctx);
+              if (up)
+                {
+                  int flags = tag_url_attributes[i].flags;
+                  if (flags & ATTR_INLINE)
+                    up->link_inline_p = 1;
+                  if (flags & ATTR_HTML)
+                    up->link_expect_html = 1;
+                }
             }
         }
     }
@@ -411,7 +425,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
   if (!newbase)
     return;
 
-  base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
+  base_urlpos = append_one_url (newbase, tag, attrind, ctx);
   if (!base_urlpos)
     return;
   base_urlpos->ignore_when_downloading = 1;
@@ -434,10 +448,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
   char *action = find_attr (tag, "action", &attrind);
   if (action)
     {
-      struct urlpos *action_urlpos = append_one_url (action, 0, tag,
-                                                     attrind, ctx);
-      if (action_urlpos)
-        action_urlpos->ignore_when_downloading = 1;
+      struct urlpos *up = append_one_url (action, tag, attrind, ctx);
+      if (up)
+        up->ignore_when_downloading = 1;
     }
 }
 
@@ -458,11 +471,15 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
   */
   if (href)
     {
-      char *rel = find_attr (tag, "rel", NULL);
-      int inlinep = (rel
-                     && (0 == strcasecmp (rel, "stylesheet")
-                         || 0 == strcasecmp (rel, "shortcut icon")));
-      append_one_url (href, inlinep, tag, attrind, ctx);
+      struct urlpos *up = append_one_url (href, tag, attrind, ctx);
+      if (up)
+        {
+          char *rel = find_attr (tag, "rel", NULL);
+          if (rel
+              && (0 == strcasecmp (rel, "stylesheet")
+                  || 0 == strcasecmp (rel, "shortcut icon")))
+            up->link_inline_p = 1;
+        }
     }
 }
 
@@ -511,7 +528,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
 
       while (ISSPACE (*p))
         ++p;
-      entry = append_one_url (p, 0, tag, attrind, ctx);
+      entry = append_one_url (p, tag, attrind, ctx);
       if (entry)
         {
           entry->link_refresh_p = 1;
diff --git a/src/recur.c b/src/recur.c
index 007354b7..bf367074 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -6,7 +6,7 @@ This file is part of GNU Wget.
 GNU Wget is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+ (at your option) any later version.
 
 GNU Wget is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -66,10 +66,13 @@ extern struct hash_table *downloaded_html_set;
 /* Functions for maintaining the URL queue.  */
 
 struct queue_element {
-  const char *url;
-  const char *referer;
-  int depth;
-  struct queue_element *next;
+  const char *url;              /* the URL to download */
+  const char *referer;          /* the referring document */
+  int depth;                    /* the depth */
+  unsigned int html_allowed :1; /* whether the document is allowed to
+                                   be treated as HTML. */
+
+  struct queue_element *next;   /* next element in queue */
 };
 
 struct url_queue {
@@ -102,12 +105,13 @@ url_queue_delete (struct url_queue *queue)
 
 static void
 url_enqueue (struct url_queue *queue,
-             const char *url, const char *referer, int depth)
+             const char *url, const char *referer, int depth, int html_allowed)
 {
   struct queue_element *qel = xmalloc (sizeof (*qel));
   qel->url = url;
   qel->referer = referer;
   qel->depth = depth;
+  qel->html_allowed = html_allowed;
   qel->next = NULL;
 
   ++queue->count;
@@ -130,7 +134,8 @@ url_enqueue (struct url_queue *queue,
 
 static int
 url_dequeue (struct url_queue *queue,
-             const char **url, const char **referer, int *depth)
+             const char **url, const char **referer, int *depth,
+             int *html_allowed)
 {
   struct queue_element *qel = queue->head;
 
@@ -144,6 +149,7 @@ url_dequeue (struct url_queue *queue,
   *url = qel->url;
   *referer = qel->referer;
   *depth = qel->depth;
+  *html_allowed = qel->html_allowed;
 
   --queue->count;
 
@@ -208,14 +214,14 @@ retrieve_tree (const char *start_url)
 
   /* Enqueue the starting URL.  Use start_url_parsed->url rather than
      just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0);
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
   string_set_add (blacklist, start_url_parsed->url);
 
   while (1)
     {
       int descend = 0;
       char *url, *referer, *file = NULL;
-      int depth;
+      int depth, html_allowed;
       boolean dash_p_leaf_HTML = FALSE;
 
       if (downloaded_exceeds_quota ())
@@ -227,7 +233,7 @@ retrieve_tree (const char *start_url)
 
       if (!url_dequeue (queue,
                         (const char **)&url, (const char **)&referer,
-                        &depth))
+                        &depth, &html_allowed))
         break;
 
       /* ...and download it.  Note that this download is in most cases
@@ -245,7 +251,8 @@ retrieve_tree (const char *start_url)
           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                    url, file));
 
-          if (downloaded_html_set
+          if (html_allowed
+              && downloaded_html_set
               && string_set_contains (downloaded_html_set, file))
             descend = 1;
         }
@@ -259,7 +266,7 @@ retrieve_tree (const char *start_url)
           status = retrieve_url (url, &file, &redirected, referer, &dt);
           opt.recursive = oldrec;
 
-          if (file && status == RETROK
+          if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
             descend = 1;
 
@@ -341,7 +348,8 @@ retrieve_tree (const char *start_url)
                                         blacklist))
               {
                 url_enqueue (queue, xstrdup (child->url->url),
-                             xstrdup (url), depth + 1);
+                             xstrdup (url), depth + 1,
+                             child->link_expect_html);
                 /* We blacklist the URL we have enqueued, because we
                    don't want to enqueue (and hence download) the same
                    URL twice.  */
@@ -382,8 +390,9 @@ retrieve_tree (const char *start_url)
      now.  */
   {
     char *d1, *d2;
-    int d3;
-    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
+    int d3, d4;
+    while (url_dequeue (queue,
+                        (const char **)&d1, (const char **)&d2, &d3, &d4))
       {
         xfree (d1);
         FREE_MAYBE (d2);