Mirror of https://github.com/moparisthebest/wget (synced 2024-07-03 16:38:41 -04:00)
[svn] Fix downloading of duplicate URLs.
Published in <sxsvgfmu2bj.fsf@florida.arsdigita.de>.
commit 0fdc1bd8c0
parent abf215c1e3
ChangeLog

@@ -1,3 +1,22 @@
+2001-12-04  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* recur.c (retrieve_tree): Check whether the URL was already
+	downloaded before downloading it again.
+	(descend_url_p): Renamed to download_child_p.
+	(register_download): When one URL is downloaded to a file already
+	"owned" by another URL, delete all references that map any URL to
+	that file.
+	(register_delete_file): New function.
+	(retrieve_tree): Use it after deleting a file.
+
+	* url.c (url_parse): Re-canonicalize the URL also if the path is
+	empty, so that e.g. "http://www.server.com" ->
+	"http://www.server.com/".
+	(lowercase_str): Use ISUPPER instead of !ISLOWER.
+
+	* retr.c (retrieve_url): Use the canonical URL form when calling
+	register_download().
+
 2001-12-04  Ian Abbott  <abbotti@mev.co.uk>
 
 	* snprintf.c (dopr): Use `unsigned int' as the second argument to
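The heart of the recur.c change is easiest to see in isolation. Below is a minimal, self-contained sketch, not wget code: a fixed-size linear map stands in for wget's hash_table, and fake_download for retrieve_url. It shows the new check in retrieve_tree, which consults the url -> file map before fetching a dequeued URL:

#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 64

/* Toy stand-in for dl_url_file_map. */
struct mapping { const char *url; const char *file; };
static struct mapping dl_url_file[MAX_ENTRIES];
static int n_mappings;

/* Return the file URL was saved to, or NULL if not yet downloaded. */
static const char *
lookup_file (const char *url)
{
  int i;
  for (i = 0; i < n_mappings; i++)
    if (strcmp (dl_url_file[i].url, url) == 0)
      return dl_url_file[i].file;
  return NULL;
}

static void
fake_download (const char *url, const char *file)
{
  const char *old = lookup_file (url);
  if (old)
    {
      /* Mirrors the new check at the top of the download loop. */
      printf ("Already downloaded \"%s\", reusing it from \"%s\".\n",
              url, old);
      return;
    }
  printf ("Downloading %s -> %s\n", url, file);
  if (n_mappings < MAX_ENTRIES)
    {
      dl_url_file[n_mappings].url = url;
      dl_url_file[n_mappings].file = file;
      n_mappings++;
    }
}

int
main (void)
{
  /* E.g. `wget -r URL1 URL2' where both trees contain the same page. */
  fake_download ("http://www.server.com/a.html", "a.html");
  fake_download ("http://www.server.com/a.html", "a.html");
  return 0;
}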
src/recur.c (235 changed lines)
@@ -58,6 +58,8 @@ static struct hash_table *dl_url_file_map;
    in order.  If you need to check whether a file has been downloaded,
    use a hash table, e.g. dl_file_url_map.  */
 static slist *downloaded_html_files;
 
+static void register_delete_file PARAMS ((const char *));
+
 /* Functions for maintaining the URL queue.  */
 
@@ -150,8 +152,8 @@ url_dequeue (struct url_queue *queue,
   return 1;
 }
 
-static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
-                                  struct url *, struct hash_table *));
+static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
+                                     struct url *, struct hash_table *));
 static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                        struct url *, struct hash_table *));
 
@@ -205,52 +207,63 @@ retrieve_tree (const char *start_url)
 
       if (downloaded_exceeds_quota ())
         break;
       if (status == FWRITEERR)
         break;
 
-      /* Get the next URL from the queue. */
+      /* Get the next URL from the queue... */
 
       if (!url_dequeue (queue,
                         (const char **)&url, (const char **)&referer,
                         &depth))
         break;
 
-      /* And download it. */
-
-      {
-        int dt = 0;
-        char *redirected = NULL;
-        int oldrec = opt.recursive;
-
-        opt.recursive = 0;
-        status = retrieve_url (url, &file, &redirected, NULL, &dt);
-        opt.recursive = oldrec;
-
-        if (file && status == RETROK
-            && (dt & RETROKF) && (dt & TEXTHTML))
-          descend = 1;
-
-        if (redirected)
-          {
-            /* We have been redirected, possibly to another host, or
-               different path, or wherever.  Check whether we really
-               want to follow it.  */
-            if (descend)
-              {
-                if (!descend_redirect_p (redirected, url, depth,
-                                         start_url_parsed, blacklist))
-                  descend = 0;
-                else
-                  /* Make sure that the old pre-redirect form gets
-                     blacklisted. */
-                  string_set_add (blacklist, url);
-              }
-
-            xfree (url);
-            url = redirected;
-          }
-      }
+      /* ...and download it.  Note that this download is in most cases
+         unconditional, as download_child_p already makes sure a file
+         doesn't get enqueued twice -- and yet this check is here, and
+         not in download_child_p.  This is so that if you run `wget -r
+         URL1 URL2', and a random URL is encountered once under URL1
+         and again under URL2, but at a different (possibly smaller)
+         depth, we want the URL's children to be taken into account
+         the second time.  */
+      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
+        {
+          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
+                   url, (char *)hash_table_get (dl_url_file_map, url)));
+        }
+      else
+        {
+          int dt = 0;
+          char *redirected = NULL;
+          int oldrec = opt.recursive;
+
+          opt.recursive = 0;
+          status = retrieve_url (url, &file, &redirected, NULL, &dt);
+          opt.recursive = oldrec;
+
+          if (file && status == RETROK
+              && (dt & RETROKF) && (dt & TEXTHTML))
+            descend = 1;
+
+          if (redirected)
+            {
+              /* We have been redirected, possibly to another host, or
+                 different path, or wherever.  Check whether we really
+                 want to follow it.  */
+              if (descend)
+                {
+                  if (!descend_redirect_p (redirected, url, depth,
+                                           start_url_parsed, blacklist))
+                    descend = 0;
+                  else
+                    /* Make sure that the old pre-redirect form gets
+                       blacklisted. */
+                    string_set_add (blacklist, url);
+                }
+
+              xfree (url);
+              url = redirected;
+            }
+        }
 
       if (descend
           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
@@ -305,8 +318,8 @@ retrieve_tree (const char *start_url)
                 continue;
               if (dash_p_leaf_HTML && !child->link_inline_p)
                 continue;
-              if (descend_url_p (child, url_parsed, depth, start_url_parsed,
-                                 blacklist))
+              if (download_child_p (child, url_parsed, depth, start_url_parsed,
+                                    blacklist))
                 {
                   url_enqueue (queue, xstrdup (child->url->url),
                                xstrdup (url), depth + 1);
@@ -338,6 +351,7 @@ retrieve_tree (const char *start_url)
                      file);
           if (unlink (file))
             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+          register_delete_file (file);
         }
 
       xfree (url);
@@ -379,8 +393,8 @@ retrieve_tree (const char *start_url)
    will help if those URLs are encountered many times.  */
 
 static int
-descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
-               struct url *start_url_parsed, struct hash_table *blacklist)
+download_child_p (const struct urlpos *upos, struct url *parent, int depth,
+                  struct url *start_url_parsed, struct hash_table *blacklist)
 {
   struct url *u = upos->url;
   const char *url = u->url;
@@ -555,10 +569,10 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   return 0;
 }
 
-/* This function determines whether we should descend the children of
-   the URL whose download resulted in a redirection, possibly to
-   another host, etc.  It is needed very rarely, and thus it is merely
-   a simple-minded wrapper around descend_url_p.  */
+/* This function determines whether we will consider downloading the
+   children of a URL whose download resulted in a redirection,
+   possibly to another host, etc.  It is needed very rarely, and thus
+   it is merely a simple-minded wrapper around download_child_p.  */
 
 static int
 descend_redirect_p (const char *redirected, const char *original, int depth,
@@ -578,8 +592,8 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
   memset (upos, 0, sizeof (*upos));
   upos->url = new_parsed;
 
-  success = descend_url_p (upos, orig_parsed, depth,
-                           start_url_parsed, blacklist);
+  success = download_child_p (upos, orig_parsed, depth,
+                              start_url_parsed, blacklist);
 
   url_free (orig_parsed);
   url_free (new_parsed);
@@ -592,22 +606,95 @@ descend_redirect_p (const char *redirected, const char *original, int depth,
 }
 
 
-/* Register that URL has been successfully downloaded to FILE. */
+#define ENSURE_TABLES_EXIST do {                        \
+  if (!dl_file_url_map)                                 \
+    dl_file_url_map = make_string_hash_table (0);       \
+  if (!dl_url_file_map)                                 \
+    dl_url_file_map = make_string_hash_table (0);       \
+} while (0)
+
+static int
+dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
+{
+  char *mapping_url = (char *)key;
+  char *mapping_file = (char *)value;
+  char *file = (char *)arg;
+
+  if (0 == strcmp (mapping_file, file))
+    {
+      hash_table_remove (dl_url_file_map, mapping_url);
+      xfree (mapping_url);
+      xfree (mapping_file);
+    }
+
+  /* Continue mapping. */
+  return 0;
+}
+
+/* Remove all associations from various URLs to FILE from dl_url_file_map. */
+
+static void
+dissociate_urls_from_file (const char *file)
+{
+  hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
+                  (char *)file);
+}
+
+/* Register that URL has been successfully downloaded to FILE.  This
+   is used by the link conversion code to convert references to URLs
+   to references to local files.  It is also being used to check if a
+   URL has already been downloaded.  */
 
 void
 register_download (const char *url, const char *file)
 {
-  if (!opt.convert_links)
-    return;
-  if (!dl_file_url_map)
-    dl_file_url_map = make_string_hash_table (0);
-  if (!dl_url_file_map)
-    dl_url_file_map = make_string_hash_table (0);
+  char *old_file, *old_url;
 
-  if (!hash_table_contains (dl_file_url_map, file))
-    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
-  if (!hash_table_contains (dl_url_file_map, url))
-    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
+  ENSURE_TABLES_EXIST;
+
+  /* With some forms of retrieval, it is possible, although not
+     likely, for different URLs to resolve to the same file name.  For
+     example, "http://www.server.com/" and
+     "http://www.server.com/index.html" will both resolve to the same
+     file, "index.html".  If both are downloaded, the second download
+     will override the first one.
+
+     If that happens, dissociate the old file name from the URL.  */
+
+  if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
+    {
+      if (0 == strcmp (url, old_url))
+        /* We have somehow managed to download the same URL twice.
+           Nothing to do.  */
+        return;
+
+      hash_table_remove (dl_file_url_map, file);
+      xfree (old_file);
+      xfree (old_url);
+
+      /* Remove all the URLs that point to this file.  Yes, there can
+         be more than one such URL, because we store redirections as
+         multiple entries in dl_url_file_map.  For example, if URL1
+         redirects to URL2 which gets downloaded to FILE, we map both
+         URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
+         only points to URL2.)  When another URL gets loaded to FILE,
+         we want both URL1 and URL2 dissociated from it.
+
+         This is a relatively expensive operation because it performs
+         a linear search of the whole hash table, but it should be
+         called very rarely, only when two URLs resolve to the same
+         file name, *and* the "<file>.1" extensions are turned off.
+         In other words, almost never.  */
+      dissociate_urls_from_file (file);
+    }
+
+  /* A URL->FILE mapping is not possible without a FILE->URL mapping.
+     If the latter were present, it should have been removed by the
+     above `if'.  */
+  assert (!hash_table_contains (dl_url_file_map, url));
+
+  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
+  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 }
 
 /* Register that FROM has been redirected to TO.  This assumes that TO
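The dissociation logic in the new register_download (the hunk just above) can be modeled compactly. In this sketch, again with toy linear maps instead of wget's hash_table, file -> url stays one-to-one while url -> file may be many-to-one because redirections add extra entries, so when a file changes owners every url -> file entry pointing at it must be dropped:

#include <stdio.h>
#include <string.h>

#define MAXN 32

struct map { char k[128]; char v[128]; int live; };
static struct map file_url[MAXN];  /* file -> url, one-to-one */
static struct map url_file[MAXN];  /* url -> file, many-to-one */

static struct map *
find (struct map *m, const char *key)
{
  int i;
  for (i = 0; i < MAXN; i++)
    if (m[i].live && strcmp (m[i].k, key) == 0)
      return &m[i];
  return NULL;
}

static void
put (struct map *m, const char *key, const char *val)
{
  int i;
  for (i = 0; i < MAXN; i++)
    if (!m[i].live)
      {
        strcpy (m[i].k, key);
        strcpy (m[i].v, val);
        m[i].live = 1;
        return;
      }
}

/* Equivalent of dissociate_urls_from_file: drop every url -> file
   entry whose value is FILE. */
static void
dissociate_urls_from_file (const char *file)
{
  int i;
  for (i = 0; i < MAXN; i++)
    if (url_file[i].live && strcmp (url_file[i].v, file) == 0)
      url_file[i].live = 0;
}

static void
register_download (const char *url, const char *file)
{
  struct map *owner = find (file_url, file);
  if (owner)
    {
      if (strcmp (owner->v, url) == 0)
        return;                        /* same URL downloaded twice */
      owner->live = 0;                 /* the file changes hands... */
      dissociate_urls_from_file (file); /* ...drop stale URL entries */
    }
  put (file_url, file, url);
  put (url_file, url, file);
}

int
main (void)
{
  register_download ("http://www.server.com/", "index.html");
  register_download ("http://www.server.com/index.html", "index.html");
  /* Only the second URL still maps to index.html now. */
  printf ("%s\n",
          find (url_file, "http://www.server.com/") ? "stale" : "dissociated");
  return 0;
}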
@@ -619,8 +706,7 @@ register_redirection (const char *from, const char *to)
 {
   char *file;
 
-  if (!opt.convert_links)
-    return;
+  ENSURE_TABLES_EXIST;
 
   file = hash_table_get (dl_url_file_map, to);
   assert (file != NULL);
@@ -628,7 +714,25 @@ register_redirection (const char *from, const char *to)
   hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 }
 
-/* Register that URL corresponds to the HTML file FILE. */
+/* Register that the file has been deleted. */
+
+static void
+register_delete_file (const char *file)
+{
+  char *old_url, *old_file;
+
+  ENSURE_TABLES_EXIST;
+
+  if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
+    return;
+
+  hash_table_remove (dl_file_url_map, file);
+  xfree (old_file);
+  xfree (old_url);
+  dissociate_urls_from_file (file);
+}
+
+/* Register that FILE is an HTML file that has been downloaded. */
 
 void
 register_html (const char *url, const char *file)
@@ -672,15 +776,16 @@ convert_all_links (void)
       struct urlpos *urls, *cur_url;
       char *url;
 
-      DEBUGP (("Rescanning %s\n", html->string));
-
       /* Determine the URL of the HTML file.  get_urls_html will need
          it.  */
       url = hash_table_get (dl_file_url_map, html->string);
-      if (url)
-        DEBUGP (("It should correspond to %s.\n", url));
-      else
-        DEBUGP (("I cannot find the corresponding URL.\n"));
+      if (!url)
+        {
+          DEBUGP (("Apparently %s has been removed.\n", html->string));
+          continue;
+        }
+
+      DEBUGP (("Rescanning %s (from %s)\n", html->string, url));
 
       /* Parse the HTML file... */
       urls = get_urls_html (html->string, url, NULL);
src/retr.c

@@ -511,11 +511,11 @@ retrieve_url (const char *origurl, char **file, char **newloc,
     {
       if (*dt & RETROKF)
         {
-          register_download (url, local_file);
+          register_download (u->url, local_file);
           if (redirections)
-            register_all_redirections (redirections, url);
+            register_all_redirections (redirections, u->url);
           if (*dt & TEXTHTML)
-            register_html (url, local_file);
+            register_html (u->url, local_file);
         }
     }
 
src/url.c (12 changed lines)
@@ -595,7 +595,7 @@ lowercase_str (char *str)
 {
   int change = 0;
   for (; *str; str++)
-    if (!ISLOWER (*str))
+    if (ISUPPER (*str))
       {
         change = 1;
         *str = TOLOWER (*str);
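The ISUPPER change is not cosmetic: characters such as digits and dots are neither upper nor lower case, so !ISLOWER (*str) was true for them, and since every host name contains a dot, lowercase_str reported a change for virtually every URL. A quick demonstration with plain <ctype.h>, which wget's ISLOWER/ISUPPER macros are, in essence, safe wrappers around:

#include <ctype.h>
#include <stdio.h>

int
main (void)
{
  const char *s = "ftp2.example.com";
  const char *p;
  int old_test = 0, new_test = 0;
  for (p = s; *p; p++)
    {
      if (!islower ((unsigned char) *p)) old_test = 1; /* trips on '2', '.' */
      if (isupper ((unsigned char) *p))  new_test = 1; /* correctly stays 0 */
    }
  printf ("old: change=%d, new: change=%d\n", old_test, new_test);
  return 0;
}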
@@ -787,13 +787,11 @@ url_parse (const char *url, int *error)
   if (fragment_b)
     u->fragment = strdupdelim (fragment_b, fragment_e);
 
-  if (path_modified || u->fragment || host_modified)
+  if (path_modified || u->fragment || host_modified || path_b == path_e)
     {
-      /* If path_simplify modified the path, or if a fragment is
-         present, or if the original host name had caps in it, make
-         sure that u->url is equivalent to what would be printed by
-         url_string.  */
+      /* If we suspect that a transformation has rendered what
+         url_string might return different from URL_ENCODED, rebuild
+         u->url using url_string.  */
       u->url = url_string (u, 0);
 
       if (url_encoded != url)
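Together with the retr.c change above, the new path_b == path_e clause guarantees that a URL with an empty path is rebuilt with a trailing slash, so "http://www.server.com" and "http://www.server.com/" produce one canonical key for dl_url_file_map. A sketch of just that normalization follows; the canonicalize helper is hypothetical, since wget's real rebuild goes through url_string:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Append "/" when an http URL has no path component at all. */
static char *
canonicalize (const char *url)
{
  const char *host = url + strlen ("http://");
  const char *slash = strchr (host, '/');
  size_t len = strlen (url);
  char *out;

  if (slash)                    /* path present: copy unchanged */
    {
      out = malloc (len + 1);
      memcpy (out, url, len + 1);
      return out;
    }

  out = malloc (len + 2);       /* room for '/' and NUL */
  memcpy (out, url, len);
  out[len] = '/';
  out[len + 1] = '\0';
  return out;
}

int
main (void)
{
  char *a = canonicalize ("http://www.server.com");
  char *b = canonicalize ("http://www.server.com/");
  printf ("%s\n%s\nequal: %d\n", a, b, strcmp (a, b) == 0);
  free (a);
  free (b);
  return 0;
}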