[svn] Recursion and progress bar tweaks.

Published in <sxsd727cvc0.fsf@florida.arsdigita.de>.
Author: hniksic
Date:   2001-11-25 13:03:30 -08:00
parent df05e7ff10
commit 3afb9c659a
6 changed files with 78 additions and 53 deletions

src/ChangeLog

@@ -1,3 +1,15 @@
+2001-11-25  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* recur.c (descend_url_p): Be more conservative with blacklisting
+	URLs.
+	(convert_all_links): Print how many files have been converted, and
+	how long it took.
+
+	* progress.c (create_image): Place the number of downloaded bytes
+	right after the progress bar.
+
+	* utils.c (suffix): Return a pointer into the string.
+
 2001-11-25  Hrvoje Niksic  <hniksic@arsdigita.com>
 
 	* url.c (convert_links): Handle CO_NULLIFY_BASE.

src/http.c

@@ -1453,7 +1453,6 @@ File `%s' already there, will not retrieve.\n"), *hstat.local_file);
           && (!strcmp (suf, "html") || !strcmp (suf, "htm")))
         *dt |= TEXTHTML;
-      FREE_MAYBE (suf);
       FREE_MAYBE (dummy);
       return RETROK;
     }

src/progress.c

@@ -477,24 +477,24 @@ create_image (struct bar_progress *bp, long dltime)
   long size = bp->initial_length + bp->count;
 
   /* The progress bar should look like this:
-     xx% [=======>             ] xx KB/s nnnnn ETA 00:00
+     xx% [=======>             ] nn.nnn rrK/s ETA 00:00
 
      Calculate its geometry:
-     "xx% " or "100%"  - percentage               - 4 chars exactly
-     "[]"              - progress bar decorations - 2 chars exactly
-     "1012.56K/s "     - dl rate                  - 11 chars exactly
-     "n,nnn,nnn,nnn "  - downloaded bytes         - 14 or less chars
-     "ETA xx:xx:xx"    - ETA                      - 12 or less chars
+     "xx% " or "100%"  - percentage               - 4 chars exactly
+     "[]"              - progress bar decorations - 2 chars exactly
+     " n,nnn,nnn,nnn"  - downloaded bytes         - 14 or less chars
+     " 1012.56K/s"     - dl rate                  - 11 chars exactly
+     " ETA xx:xx:xx"   - ETA                      - 13 or less chars
 
-     "=====>..."       - progress bar content     - the rest
+     "=====>..."       - progress bar content     - the rest
   */
-  int progress_size = screen_width - (4 + 2 + 11 + 14 + 12);
+  int progress_size = screen_width - (4 + 2 + 14 + 11 + 13);
 
   if (progress_size < 5)
     progress_size = 0;
 
-  /* "xxx%" */
+  /* "xx% " */
   if (bp->total_length > 0)
     {
       int percentage = (int)(100.0 * size / bp->total_length);
@@ -509,12 +509,13 @@ create_image (struct bar_progress *bp, long dltime)
     }
   else
     {
-      int i = 5;
-      while (i--)
-        *p++ = ' ';
+      *p++ = ' ';
+      *p++ = ' ';
+      *p++ = ' ';
+      *p++ = ' ';
     }
 
-  /* The progress bar: "|====>      |" */
+  /* The progress bar: "[====>      ]" */
   if (progress_size && bp->total_length > 0)
     {
       double fraction = (double)size / bp->total_length;
@@ -566,30 +567,30 @@ create_image (struct bar_progress *bp, long dltime)
       ++bp->tick;
     }
 
-  /* "1012.45K/s " */
+  /* " 1,234,567" */
+  /* If there are 7 or less digits (9 because of "legible" comas),
+     print the number in constant space.  This will prevent the rest
+     of the line jerking at the beginning of download, but without
+     assigning maximum width in all cases.  */
+  sprintf (p, " %9s", legible (size));
+  p += strlen (p);
+
+  /* " 1012.45K/s" */
   if (dltime && bp->count)
     {
      static char *short_units[] = { "B/s", "K/s", "M/s", "G/s" };
      int units = 0;
      double dlrate = calc_rate (bp->count, dltime, &units);
-      sprintf (p, "%7.2f%s ", dlrate, short_units[units]);
+      sprintf (p, " %7.2f%s", dlrate, short_units[units]);
      p += strlen (p);
     }
   else
     {
-      strcpy (p, "  --.-- K/s ");
-      p += 12;
+      strcpy (p, "   --.--K/s");
+      p += 11;
     }
 
-  /* "1,234,567 " */
-  /* If there are 7 or less digits (9 because of "legible" comas),
-     print the number in constant space.  This will prevent the "ETA"
-     string from jerking as the data begins to arrive.  */
-  sprintf (p, "%9s", legible (size));
-  p += strlen (p);
-  *p++ = ' ';
 
-  /* "ETA xx:xx:xx" */
+  /* " ETA xx:xx:xx" */
   if (bp->total_length > 0 && bp->count > 0)
     {
       int eta, eta_hrs, eta_min, eta_sec;
@@ -605,6 +606,7 @@ create_image (struct bar_progress *bp, long dltime)
       /*printf ("\neta: %d, %d %d %d\n", eta, eta_hrs, eta_min, eta_sec);*/
       /*printf ("\n%ld %f %ld %ld\n", dltime, tm_sofar, bytes_remaining, bp->count);*/
 
+      *p++ = ' ';
       *p++ = 'E';
       *p++ = 'T';
       *p++ = 'A';
@@ -621,8 +623,8 @@ create_image (struct bar_progress *bp, long dltime)
     }
   else if (bp->total_length > 0)
     {
-      strcpy (p, "ETA --:--");
-      p += 9;
+      strcpy (p, " ETA --:--");
+      p += 10;
     }
 
   assert (p - bp->buffer <= screen_width);
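
The geometry comment above pins every field to a fixed width, so only the bar stretches with the screen and nothing to the right of a growing number ever moves. A minimal standalone sketch of that idea follows; the widths come from the comment, while render_line and its arguments are simplified stand-ins rather than wget's actual create_image:

#include <stdio.h>

/* Compose one progress line from fixed-width fields so that a growing
   byte count or rate never shifts anything to its right.  Widths
   mirror the comment in the diff: 4 (percent) + 2 (brackets)
   + 14 (bytes) + 11 (rate) + 13 (ETA); the bar takes the rest. */
static void
render_line (char *buf, int screen_width, int percent, long bytes,
             double rate_kbs, int eta_hrs, int eta_min, int eta_sec)
{
  int bar_size = screen_width - (4 + 2 + 14 + 11 + 13);
  int filled = bar_size * percent / 100;
  char *p = buf;
  int i;

  p += sprintf (p, "%3d%%", percent);       /* "xx% " or "100%": 4 cols */
  *p++ = '[';
  for (i = 0; i < bar_size; i++)            /* "=====>...": the rest */
    *p++ = (i < filled) ? '=' : ' ';
  *p++ = ']';
  p += sprintf (p, " %13ld", bytes);        /* " n,nnn,nnn,nnn": 14 cols */
  p += sprintf (p, " %7.2fK/s", rate_kbs);  /* " 1012.45K/s": 11 cols */
  sprintf (p, " ETA %02d:%02d:%02d",        /* " ETA xx:xx:xx": 13 cols */
           eta_hrs, eta_min, eta_sec);
}

int
main (void)
{
  char line[128];

  render_line (line, 60, 42, 123456L, 87.50, 0, 1, 23);
  puts (line);
  render_line (line, 60, 43, 1234567L, 88.10, 0, 1, 20);
  puts (line);    /* same length as before: no field has jerked */
  return 0;
}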

src/recur.c

@@ -149,7 +149,7 @@ url_dequeue (struct url_queue *queue,
   xfree (qel);
   return 1;
 }
 
 static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                   struct url *, struct hash_table *));
@@ -182,7 +182,8 @@ retrieve_tree (const char *start_url)
   /* The queue of URLs we need to load. */
   struct url_queue *queue = url_queue_new ();
 
-  /* The URLs we decided we don't want to load. */
+  /* The URLs we do not wish to enqueue, because they are already in
+     the queue, but haven't been downloaded yet.  */
   struct hash_table *blacklist = make_string_hash_table (0);
 
   /* We'll need various components of this, so better get it over with
@@ -242,9 +243,6 @@ retrieve_tree (const char *start_url)
                tree.  The recursion is partial in that we won't
                traverse any <A> or <AREA> tags, nor any <LINK> tags
                except for <LINK REL="stylesheet">. */
-            /* #### This would be the place to implement the TODO
-               entry saying that -p should do two more hops on
-               framesets. */
             dash_p_leaf_HTML = TRUE;
           else
             {
@@ -348,7 +346,11 @@ retrieve_tree (const char *start_url)
 /* Based on the context provided by retrieve_tree, decide whether a
    URL is to be descended to.  This is only ever called from
-   retrieve_tree, but is in a separate function for clarity. */
+   retrieve_tree, but is in a separate function for clarity.
+
+   The most expensive checks (such as those for robots) are memoized
+   by storing these URLs to BLACKLIST.  This may or may not help.  It
+   will help if those URLs are encountered many times.  */
 
 static int
 descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
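
The new comment describes a memoization: run the expensive test once, record the URL in BLACKLIST, and let repeat encounters short-circuit. A self-contained toy version of the pattern follows; a fixed array stands in for the string hash table that make_string_hash_table provides, expensive_check stands in for the robots.txt lookup, and none of it is wget's actual code:

#include <stdio.h>
#include <string.h>

/* Toy "blacklist": a fixed array of pointers standing in for wget's
   string hash table.  Stored strings must outlive the set (the real
   code duplicates them into the table). */
#define SET_MAX 64
static const char *blacklist[SET_MAX];
static int blacklist_len;

static int
set_contains (const char *s)
{
  int i;
  for (i = 0; i < blacklist_len; i++)
    if (!strcmp (blacklist[i], s))
      return 1;
  return 0;
}

static void
set_add (const char *s)
{
  if (blacklist_len < SET_MAX)
    blacklist[blacklist_len++] = s;
}

/* Stand-in for an expensive test such as consulting robots.txt. */
static int
expensive_check (const char *url)
{
  printf ("  (expensive check on %s)\n", url);
  return strstr (url, "/private/") == NULL;
}

/* Decide whether to descend to URL.  Cheap rejections fall through
   without being recorded; only the costly "no" is memoized, so a
   repeat encounter skips the expensive work -- the same policy the
   rewritten descend_url_p applies to its robots check. */
static int
descend_p (const char *url)
{
  if (set_contains (url))
    return 0;                   /* memoized rejection */
  if (!expensive_check (url))
    {
      set_add (url);            /* remember only the costly "no" */
      return 0;
    }
  return 1;
}

int
main (void)
{
  static const char *urls[] = { "http://host/a.html",
                                "http://host/private/b.html",
                                "http://host/private/b.html" };
  int i;
  for (i = 0; i < 3; i++)
    printf ("%s -> %s\n", urls[i], descend_p (urls[i]) ? "descend" : "skip");
  return 0;
}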
@@ -391,7 +393,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
     {
       DEBUGP (("Not following non-HTTP schemes.\n"));
-      goto blacklist;
+      goto out;
     }
 
   /* 2. If it is an absolute link and they are not followed, throw it
@@ -400,7 +402,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (opt.relative_only && !upos->link_relative_p)
     {
       DEBUGP (("It doesn't really look like a relative link.\n"));
-      goto blacklist;
+      goto out;
     }
 
   /* 3. If its domain is not to be accepted/looked-up, chuck it
@@ -408,7 +410,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (!accept_domain (u))
     {
       DEBUGP (("The domain was not accepted.\n"));
-      goto blacklist;
+      goto out;
     }
 
   /* 4. Check for parent directory.
@@ -423,7 +425,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!frontcmp (parent->dir, u->dir))
         {
           DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
-          goto blacklist;
+          goto out;
         }
     }
 
@@ -435,13 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!accdir (u->dir, ALLABS))
         {
           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
-          goto blacklist;
+          goto out;
         }
     }
 
   /* 6. */
   {
-    char *suf = NULL;
+    char *suf;
 
     /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
@@ -466,11 +468,9 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
-           FREE_MAYBE (suf);
-           goto blacklist;
+           goto out;
          }
       }
-    FREE_MAYBE (suf);
   }
 
   /* 7. */
@@ -479,7 +479,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
     {
       DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                u->host, parent->host));
-      goto blacklist;
+      goto out;
     }
 
   /* 8. */
@@ -509,7 +509,8 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!res_match_path (specs, u->path))
         {
           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
-          goto blacklist;
+          string_set_add (blacklist, url);
+          goto out;
         }
     }
 
@@ -519,9 +520,6 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   return 1;
 
- blacklist:
-  string_set_add (blacklist, url);
-
  out:
   DEBUGP (("Decided NOT to load it.\n"));
@@ -604,6 +602,11 @@ void
 convert_all_links (void)
 {
   slist *html;
+  struct wget_timer *timer;
+  long msecs;
+  int file_count = 0;
+
+  timer = wtimer_new ();
 
   /* Destructively reverse downloaded_html_files to get it in the right order.
      recursive_retrieve() used slist_prepend() consistently. */
@@ -675,11 +678,19 @@ convert_all_links (void)
               cur_url->local_name = NULL;
             }
         }
 
       /* Convert the links in the file. */
       convert_links (html->string, urls);
+      ++file_count;
 
       /* Free the data. */
       free_urlpos (urls);
     }
+
+  msecs = wtimer_elapsed (timer);
+  wtimer_delete (timer);
+
+  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
+             file_count, (double)msecs / 1000);
 }
 
 /* Cleanup the data structures associated with recursive retrieving
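
The timing added here uses wget's wtimer_new / wtimer_elapsed / wtimer_delete helpers, which report elapsed milliseconds. A portable approximation of the same measure-count-report pattern, with standard clock() (CPU time rather than wall time) standing in for the wget timer:

#include <stdio.h>
#include <time.h>

int
main (void)
{
  clock_t start = clock ();     /* wget: timer = wtimer_new (); */
  int file_count = 0;
  int i;
  double secs;

  for (i = 0; i < 5; i++)
    {
      /* ... convert_links (file, urls) would run here ... */
      ++file_count;
    }

  /* wget: msecs = wtimer_elapsed (timer); wtimer_delete (timer); */
  secs = (double) (clock () - start) / CLOCKS_PER_SEC;
  printf ("Converted %d files in %.2f seconds.\n", file_count, secs);
  return 0;
}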

src/retr.c

@@ -336,7 +336,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       char *suf = suffix (u->local);
       if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
         *dt |= TEXTHTML;
-      FREE_MAYBE (suf);
     }
 #endif
 }

src/utils.c

@@ -904,7 +904,7 @@ in_acclist (const char *const *accepts, const char *s, int backward)
   return 0;
 }
 
-/* Return the malloc-ed suffix of STR.  For instance:
+/* Return the location of STR's suffix (file extension).  Examples:
    suffix ("foo.bar")       -> "bar"
    suffix ("foo.bar.baz")   -> "baz"
    suffix ("/foo/bar")      -> NULL
@@ -914,9 +914,11 @@ suffix (const char *str)
 {
   int i;
 
-  for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--);
+  for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
+    ;
+
   if (str[i++] == '.')
-    return xstrdup (str + i);
+    return (char *)str + i;
   else
     return NULL;
 }
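
suffix() now returns a pointer into its argument rather than an xstrdup'ed copy, which is why the FREE_MAYBE (suf) calls disappear from http.c and retr.c above: the result aliases the caller's string, must not be freed, and stays valid only as long as that string does. A small usage sketch (main and the sample file name are illustrative; the strcasecmp test mirrors the retr.c caller):

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* suffix() as rewritten above: scan backward to the last '.' or '/';
   a '.' means an extension was found, a '/' (or running off the
   front) means there is none. */
static char *
suffix (const char *str)
{
  int i;

  for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
    ;

  if (str[i++] == '.')
    return (char *) str + i;    /* points into STR; do not free */
  else
    return NULL;
}

int
main (void)
{
  const char *name = "index.html";
  char *suf = suffix (name);

  /* The old callers needed FREE_MAYBE (suf); now the pointer merely
     aliases NAME and no cleanup is required. */
  if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
    printf ("%s looks like HTML\n", name);
  return 0;
}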