1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Recursion and progress bar tweaks.

Published in <sxsd727cvc0.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2001-11-25 13:03:30 -08:00
parent df05e7ff10
commit 3afb9c659a
6 changed files with 78 additions and 53 deletions

View File

@ -1,3 +1,15 @@
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
* recur.c (descend_url_p): Be more conservative with blacklisting
URLs.
(convert_all_links): Print how many files have been converted, and
how long it took.
* progress.c (create_image): Place the number of downloaded bytes
right after the progress bar.
* utils.c (suffix): Return a pointer into the string.
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com> 2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Handle CO_NULLIFY_BASE. * url.c (convert_links): Handle CO_NULLIFY_BASE.

View File

@ -1453,7 +1453,6 @@ File `%s' already there, will not retrieve.\n"), *hstat.local_file);
&& (!strcmp (suf, "html") || !strcmp (suf, "htm"))) && (!strcmp (suf, "html") || !strcmp (suf, "htm")))
*dt |= TEXTHTML; *dt |= TEXTHTML;
FREE_MAYBE (suf);
FREE_MAYBE (dummy); FREE_MAYBE (dummy);
return RETROK; return RETROK;
} }

View File

@ -477,24 +477,24 @@ create_image (struct bar_progress *bp, long dltime)
long size = bp->initial_length + bp->count; long size = bp->initial_length + bp->count;
/* The progress bar should look like this: /* The progress bar should look like this:
xx% [=======> ] xx KB/s nnnnn ETA 00:00 xx% [=======> ] nn.nnn rrK/s ETA 00:00
Calculate its geometry: Calculate its geometry:
"xx% " or "100%" - percentage - 4 chars exactly "xx% " or "100%" - percentage - 4 chars exactly
"[]" - progress bar decorations - 2 chars exactly "[]" - progress bar decorations - 2 chars exactly
"1012.56K/s " - dl rate - 11 chars exactly " n,nnn,nnn,nnn" - downloaded bytes - 14 or less chars
"n,nnn,nnn,nnn " - downloaded bytes - 14 or less chars " 1012.56K/s" - dl rate - 11 chars exactly
"ETA xx:xx:xx" - ETA - 12 or less chars " ETA xx:xx:xx" - ETA - 13 or less chars
"=====>..." - progress bar content - the rest "=====>..." - progress bar content - the rest
*/ */
int progress_size = screen_width - (4 + 2 + 11 + 14 + 12); int progress_size = screen_width - (4 + 2 + 14 + 11 + 13);
if (progress_size < 5) if (progress_size < 5)
progress_size = 0; progress_size = 0;
/* "xxx%" */ /* "xx% " */
if (bp->total_length > 0) if (bp->total_length > 0)
{ {
int percentage = (int)(100.0 * size / bp->total_length); int percentage = (int)(100.0 * size / bp->total_length);
@ -509,12 +509,13 @@ create_image (struct bar_progress *bp, long dltime)
} }
else else
{ {
int i = 5; *p++ = ' ';
while (i--) *p++ = ' ';
*p++ = ' '; *p++ = ' ';
*p++ = ' ';
} }
/* The progress bar: "|====> |" */ /* The progress bar: "[====> ]" */
if (progress_size && bp->total_length > 0) if (progress_size && bp->total_length > 0)
{ {
double fraction = (double)size / bp->total_length; double fraction = (double)size / bp->total_length;
@ -566,30 +567,30 @@ create_image (struct bar_progress *bp, long dltime)
++bp->tick; ++bp->tick;
} }
/* "1012.45K/s " */ /* " 1,234,567" */
/* If there are 7 or less digits (9 because of "legible" comas),
print the number in constant space. This will prevent the rest
of the line jerking at the beginning of download, but without
assigning maximum width in all cases. */
sprintf (p, " %9s", legible (size));
p += strlen (p);
/* " 1012.45K/s" */
if (dltime && bp->count) if (dltime && bp->count)
{ {
static char *short_units[] = { "B/s", "K/s", "M/s", "G/s" }; static char *short_units[] = { "B/s", "K/s", "M/s", "G/s" };
int units = 0; int units = 0;
double dlrate = calc_rate (bp->count, dltime, &units); double dlrate = calc_rate (bp->count, dltime, &units);
sprintf (p, "%7.2f%s ", dlrate, short_units[units]); sprintf (p, " %7.2f%s", dlrate, short_units[units]);
p += strlen (p); p += strlen (p);
} }
else else
{ {
strcpy (p, " --.-- K/s "); strcpy (p, " --.--K/s");
p += 12; p += 11;
} }
/* "1,234,567 " */ /* " ETA xx:xx:xx" */
/* If there are 7 or less digits (9 because of "legible" comas),
print the number in constant space. This will prevent the "ETA"
string from jerking as the data begins to arrive. */
sprintf (p, "%9s", legible (size));
p += strlen (p);
*p++ = ' ';
/* "ETA xx:xx:xx" */
if (bp->total_length > 0 && bp->count > 0) if (bp->total_length > 0 && bp->count > 0)
{ {
int eta, eta_hrs, eta_min, eta_sec; int eta, eta_hrs, eta_min, eta_sec;
@ -605,6 +606,7 @@ create_image (struct bar_progress *bp, long dltime)
/*printf ("\neta: %d, %d %d %d\n", eta, eta_hrs, eta_min, eta_sec);*/ /*printf ("\neta: %d, %d %d %d\n", eta, eta_hrs, eta_min, eta_sec);*/
/*printf ("\n%ld %f %ld %ld\n", dltime, tm_sofar, bytes_remaining, bp->count);*/ /*printf ("\n%ld %f %ld %ld\n", dltime, tm_sofar, bytes_remaining, bp->count);*/
*p++ = ' ';
*p++ = 'E'; *p++ = 'E';
*p++ = 'T'; *p++ = 'T';
*p++ = 'A'; *p++ = 'A';
@ -621,8 +623,8 @@ create_image (struct bar_progress *bp, long dltime)
} }
else if (bp->total_length > 0) else if (bp->total_length > 0)
{ {
strcpy (p, "ETA --:--"); strcpy (p, " ETA --:--");
p += 9; p += 10;
} }
assert (p - bp->buffer <= screen_width); assert (p - bp->buffer <= screen_width);

View File

@ -149,7 +149,7 @@ url_dequeue (struct url_queue *queue,
xfree (qel); xfree (qel);
return 1; return 1;
} }
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int, static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
struct url *, struct hash_table *)); struct url *, struct hash_table *));
@ -182,7 +182,8 @@ retrieve_tree (const char *start_url)
/* The queue of URLs we need to load. */ /* The queue of URLs we need to load. */
struct url_queue *queue = url_queue_new (); struct url_queue *queue = url_queue_new ();
/* The URLs we decided we don't want to load. */ /* The URLs we do not wish to enqueue, because they are already in
the queue, but haven't been downloaded yet. */
struct hash_table *blacklist = make_string_hash_table (0); struct hash_table *blacklist = make_string_hash_table (0);
/* We'll need various components of this, so better get it over with /* We'll need various components of this, so better get it over with
@ -242,9 +243,6 @@ retrieve_tree (const char *start_url)
tree. The recursion is partial in that we won't tree. The recursion is partial in that we won't
traverse any <A> or <AREA> tags, nor any <LINK> tags traverse any <A> or <AREA> tags, nor any <LINK> tags
except for <LINK REL="stylesheet">. */ except for <LINK REL="stylesheet">. */
/* #### This would be the place to implement the TODO
entry saying that -p should do two more hops on
framesets. */
dash_p_leaf_HTML = TRUE; dash_p_leaf_HTML = TRUE;
else else
{ {
@ -348,7 +346,11 @@ retrieve_tree (const char *start_url)
/* Based on the context provided by retrieve_tree, decide whether a /* Based on the context provided by retrieve_tree, decide whether a
URL is to be descended to. This is only ever called from URL is to be descended to. This is only ever called from
retrieve_tree, but is in a separate function for clarity. */ retrieve_tree, but is in a separate function for clarity.
The most expensive checks (such as those for robots) are memoized
by storing these URLs to BLACKLIST. This may or may not help. It
will help if those URLs are encountered many times. */
static int static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth, descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
@ -391,7 +393,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
&& !(u->scheme == SCHEME_FTP && opt.follow_ftp)) && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{ {
DEBUGP (("Not following non-HTTP schemes.\n")); DEBUGP (("Not following non-HTTP schemes.\n"));
goto blacklist; goto out;
} }
/* 2. If it is an absolute link and they are not followed, throw it /* 2. If it is an absolute link and they are not followed, throw it
@ -400,7 +402,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.relative_only && !upos->link_relative_p) if (opt.relative_only && !upos->link_relative_p)
{ {
DEBUGP (("It doesn't really look like a relative link.\n")); DEBUGP (("It doesn't really look like a relative link.\n"));
goto blacklist; goto out;
} }
/* 3. If its domain is not to be accepted/looked-up, chuck it /* 3. If its domain is not to be accepted/looked-up, chuck it
@ -408,7 +410,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accept_domain (u)) if (!accept_domain (u))
{ {
DEBUGP (("The domain was not accepted.\n")); DEBUGP (("The domain was not accepted.\n"));
goto blacklist; goto out;
} }
/* 4. Check for parent directory. /* 4. Check for parent directory.
@ -423,7 +425,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (!frontcmp (parent->dir, u->dir)) if (!frontcmp (parent->dir, u->dir))
{ {
DEBUGP (("Trying to escape the root directory with no_parent in effect.\n")); DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
goto blacklist; goto out;
} }
} }
@ -435,13 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accdir (u->dir, ALLABS)) if (!accdir (u->dir, ALLABS))
{ {
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
goto blacklist; goto out;
} }
} }
/* 6. */ /* 6. */
{ {
char *suf = NULL; char *suf;
/* Check for acceptance/rejection rules. We ignore these rules /* Check for acceptance/rejection rules. We ignore these rules
for HTML documents because they might lead to other files which for HTML documents because they might lead to other files which
need to be downloaded. Of course, we don't know which need to be downloaded. Of course, we don't know which
@ -466,11 +468,9 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
{ {
DEBUGP (("%s (%s) does not match acc/rej rules.\n", DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file)); url, u->file));
FREE_MAYBE (suf); goto out;
goto blacklist;
} }
} }
FREE_MAYBE (suf);
} }
/* 7. */ /* 7. */
@ -479,7 +479,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
{ {
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
u->host, parent->host)); u->host, parent->host));
goto blacklist; goto out;
} }
/* 8. */ /* 8. */
@ -509,7 +509,8 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (!res_match_path (specs, u->path)) if (!res_match_path (specs, u->path))
{ {
DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
goto blacklist; string_set_add (blacklist, url);
goto out;
} }
} }
@ -519,9 +520,6 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
return 1; return 1;
blacklist:
string_set_add (blacklist, url);
out: out:
DEBUGP (("Decided NOT to load it.\n")); DEBUGP (("Decided NOT to load it.\n"));
@ -604,6 +602,11 @@ void
convert_all_links (void) convert_all_links (void)
{ {
slist *html; slist *html;
struct wget_timer *timer;
long msecs;
int file_count = 0;
timer = wtimer_new ();
/* Destructively reverse downloaded_html_files to get it in the right order. /* Destructively reverse downloaded_html_files to get it in the right order.
recursive_retrieve() used slist_prepend() consistently. */ recursive_retrieve() used slist_prepend() consistently. */
@ -675,11 +678,19 @@ convert_all_links (void)
cur_url->local_name = NULL; cur_url->local_name = NULL;
} }
} }
/* Convert the links in the file. */ /* Convert the links in the file. */
convert_links (html->string, urls); convert_links (html->string, urls);
++file_count;
/* Free the data. */ /* Free the data. */
free_urlpos (urls); free_urlpos (urls);
} }
msecs = wtimer_elapsed (timer);
wtimer_delete (timer);
logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
file_count, (double)msecs / 1000);
} }
/* Cleanup the data structures associated with recursive retrieving /* Cleanup the data structures associated with recursive retrieving

View File

@ -336,7 +336,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
char *suf = suffix (u->local); char *suf = suffix (u->local);
if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm"))) if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
*dt |= TEXTHTML; *dt |= TEXTHTML;
FREE_MAYBE (suf);
} }
#endif #endif
} }

View File

@ -904,7 +904,7 @@ in_acclist (const char *const *accepts, const char *s, int backward)
return 0; return 0;
} }
/* Return the malloc-ed suffix of STR. For instance: /* Return the location of STR's suffix (file extension). Examples:
suffix ("foo.bar") -> "bar" suffix ("foo.bar") -> "bar"
suffix ("foo.bar.baz") -> "baz" suffix ("foo.bar.baz") -> "baz"
suffix ("/foo/bar") -> NULL suffix ("/foo/bar") -> NULL
@ -914,9 +914,11 @@ suffix (const char *str)
{ {
int i; int i;
for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--); for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
;
if (str[i++] == '.') if (str[i++] == '.')
return xstrdup (str + i); return (char *)str + i;
else else
return NULL; return NULL;
} }