[svn] Recursion and progress bar tweaks.

Published in <sxsd727cvc0.fsf@florida.arsdigita.de>.
Author: hniksic
Date:   2001-11-25 13:03:30 -08:00
parent df05e7ff10
commit 3afb9c659a
6 changed files with 78 additions and 53 deletions

src/ChangeLog

@@ -1,3 +1,15 @@
+2001-11-25  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* recur.c (descend_url_p): Be more conservative with blacklisting
+	URLs.
+	(convert_all_links): Print how many files have been converted, and
+	how long it took.
+
+	* progress.c (create_image): Place the number of downloaded bytes
+	right after the progress bar.
+
+	* utils.c (suffix): Return a pointer into the string.
+
 2001-11-25  Hrvoje Niksic  <hniksic@arsdigita.com>
 
 	* url.c (convert_links): Handle CO_NULLIFY_BASE.

src/http.c

@@ -1453,7 +1453,6 @@ File `%s' already there, will not retrieve.\n"), *hstat.local_file);
           && (!strcmp (suf, "html") || !strcmp (suf, "htm")))
         *dt |= TEXTHTML;
-      FREE_MAYBE (suf);
       FREE_MAYBE (dummy);
       return RETROK;
     }

src/progress.c

@@ -477,24 +477,24 @@ create_image (struct bar_progress *bp, long dltime)
   long size = bp->initial_length + bp->count;
 
   /* The progress bar should look like this:
-     xx% [=======>             ] xx KB/s nnnnn ETA 00:00
+     xx% [=======>             ] nn.nnn rrK/s ETA 00:00
 
      Calculate its geometry:
-     "xx% " or "100%"  - percentage               - 4 chars exactly
-     "[]"              - progress bar decorations - 2 chars exactly
-     "1012.56K/s "     - dl rate                  - 11 chars exactly
-     "n,nnn,nnn,nnn "  - downloaded bytes         - 14 or less chars
-     "ETA xx:xx:xx"    - ETA                      - 12 or less chars
+     "xx% " or "100%"  - percentage               - 4 chars exactly
+     "[]"              - progress bar decorations - 2 chars exactly
+     " n,nnn,nnn,nnn"  - downloaded bytes         - 14 or less chars
+     " 1012.56K/s"     - dl rate                  - 11 chars exactly
+     " ETA xx:xx:xx"   - ETA                      - 13 or less chars
 
-     "=====>..."       - progress bar content     - the rest
+     "=====>..."       - progress bar content     - the rest
   */
-  int progress_size = screen_width - (4 + 2 + 11 + 14 + 12);
+  int progress_size = screen_width - (4 + 2 + 14 + 11 + 13);
 
   if (progress_size < 5)
     progress_size = 0;
 
-  /* "xxx%" */
+  /* "xx% " */
   if (bp->total_length > 0)
     {
       int percentage = (int)(100.0 * size / bp->total_length);
@@ -509,12 +509,13 @@ create_image (struct bar_progress *bp, long dltime)
     }
   else
     {
-      int i = 5;
-      while (i--)
-        *p++ = ' ';
+      *p++ = ' ';
+      *p++ = ' ';
+      *p++ = ' ';
+      *p++ = ' ';
     }
 
-  /* The progress bar: "|====>      |" */
+  /* The progress bar: "[====>      ]" */
   if (progress_size && bp->total_length > 0)
     {
       double fraction = (double)size / bp->total_length;
@@ -566,30 +567,30 @@ create_image (struct bar_progress *bp, long dltime)
       ++bp->tick;
     }
 
-  /* "1012.45K/s " */
+  /* " 1,234,567" */
+  /* If there are 7 or less digits (9 because of "legible" comas),
+     print the number in constant space.  This will prevent the rest
+     of the line jerking at the beginning of download, but without
+     assigning maximum width in all cases.  */
+  sprintf (p, " %9s", legible (size));
+  p += strlen (p);
+
+  /* " 1012.45K/s" */
   if (dltime && bp->count)
     {
      static char *short_units[] = { "B/s", "K/s", "M/s", "G/s" };
      int units = 0;
      double dlrate = calc_rate (bp->count, dltime, &units);
-      sprintf (p, "%7.2f%s ", dlrate, short_units[units]);
+      sprintf (p, " %7.2f%s", dlrate, short_units[units]);
      p += strlen (p);
     }
   else
     {
-      strcpy (p, "  --.-- K/s ");
-      p += 12;
+      strcpy (p, "   --.--K/s");
+      p += 11;
     }
 
-  /* "1,234,567 " */
-  /* If there are 7 or less digits (9 because of "legible" comas),
-     print the number in constant space.  This will prevent the "ETA"
-     string from jerking as the data begins to arrive.  */
-  sprintf (p, "%9s", legible (size));
-  p += strlen (p);
-  *p++ = ' ';
 
-  /* "ETA xx:xx:xx" */
+  /* " ETA xx:xx:xx" */
   if (bp->total_length > 0 && bp->count > 0)
     {
       int eta, eta_hrs, eta_min, eta_sec;
@@ -605,6 +606,7 @@ create_image (struct bar_progress *bp, long dltime)
       /*printf ("\neta: %d, %d %d %d\n", eta, eta_hrs, eta_min, eta_sec);*/
       /*printf ("\n%ld %f %ld %ld\n", dltime, tm_sofar, bytes_remaining, bp->count);*/
 
+      *p++ = ' ';
       *p++ = 'E';
       *p++ = 'T';
       *p++ = 'A';
@@ -621,8 +623,8 @@ create_image (struct bar_progress *bp, long dltime)
     }
   else if (bp->total_length > 0)
     {
-      strcpy (p, "ETA --:--");
-      p += 9;
+      strcpy (p, " ETA --:--");
+      p += 10;
     }
 
   assert (p - bp->buffer <= screen_width);
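
The geometry comment above pins every field to a fixed width, so only the bar stretches with the screen and nothing to the right of a growing number ever moves. A minimal standalone sketch of that idea follows; the widths come from the comment, while render_line and its arguments are simplified stand-ins rather than wget's actual create_image:

#include <stdio.h>

/* Compose one progress line from fixed-width fields so that a growing
   byte count or rate never shifts anything to its right.  Widths
   mirror the comment in the diff: 4 (percent) + 2 (brackets)
   + 14 (bytes) + 11 (rate) + 13 (ETA); the bar takes the rest. */
static void
render_line (char *buf, int screen_width, int percent, long bytes,
             double rate_kbs, int eta_hrs, int eta_min, int eta_sec)
{
  int bar_size = screen_width - (4 + 2 + 14 + 11 + 13);
  int filled = bar_size * percent / 100;
  char *p = buf;
  int i;

  p += sprintf (p, "%3d%%", percent);       /* "xx% " or "100%": 4 cols */
  *p++ = '[';
  for (i = 0; i < bar_size; i++)            /* "=====>...": the rest */
    *p++ = (i < filled) ? '=' : ' ';
  *p++ = ']';
  p += sprintf (p, " %13ld", bytes);        /* " n,nnn,nnn,nnn": 14 cols */
  p += sprintf (p, " %7.2fK/s", rate_kbs);  /* " 1012.45K/s": 11 cols */
  sprintf (p, " ETA %02d:%02d:%02d",        /* " ETA xx:xx:xx": 13 cols */
           eta_hrs, eta_min, eta_sec);
}

int
main (void)
{
  char line[128];

  render_line (line, 60, 42, 123456L, 87.50, 0, 1, 23);
  puts (line);
  render_line (line, 60, 43, 1234567L, 88.10, 0, 1, 20);
  puts (line);    /* same length as before: no field has jerked */
  return 0;
}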

src/recur.c

@@ -149,7 +149,7 @@ url_dequeue (struct url_queue *queue,
   xfree (qel);
   return 1;
 }
 
 static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                   struct url *, struct hash_table *));
@@ -182,7 +182,8 @@ retrieve_tree (const char *start_url)
   /* The queue of URLs we need to load. */
   struct url_queue *queue = url_queue_new ();
 
-  /* The URLs we decided we don't want to load. */
+  /* The URLs we do not wish to enqueue, because they are already in
+     the queue, but haven't been downloaded yet.  */
   struct hash_table *blacklist = make_string_hash_table (0);
 
   /* We'll need various components of this, so better get it over with
@@ -242,9 +243,6 @@ retrieve_tree (const char *start_url)
                tree.  The recursion is partial in that we won't
                traverse any <A> or <AREA> tags, nor any <LINK> tags
                except for <LINK REL="stylesheet">. */
-            /* #### This would be the place to implement the TODO
-               entry saying that -p should do two more hops on
-               framesets. */
             dash_p_leaf_HTML = TRUE;
           else
             {
@@ -348,7 +346,11 @@ retrieve_tree (const char *start_url)
 /* Based on the context provided by retrieve_tree, decide whether a
    URL is to be descended to.  This is only ever called from
-   retrieve_tree, but is in a separate function for clarity. */
+   retrieve_tree, but is in a separate function for clarity.
+
+   The most expensive checks (such as those for robots) are memoized
+   by storing these URLs to BLACKLIST.  This may or may not help.  It
+   will help if those URLs are encountered many times.  */
 
 static int
 descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
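
The new comment describes a memoization: run the expensive test once, record the URL in BLACKLIST, and let repeat encounters short-circuit. A self-contained toy version of the pattern follows; a fixed array stands in for the string hash table that make_string_hash_table provides, expensive_check stands in for the robots.txt lookup, and none of it is wget's actual code:

#include <stdio.h>
#include <string.h>

/* Toy "blacklist": a fixed array of pointers standing in for wget's
   string hash table.  Stored strings must outlive the set (the real
   code duplicates them into the table). */
#define SET_MAX 64
static const char *blacklist[SET_MAX];
static int blacklist_len;

static int
set_contains (const char *s)
{
  int i;
  for (i = 0; i < blacklist_len; i++)
    if (!strcmp (blacklist[i], s))
      return 1;
  return 0;
}

static void
set_add (const char *s)
{
  if (blacklist_len < SET_MAX)
    blacklist[blacklist_len++] = s;
}

/* Stand-in for an expensive test such as consulting robots.txt. */
static int
expensive_check (const char *url)
{
  printf ("  (expensive check on %s)\n", url);
  return strstr (url, "/private/") == NULL;
}

/* Decide whether to descend to URL.  Cheap rejections fall through
   without being recorded; only the costly "no" is memoized, so a
   repeat encounter skips the expensive work -- the same policy the
   rewritten descend_url_p applies to its robots check. */
static int
descend_p (const char *url)
{
  if (set_contains (url))
    return 0;                   /* memoized rejection */
  if (!expensive_check (url))
    {
      set_add (url);            /* remember only the costly "no" */
      return 0;
    }
  return 1;
}

int
main (void)
{
  static const char *urls[] = { "http://host/a.html",
                                "http://host/private/b.html",
                                "http://host/private/b.html" };
  int i;
  for (i = 0; i < 3; i++)
    printf ("%s -> %s\n", urls[i], descend_p (urls[i]) ? "descend" : "skip");
  return 0;
}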
@@ -391,7 +393,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
     {
       DEBUGP (("Not following non-HTTP schemes.\n"));
-      goto blacklist;
+      goto out;
     }
 
   /* 2. If it is an absolute link and they are not followed, throw it
@@ -400,7 +402,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (opt.relative_only && !upos->link_relative_p)
     {
       DEBUGP (("It doesn't really look like a relative link.\n"));
-      goto blacklist;
+      goto out;
     }
 
   /* 3. If its domain is not to be accepted/looked-up, chuck it
@@ -408,7 +410,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (!accept_domain (u))
     {
       DEBUGP (("The domain was not accepted.\n"));
-      goto blacklist;
+      goto out;
     }
 
   /* 4. Check for parent directory.
@@ -423,7 +425,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!frontcmp (parent->dir, u->dir))
         {
           DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
-          goto blacklist;
+          goto out;
         }
     }
 
@@ -435,13 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!accdir (u->dir, ALLABS))
         {
           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
-          goto blacklist;
+          goto out;
         }
     }
 
   /* 6. */
   {
-    char *suf = NULL;
+    char *suf;
 
     /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
@@ -466,11 +468,9 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
-           FREE_MAYBE (suf);
-           goto blacklist;
+           goto out;
          }
       }
-    FREE_MAYBE (suf);
   }
 
   /* 7. */
@@ -479,7 +479,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
     {
       DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                u->host, parent->host));
-      goto blacklist;
+      goto out;
     }
 
   /* 8. */
@@ -509,7 +509,8 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!res_match_path (specs, u->path))
         {
           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
-          goto blacklist;
+          string_set_add (blacklist, url);
+          goto out;
         }
     }
 
@@ -519,9 +520,6 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   return 1;
 
- blacklist:
-  string_set_add (blacklist, url);
-
  out:
   DEBUGP (("Decided NOT to load it.\n"));
@@ -604,6 +602,11 @@ void
 convert_all_links (void)
 {
   slist *html;
+  struct wget_timer *timer;
+  long msecs;
+  int file_count = 0;
+
+  timer = wtimer_new ();
 
   /* Destructively reverse downloaded_html_files to get it in the right order.
      recursive_retrieve() used slist_prepend() consistently. */
@@ -675,11 +678,19 @@ convert_all_links (void)
               cur_url->local_name = NULL;
             }
         }
 
       /* Convert the links in the file. */
       convert_links (html->string, urls);
+      ++file_count;
 
       /* Free the data. */
       free_urlpos (urls);
     }
+
+  msecs = wtimer_elapsed (timer);
+  wtimer_delete (timer);
+
+  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
+             file_count, (double)msecs / 1000);
 }
 
 /* Cleanup the data structures associated with recursive retrieving
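
The timing added here uses wget's wtimer_new / wtimer_elapsed / wtimer_delete helpers, which report elapsed milliseconds. A portable approximation of the same measure-count-report pattern, with standard clock() (CPU time rather than wall time) standing in for the wget timer:

#include <stdio.h>
#include <time.h>

int
main (void)
{
  clock_t start = clock ();     /* wget: timer = wtimer_new (); */
  int file_count = 0;
  int i;
  double secs;

  for (i = 0; i < 5; i++)
    {
      /* ... convert_links (file, urls) would run here ... */
      ++file_count;
    }

  /* wget: msecs = wtimer_elapsed (timer); wtimer_delete (timer); */
  secs = (double) (clock () - start) / CLOCKS_PER_SEC;
  printf ("Converted %d files in %.2f seconds.\n", file_count, secs);
  return 0;
}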

src/retr.c

@@ -336,7 +336,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       char *suf = suffix (u->local);
       if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
         *dt |= TEXTHTML;
-      FREE_MAYBE (suf);
     }
 #endif
 }

src/utils.c

@@ -904,7 +904,7 @@ in_acclist (const char *const *accepts, const char *s, int backward)
   return 0;
 }
 
-/* Return the malloc-ed suffix of STR.  For instance:
+/* Return the location of STR's suffix (file extension).  Examples:
    suffix ("foo.bar")       -> "bar"
    suffix ("foo.bar.baz")   -> "baz"
    suffix ("/foo/bar")      -> NULL
@@ -914,9 +914,11 @@ suffix (const char *str)
 {
   int i;
 
-  for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--);
+  for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
+    ;
+
   if (str[i++] == '.')
-    return xstrdup (str + i);
+    return (char *)str + i;
   else
     return NULL;
 }
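
suffix() now returns a pointer into its argument rather than an xstrdup'ed copy, which is why the FREE_MAYBE (suf) calls disappear from http.c and retr.c above: the result aliases the caller's string, must not be freed, and stays valid only as long as that string does. A small usage sketch (main and the sample file name are illustrative; the strcasecmp test mirrors the retr.c caller):

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* suffix() as rewritten above: scan backward to the last '.' or '/';
   a '.' means an extension was found, a '/' (or running off the
   front) means there is none. */
static char *
suffix (const char *str)
{
  int i;

  for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
    ;

  if (str[i++] == '.')
    return (char *) str + i;    /* points into STR; do not free */
  else
    return NULL;
}

int
main (void)
{
  const char *name = "index.html";
  char *suf = suffix (name);

  /* The old callers needed FREE_MAYBE (suf); now the pointer merely
     aliases NAME and no cleanup is required. */
  if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
    printf ("%s looks like HTML\n", name);
  return 0;
}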