mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Be careful whether we want to descend into results of redirection.

Published in <sxs7kse8hmq.fsf@florida.arsdigita.de>.
hniksic 2001-11-25 17:11:48 -08:00
parent 2c41d783c6
commit f6921edc73
2 changed files with 75 additions and 29 deletions
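
For readers skimming the diff below: the change makes the recursive retriever re-test a URL after retrieve_url follows a redirection, instead of blindly descending into whatever the redirect produced. The following is a minimal, self-contained sketch of that idea only, not wget code; accept_url, should_descend, and the same-host/depth rule are simplified stand-ins for wget's descend_url_p and its real acceptance criteria.

/* Sketch only (not wget source): after a redirected download, re-apply the
   descend test to the URL we actually ended up at.  accept_url() is a crude
   stand-in for wget's descend_url_p and its acceptance rules. */
#include <stdio.h>
#include <string.h>

#define MAX_DEPTH 5

/* Hypothetical rule: stay on the start host and within MAX_DEPTH. */
static int
accept_url (const char *url, const char *start_host, int depth)
{
  return depth <= MAX_DEPTH && strstr (url, start_host) != NULL;
}

/* Mirrors the new logic in retrieve_tree: descend only if the document is
   HTML *and*, when a redirection happened, the redirected URL also passes
   the acceptance test. */
static int
should_descend (const char *redirected, const char *start_host,
                int depth, int got_html)
{
  int descend = got_html;
  if (redirected && descend
      && !accept_url (redirected, start_host, depth))
    descend = 0;
  return descend;
}

int
main (void)
{
  /* Redirected off the start host: do not descend (prints 0). */
  printf ("%d\n", should_descend ("http://elsewhere.net/a.html",
                                  "example.org", 1, 1));
  /* No redirection: the HTML test alone decides (prints 1). */
  printf ("%d\n", should_descend (NULL, "example.org", 1, 1));
  return 0;
}

In the actual patch the test is descend_url_p applied to the parsed redirected URL, as shown in the recur.c hunks below.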

View File

@@ -1,3 +1,8 @@
+2001-11-26  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+        * recur.c (descend_redirect_p): New function.
+        (retrieve_tree): Make sure redirections are not blindly followed.
+
 2001-11-04  Alan Eldridge  <alane@geeksrus.net>
 
         * config.h.in: added HAVE_RANDOM.

View File

@@ -152,6 +152,9 @@ url_dequeue (struct url_queue *queue,
 static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                   struct url *, struct hash_table *));
+static int descend_redirect_p PARAMS ((const char *, const char *, int,
+                                       struct url *, struct hash_table *));
+
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
    be called "recursive retrieval", because the old function was
@@ -224,14 +227,25 @@ retrieve_tree (const char *start_url)
       status = retrieve_url (url, &file, &redirected, NULL, &dt);
       opt.recursive = oldrec;
 
-      if (redirected)
-        {
-          xfree (url);
-          url = redirected;
-        }
       if (file && status == RETROK
           && (dt & RETROKF) && (dt & TEXTHTML))
         descend = 1;
+
+      if (redirected)
+        {
+          /* We have been redirected, possibly to another host, or
+             different path, or wherever.  Check whether we really
+             want to follow it.  */
+          if (descend)
+            {
+              if (!descend_redirect_p (redirected, url, depth,
+                                       start_url_parsed, blacklist))
+                descend = 0;
+            }
+
+          xfree (url);
+          url = redirected;
+        }
     }
 
   if (descend
@@ -307,7 +321,8 @@ retrieve_tree (const char *start_url)
                      opt.delete_after ? "--delete-after" :
                      "recursive rejection criteria"));
           logprintf (LOG_VERBOSE,
-                     (opt.delete_after ? _("Removing %s.\n")
+                     (opt.delete_after
+                      ? _("Removing %s.\n")
                       : _("Removing %s since it should be rejected.\n")),
                      file);
           if (unlink (file))
@@ -525,6 +540,43 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   return 0;
 }
 
+/* This function determines whether we should descend the children of
+   the URL whose download resulted in a redirection, possibly to
+   another host, etc.  It is needed very rarely, and thus it is merely
+   a simple-minded wrapper around descend_url_p.  */
+static int
+descend_redirect_p (const char *redirected, const char *original, int depth,
+                    struct url *start_url_parsed, struct hash_table *blacklist)
+{
+  struct url *orig_parsed, *new_parsed;
+  struct urlpos *upos;
+  int success;
+
+  orig_parsed = url_parse (original, NULL);
+  assert (orig_parsed != NULL);
+
+  new_parsed = url_parse (redirected, NULL);
+  assert (new_parsed != NULL);
+
+  upos = xmalloc (sizeof (struct urlpos));
+  memset (upos, 0, sizeof (*upos));
+  upos->url = new_parsed;
+
+  success = descend_url_p (upos, orig_parsed, depth,
+                           start_url_parsed, blacklist);
+
+  url_free (orig_parsed);
+  url_free (new_parsed);
+  xfree (upos);
+
+  if (!success)
+    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
+
+  return success;
+}
+
 /* Register that URL has been successfully downloaded to FILE. */
@@ -572,32 +624,21 @@ register_html (const char *url, const char *file)
   downloaded_html_files = slist_prepend (downloaded_html_files, file);
 }
 
-/* convert_links() is called from recursive_retrieve() after we're
-   done with an HTML file.  This call to convert_links is not complete
-   because it converts only the downloaded files, and Wget cannot know
-   which files will be downloaded afterwards.  So, if we have file
-   fileone.html with:
-
-   <a href="/c/something.gif">
-
-   and /c/something.gif was not downloaded because it exceeded the
-   recursion depth, the reference will *not* be changed.
-
-   However, later we can encounter /c/something.gif from an "upper"
-   level HTML (let's call it filetwo.html), and it gets downloaded.
-
-   But now we have a problem because /c/something.gif will be
-   correctly transformed in filetwo.html, but not in fileone.html,
-   since Wget could not have known that /c/something.gif will be
-   downloaded in the future.
-
-   This is why Wget must, after the whole retrieval, call
-   convert_all_links to go once more through the entire list of
-   retrieved HTMLs, and re-convert them.
-
-   All the downloaded HTMLs are kept in downloaded_html_files, and
-   downloaded URLs in urls_downloaded.  From these two lists
-   information is extracted. */
+/* This function is called when the retrieval is done to convert the
+   links that have been downloaded.  It has to be called at the end of
+   the retrieval, because only then does Wget know conclusively which
+   URLs have been downloaded, and which not, so it can tell which
+   direction to convert to.
+
+   The "direction" means that the URLs to the files that have been
+   downloaded get converted to the relative URL which will point to
+   that file.  And the other URLs get converted to the remote URL on
+   the server.
+
+   All the downloaded HTMLs are kept in downloaded_html_files, and
+   downloaded URLs in urls_downloaded.  All the information is
+   extracted from these two lists. */
 void
 convert_all_links (void)
 {
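
The rewritten comment in the last hunk describes the two conversion "directions". As a rough illustration only, and not wget code, here is that decision in miniature; the downloaded[] table is a toy stand-in for wget's downloaded_html_files / urls_downloaded lists, and the example glosses over making the local reference relative to the referring file, which wget does for real.

/* Sketch only (not wget source): the conversion "direction" described
   above.  A link whose target was downloaded is rewritten to point at
   the local copy; every other link stays a remote URL. */
#include <stdio.h>
#include <string.h>

struct dl { const char *url; const char *local_file; };

static const struct dl downloaded[] = {
  { "http://example.org/index.html", "index.html" },
  { "http://example.org/c/pic.gif",  "c/pic.gif"  },
};

/* Return what a link to URL should become after conversion. */
static const char *
convert_link (const char *url)
{
  size_t i;
  for (i = 0; i < sizeof downloaded / sizeof downloaded[0]; i++)
    if (strcmp (downloaded[i].url, url) == 0)
      return downloaded[i].local_file;  /* downloaded: local file name */
  return url;                           /* not downloaded: remote URL */
}

int
main (void)
{
  printf ("%s\n", convert_link ("http://example.org/c/pic.gif"));    /* c/pic.gif */
  printf ("%s\n", convert_link ("http://example.org/missing.gif"));  /* unchanged */
  return 0;
}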