From 0dbef4ccb4f359c1bb9ad81120618d8ab436d9b0 Mon Sep 17 00:00:00 2001
From: mtortonesi
Date: Thu, 24 Aug 2006 08:27:57 -0700
Subject: [PATCH] [svn] Several fixes for recursive spider mode.

---
 src/ChangeLog   |  24 +++++++
 src/Makefile.in |  16 ++---
 src/convert.c   | 120 ---------------------------------
 src/convert.h   |   3 -
 src/http.c      |  73 ++++++++++++++------
 src/recur.c     |  13 ++++
 src/res.c       |   8 ++-
 src/spider.c    | 175 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/spider.h    |  37 ++++++++++
 9 files changed, 317 insertions(+), 152 deletions(-)
 create mode 100644 src/spider.c
 create mode 100644 src/spider.h

diff --git a/src/ChangeLog b/src/ChangeLog
index f21afddf..4bcb98cc 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,27 @@
+2006-08-24  Mauro Tortonesi
+
+	* Makefile.in: Added spider.c to the list of files to compile and
+	spider.h to the list of header files. Updated copyright information.
+
+	* http.c: Major changes to recursive spider mode. Now for every
+	resource we are supposed to check, we send a HEAD request to find out
+	if it exists. If the resource is a HTML file, we retrieve it and parse
+	it to discover links to other resources.
+
+	* recur.c: Ditto.
+
+	* res.c (res_retrieve_file): Disable opt.timestamping and opt.spider
+	when retrieving robots.txt. Updated copyright information.
+
+	* convert.c: Moved code tracking broken links to spider.c.
+
+	* convert.h: Ditto.
+
+	* spider.c: Created new file to keep track of visited URLs in spider
+	mode.
+
+	* spider.h: Ditto.
+
 2006-08-21  Mauro Tortonesi
 
 	* http.c: Fixed timestamping-related bug.
diff --git a/src/Makefile.in b/src/Makefile.in
index e0313220..bcacd7dd 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -1,5 +1,5 @@
 # Makefile for `wget' utility
-# Copyright (C) 1995-2005 Free Software Foundation, Inc.
+# Copyright (C) 1995-2006 Free Software Foundation, Inc.
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -76,8 +76,8 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \
 	ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
 	host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \
 	log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
-	res.o retr.o safe-ctype.o snprintf.o $(SSL_OBJ) url.o \
-	utils.o version.o xmalloc.o
+	res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \
+	url.o utils.o version.o xmalloc.o
 
 .SUFFIXES:
 .SUFFIXES: .c .o
@@ -96,11 +96,11 @@ wget$(exeext): $(OBJ)
 # time, and it's a lot safer than attempting to get all the
 # dependencies right.
 
-$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
-	gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
-	http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
-	progress.h ptimer.h recur.h res.h retr.h safe-ctype.h ssl.h \
-	sysdep.h url.h utils.h wget.h xmalloc.h
+$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
+	gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
+	http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
+	progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \
+	spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
 
 #
 # Dependencies for installing
diff --git a/src/convert.c b/src/convert.c
index 7def7c89..a59e3c51 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -54,8 +54,6 @@ struct hash_table *dl_url_file_map;
    conversion after Wget is done.  */
 struct hash_table *downloaded_html_set;
 
-static struct hash_table *nonexisting_urls_hash;
-
 static void convert_links (const char *, struct urlpos *);
 
 /* This function is called when the retrieval is done to convert the
@@ -835,7 +833,6 @@ register_html (const char *url, const char *file)
 }
 
 static void downloaded_files_free (void);
-static void nonexisting_urls_free (void);
 
 /* Cleanup the data structures associated with this file.  */
 
@@ -857,7 +854,6 @@ convert_cleanup (void)
   if (downloaded_html_set)
     string_set_free (downloaded_html_set);
   downloaded_files_free ();
-  nonexisting_urls_free ();
   if (converted_files)
     string_set_free (converted_files);
 }
@@ -957,122 +953,6 @@ downloaded_files_free (void)
       downloaded_files_hash = NULL;
     }
 }
-
-/* Remembers broken links.  */
-
-struct broken_urls_list
-{
-  char *url;
-  struct broken_urls_list *next;
-};
-
-static bool
-in_list (const struct broken_urls_list *list, const char *url)
-{
-  const struct broken_urls_list *ptr;
-
-  for (ptr = list; ptr; ptr = ptr->next)
-    {
-      /* str[case]cmp is inadequate for URL comparison */
-      if (are_urls_equal (url, ptr->url) == 0) return true;
-    }
-
-  return false;
-}
-
-void
-nonexisting_url (const char *url, const char *referrer)
-{
-  struct broken_urls_list *list;
-
-  /* Ignore robots.txt URLs */
-  if (is_robots_txt_url (url))
-    return;
-
-  if (!nonexisting_urls_hash)
-    nonexisting_urls_hash = make_string_hash_table (0);
-
-  list = hash_table_get (nonexisting_urls_hash, url);
-  if (!list)
-    {
-      list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
-      list->url = referrer ? xstrdup (referrer) : NULL;
-      hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
-    }
-  else if (list && !in_list (list, referrer))
-    {
-      /* Append referrer at the end of the list */
-      struct broken_urls_list *newnode;
-
-      while (list->next) list = list->next;
-
-      newnode = xnew0 (struct broken_urls_list);
-      newnode->url = xstrdup (referrer);
-      list->next = newnode;
-    }
-}
-
-static void
-nonexisting_urls_free (void)
-{
-  if (nonexisting_urls_hash)
-    {
-      hash_table_iterator iter;
-      for (hash_table_iterate (nonexisting_urls_hash, &iter);
-           hash_table_iter_next (&iter);
-           )
-        {
-          xfree (iter.key);
-          xfree (iter.value);
-        }
-      hash_table_destroy (nonexisting_urls_hash);
-      nonexisting_urls_hash = NULL;
-    }
-}
-
-void
-print_broken_links (void)
-{
-  hash_table_iterator iter;
-  int num_elems;
-
-  if (!nonexisting_urls_hash)
-    {
-      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
-      return;
-    }
-
-  num_elems = hash_table_count (nonexisting_urls_hash);
-  assert (num_elems > 0);
-
-  if (num_elems > 1)
-    {
-      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
-                 num_elems);
-    }
-  else
-    {
-      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
-    }
-
-  for (hash_table_iterate (nonexisting_urls_hash, &iter);
-       hash_table_iter_next (&iter);
-       )
-    {
-      struct broken_urls_list *list;
-
-      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
-
-      for (list = (struct broken_urls_list *) iter.value;
-           list;
-           list = list->next)
-        {
-          logprintf (LOG_NOTQUIET, _("  %s\n"), list->url);
-        }
-    }
-  logputs (LOG_NOTQUIET, "\n");
-}
-
 
 /* The function returns the pointer to the malloc-ed quoted version of
    string s.  It will recognize and quote numeric and special graphic
diff --git a/src/convert.h b/src/convert.h
index 6104b393..11d6a5f1 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -104,7 +104,4 @@ void convert_cleanup (void);
 
 char *html_quote_string (const char *);
 
-void nonexisting_url (const char *, const char *);
-void print_broken_links (void);
-
 #endif /* CONVERT_H */
diff --git a/src/http.c b/src/http.c
index 5da467cf..a6fb4d1d 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2281,6 +2281,10 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
       /* Get the current time string.  */
       tms = time_str (time (NULL));
 
+      if (opt.spider && !got_head)
+        logprintf (LOG_VERBOSE, _("\
+Spider mode enabled. Check if remote file exists.\n"));
+
       /* Print fetch message, if opt.verbose.  */
       if (opt.verbose)
         {
@@ -2308,8 +2312,7 @@
       /* Default document type is empty.  However, if spider mode is
          on or time-stamping is employed, HEAD_ONLY commands is
          encoded within *dt.  */
-      if ((opt.spider && !opt.recursive)
-          || (opt.timestamping && !got_head)
+      if (((opt.spider || opt.timestamping) && !got_head)
           || (opt.always_rest && !got_name))
         *dt |= HEAD_ONLY;
       else
@@ -2412,13 +2415,22 @@
               hurl = url_string (u, true);
               logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
             }
-          if (opt.spider && opt.recursive)
+          /* Maybe we should always keep track of broken links, not just in
+           * spider mode.  */
+          if (opt.spider)
             {
-              if (!hurl) hurl = url_string (u, true);
-              nonexisting_url (hurl, referer);
+              /* #### Again: ugly ugly ugly! */
+              if (!hurl)
+                hurl = url_string (u, true);
+              nonexisting_url (hurl);
+              logprintf (LOG_NOTQUIET, _("\
+Remote file does not exist -- broken link!!!\n"));
+            }
+          else
+            {
+              logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
+                         tms, hstat.statcode, escnonprint (hstat.error));
             }
-          logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
-                     tms, hstat.statcode, escnonprint (hstat.error));
           logputs (LOG_VERBOSE, "\n");
           ret = WRONGCODE;
           xfree_null (hurl);
@@ -2447,10 +2459,12 @@ Last-modified header invalid -- time-stamp ignored.\n"));
       /* The time-stamping section.  */
       if (opt.timestamping)
         {
-          if (hstat.orig_file_name) /* Perform this check only if the file we're
-                                       supposed to download already exists. */
+          if (hstat.orig_file_name) /* Perform the following checks only
+                                       if the file we're supposed to
+                                       download already exists. */
             {
-              if (hstat.remote_time && tmr != (time_t) (-1))
+              if (hstat.remote_time &&
+                  tmr != (time_t) (-1))
                 {
                   /* Now time-stamping can be used validly.  Time-stamping
                      means that if the sizes of the local and remote file
@@ -2459,7 +2473,8 @@ Last-modified header invalid -- time-stamp ignored.\n"));
                      download procedure is resumed.  */
                   if (hstat.orig_file_tstamp >= tmr)
                     {
-                      if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen)
+                      if (hstat.contlen == -1
+                          || hstat.orig_file_size == hstat.contlen)
                         {
                           logprintf (LOG_VERBOSE, _("\
 Server file no newer than local file `%s' -- not retrieving.\n\n"),
@@ -2492,6 +2507,33 @@ The sizes do not match (local %s) -- retrieving.\n"),
               got_name = true;
               restart_loop = true;
             }
+
+          if (opt.spider)
+            {
+              if (opt.recursive)
+                {
+                  if (*dt & TEXTHTML)
+                    {
+                      logputs (LOG_VERBOSE, _("\
+Remote file exists and could contain links to other resources -- retrieving.\n\n"));
+                      restart_loop = true;
+                    }
+                  else
+                    {
+                      logprintf (LOG_VERBOSE, _("\
+Remote file exists but does not contain any link -- not retrieving.\n\n"));
+                      ret = RETRUNNEEDED;
+                      goto exit;
+                    }
+                }
+              else
+                {
+                  logprintf (LOG_VERBOSE, _("\
+Remote file exists but recursion is disabled -- not retrieving.\n\n"));
+                  ret = RETRUNNEEDED;
+                  goto exit;
+                }
+            }
 
           got_head = true;    /* no more time-stamping */
           *dt &= ~HEAD_ONLY;
@@ -2502,7 +2544,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
             }
 
       if ((tmr != (time_t) (-1))
-          && (!opt.spider || opt.recursive)
           && ((hstat.len == hstat.contlen)
               || ((hstat.res == 0) && (hstat.contlen == -1))))
         {
@@ -2521,14 +2562,6 @@ The sizes do not match (local %s) -- retrieving.\n"),
         }
       /* End of time-stamping section.  */
 
-      if (opt.spider && !opt.recursive)
-        {
-          logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
-                     escnonprint (hstat.error));
-          ret = RETROK;
-          goto exit;
-        }
-
       tmrate = retr_rate (hstat.rd_size, hstat.dltime);
       total_download_time += hstat.dltime;
 
diff --git a/src/recur.c b/src/recur.c
index b746332b..33e32bec 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -274,6 +274,11 @@ retrieve_tree (const char *start_url)
             }
         }
 
+      if (opt.spider)
+        {
+          visited_url (url, referer);
+        }
+
       if (descend
           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
         {
@@ -365,6 +370,7 @@
                          file);
               if (unlink (file))
                 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+              logputs (LOG_VERBOSE, "\n");
               register_delete_file (file);
             }
 
@@ -420,6 +426,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 
   if (string_set_contains (blacklist, url))
     {
+      if (opt.spider)
+        {
+          char *referrer = url_string (parent, true);
+          DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+          visited_url (url, referrer);
+          xfree (referrer);
+        }
       DEBUGP (("Already on the black list.\n"));
       goto out;
     }
diff --git a/src/res.c b/src/res.c
index 103bc4e7..a160591f 100644
--- a/src/res.c
+++ b/src/res.c
@@ -1,5 +1,5 @@
 /* Support for Robot Exclusion Standard (RES).
-   Copyright (C) 2001 Free Software Foundation, Inc.
+   Copyright (C) 2001,2006 Free Software Foundation, Inc.
 
 This file is part of Wget.
 
@@ -539,10 +539,16 @@
 {
   uerr_t err;
   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+  int saved_ts_val = opt.timestamping;
+  int saved_sp_val = opt.spider;
 
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
   *file = NULL;
+  opt.timestamping = false;
+  opt.spider = false;
   err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+  opt.timestamping = saved_ts_val;
+  opt.spider = saved_sp_val;
   xfree (robots_url);
 
   if (err != RETROK && *file != NULL)
diff --git a/src/spider.c b/src/spider.c
new file mode 100644
index 00000000..d8cf8361
--- /dev/null
+++ b/src/spider.c
@@ -0,0 +1,175 @@
+/* Keep track of visited URLs in spider mode.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "spider.h"
+#include "url.h"
+#include "utils.h"
+#include "hash.h"
+#include "res.h"
+
+
+static struct hash_table *visited_urls_hash;
+static struct hash_table *nonexisting_urls_set;
+
+/* Cleanup the data structures associated with this file.  */
+
+void
+spider_cleanup (void)
+{
+  if (visited_urls_hash)
+    {
+      free_keys_and_values (visited_urls_hash);
+      hash_table_destroy (visited_urls_hash);
+      visited_urls_hash = NULL;
+    }
+  if (nonexisting_urls_set)
+    string_set_free (nonexisting_urls_set);
+}
+
+/* Remembers visited files.  */
+
+struct url_list
+{
+  char *url;
+  struct url_list *next;
+};
+
+static bool
+in_url_list_p (const struct url_list *list, const char *url, bool verbose)
+{
+  const struct url_list *ptr;
+
+  for (ptr = list; ptr; ptr = ptr->next)
+    {
+      /* str[case]cmp is inadequate for URL comparison */
+      if (are_urls_equal (url, ptr->url))
+        return true;
+    }
+
+  return false;
+}
+
+void
+visited_url (const char *url, const char *referrer)
+{
+  struct url_list *list;
+
+  /* Ignore robots.txt URLs */
+  if (is_robots_txt_url (url))
+    return;
+
+  if (!visited_urls_hash)
+    visited_urls_hash = make_string_hash_table (0);
+
+  list = hash_table_get (visited_urls_hash, url);
+  if (!list)
+    {
+      list = (struct url_list *) xnew0 (struct url_list);
+      list->url = referrer ? xstrdup (referrer) : NULL;
+      hash_table_put (visited_urls_hash, xstrdup (url), list);
+    }
+  else if (referrer && !in_url_list_p (list, referrer, false))
+    {
+      /* Append referrer at the end of the list */
+      struct url_list *newnode;
+
+      while (list->next)
+        list = list->next;
+
+      newnode = (struct url_list *) xnew0 (struct url_list);
+      newnode->url = xstrdup (referrer);
+      list->next = newnode;
+    }
+}
+
+/* Remembers broken links.  */
+void
+nonexisting_url (const char *url)
+{
+  /* Ignore robots.txt URLs */
+  if (is_robots_txt_url (url))
+    return;
+  if (!nonexisting_urls_set)
+    nonexisting_urls_set = make_string_hash_table (0);
+  string_set_add (nonexisting_urls_set, url);
+}
+
+void
+print_broken_links (void)
+{
+  hash_table_iterator iter;
+  int num_elems;
+
+  if (!nonexisting_urls_set)
+    {
+      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
+      return;
+    }
+
+  num_elems = hash_table_count (nonexisting_urls_set);
+  assert (num_elems > 0);
+
+  if (num_elems > 1)
+    {
+      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
+                 num_elems);
+    }
+  else
+    {
+      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
+    }
+
+  for (hash_table_iterate (nonexisting_urls_set, &iter);
+       hash_table_iter_next (&iter); )
+    {
+      struct url_list *list;
+      const char *url = (const char *) iter.key;
+
+      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), url);
+
+      for (list = (struct url_list *) hash_table_get (visited_urls_hash, url);
+           list; list = list->next)
+        {
+          logprintf (LOG_NOTQUIET, _("  %s\n"), list->url);
+        }
+    }
+  logputs (LOG_NOTQUIET, "\n");
+}
+
+/*
+ * vim: et ts=2 sw=2
+ */
+
diff --git a/src/spider.h b/src/spider.h
new file mode 100644
index 00000000..9cf71e80
--- /dev/null
+++ b/src/spider.h
@@ -0,0 +1,37 @@
+/* Declarations for spider.c
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version. */
+
+#ifndef SPIDER_H
+#define SPIDER_H
+
+void visited_url (const char *, const char *);
+void nonexisting_url (const char *);
+void print_broken_links (void);
+
+#endif /* SPIDER_H */
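
The interface declared in spider.h is deliberately small: recur.c records every URL it visits (together with its referrer) through visited_url (), http.c reports dead URLs through nonexisting_url (), and print_broken_links () dumps each broken URL with the pages that referred to it once the crawl is over. The sketch below shows that calling pattern in isolation; the main () harness and the example.com URLs are illustrative only and are not part of the patch, which wires these calls into retrieve_tree (), download_child_p () and http_loop (), and the code would still need to be linked against the rest of Wget's objects (hash, url, utils, log) to run.

/* Illustrative harness only -- in Wget itself these calls are made from
   recur.c and http.c, and print_broken_links () runs once at the end.  */
#include "spider.h"

int
main (void)
{
  /* Record crawled URLs and who linked to them (normally done while
     walking the recursion queue).  */
  visited_url ("http://example.com/", NULL);
  visited_url ("http://example.com/a.html", "http://example.com/");
  visited_url ("http://example.com/missing.html", "http://example.com/a.html");

  /* The HEAD check for missing.html failed, so remember it as broken
     (normally done in http_loop () when the server reports an error).  */
  nonexisting_url ("http://example.com/missing.html");

  /* Prints "Found 1 broken link." followed by the referrer list that
     visited_url () accumulated for that URL.  */
  print_broken_links ();
  return 0;
}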