From 01093c0c33687dab3a435ff4a7f5ff3e14678ae2 Mon Sep 17 00:00:00 2001
From: mtortonesi
Date: Thu, 25 May 2006 09:11:29 -0700
Subject: [PATCH] [svn] Fixed recursive spider mode.

---
 src/ChangeLog |  24 +++++++++++
 src/convert.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/convert.h |   3 ++
 src/ftp.c     |   8 ++--
 src/ftp.h     |   2 +-
 src/http.c    |  18 +++++---
 src/main.c    |   9 +++-
 src/recur.c   |  21 ++++-----
 src/res.c     |   2 +-
 src/retr.c    |  13 +++---
 src/retr.h    |   2 +-
 src/wget.h    |  25 ++++++-----
 12 files changed, 202 insertions(+), 41 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index b8e207a7..9b9ffcf4 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,27 @@
+2006-05-25  Mauro Tortonesi
+
+        * convert.c: Added mechanisms to keep track of broken links.
+
+        * convert.h: Ditto.
+
+        * wget.h: Reordered and enumerated uerr_t constants.
+
+        * recur.c: Fixes to support recursive spider mode.
+
+        * http.c: Ditto.
+
+        * main.c: Print broken links in case of recursive spider mode.
+
+        * retr.c: Changed interface of retrieve_url.
+
+        * retr.h: Ditto.
+
+        * ftp.c: Changed interface of ftp_loop.
+
+        * ftp.h: Ditto.
+
+        * res.c: Minor change to reflect changes in interface of retrieve_url.
+
 2006-05-18  Lawrence Jones
 
         * ftp-ls.c (ftp_parse_unix_ls): Correct size parsing, add size
diff --git a/src/convert.c b/src/convert.c
index a460a4b4..98133733 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -53,6 +53,8 @@ struct hash_table *dl_url_file_map;
    conversion after Wget is done. */
 struct hash_table *downloaded_html_set;
 
+static struct hash_table *nonexisting_urls_hash;
+
 static void convert_links (const char *, struct urlpos *);
 
 /* This function is called when the retrieval is done to convert the
@@ -832,6 +834,7 @@ register_html (const char *url, const char *file)
 }
 
 static void downloaded_files_free (void);
+static void nonexisting_urls_free (void);
 
 /* Cleanup the data structures associated with this file. */
 
@@ -853,6 +856,7 @@ convert_cleanup (void)
   if (downloaded_html_set)
     string_set_free (downloaded_html_set);
   downloaded_files_free ();
+  nonexisting_urls_free ();
   if (converted_files)
     string_set_free (converted_files);
 }
@@ -952,6 +956,118 @@ downloaded_files_free (void)
       downloaded_files_hash = NULL;
     }
 }
+
+/* Remembers broken links. */
+
+struct broken_urls_list
+{
+  char *url;
+  struct broken_urls_list *next;
+};
+
+static bool
+in_list (const struct broken_urls_list *list, const char *url)
+{
+  const struct broken_urls_list *ptr;
+
+  for (ptr = list; ptr; ptr = ptr->next)
+    {
+      /* TODO: strcasecmp may not be appropriate to compare URLs */
+      if (strcasecmp (url, ptr->url) == 0) return true;
+    }
+
+  return false;
+}
+
+void
+nonexisting_url (const char *url, const char *referrer)
+{
+  struct broken_urls_list *list;
+
+  if (!nonexisting_urls_hash)
+    nonexisting_urls_hash = make_string_hash_table (0);
+
+  list = hash_table_get (nonexisting_urls_hash, url);
+  if (!list)
+    {
+      list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
+      list->url = referrer ? xstrdup (referrer) : NULL;
+      hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
+    }
+  else if (list && !in_list (list, referrer))
+    {
+      /* Append referrer at the end of the list */
+      struct broken_urls_list *newnode;
+
+      while (list->next) list = list->next;
+
+      newnode = xnew0 (struct broken_urls_list);
+      newnode->url = xstrdup (referrer);
+      list->next = newnode;
+    }
+}
+
+static void
+nonexisting_urls_free (void)
+{
+  if (nonexisting_urls_hash)
+    {
+      hash_table_iterator iter;
+      for (hash_table_iterate (nonexisting_urls_hash, &iter);
+           hash_table_iter_next (&iter);
+           )
+        {
+          xfree (iter.key);
+          xfree (iter.value);
+        }
+      hash_table_destroy (nonexisting_urls_hash);
+      nonexisting_urls_hash = NULL;
+    }
+}
+
+void
+print_broken_links (void)
+{
+  hash_table_iterator iter;
+  int num_elems;
+
+  if (!nonexisting_urls_hash)
+    {
+      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
+      return;
+    }
+
+  num_elems = hash_table_count (nonexisting_urls_hash);
+  assert (num_elems > 0);
+
+  if (num_elems > 1)
+    {
+      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
+                 num_elems);
+    }
+  else
+    {
+      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
+    }
+
+  for (hash_table_iterate (nonexisting_urls_hash, &iter);
+       hash_table_iter_next (&iter);
+       )
+    {
+      struct broken_urls_list *list;
+
+      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
+
+      for (list = (struct broken_urls_list *) iter.value;
+           list;
+           list = list->next)
+        {
+          logprintf (LOG_NOTQUIET, _("  %s\n"), list->url);
+        }
+    }
+  logputs (LOG_NOTQUIET, "\n");
+}
+
 
 /* The function returns the pointer to the malloc-ed quoted version of
    string s. It will recognize and quote numeric and special graphic
diff --git a/src/convert.h b/src/convert.h
index fea808cf..d2367885 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -104,4 +104,7 @@ void convert_cleanup (void);
 
 char *html_quote_string (const char *);
 
+void nonexisting_url (const char *, const char *);
+void print_broken_links (void);
+
 #endif /* CONVERT_H */
diff --git a/src/ftp.c b/src/ftp.c
index b79b5d14..0ecb4188 100644
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -1773,7 +1773,7 @@ ftp_retrieve_glob (struct url *u, ccon *con, int action)
    of URL. Inherently, its capabilities are limited on what can be
    encoded into a URL. */
 uerr_t
-ftp_loop (struct url *u, int *dt, struct url *proxy)
+ftp_loop (struct url *u, int *dt, struct url *proxy, bool recursive, bool glob)
 {
   ccon con;                     /* FTP connection */
   uerr_t res;
@@ -1791,7 +1791,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
   /* If the file name is empty, the user probably wants a directory
      index. We'll provide one, properly HTML-ized. Unless opt.htmlify
      is 0, of course. :-) */
-  if (!*u->file && !opt.recursive)
+  if (!*u->file && !recursive)
     {
       struct fileinfo *f;
       res = ftp_get_listing (u, &con, &f);
@@ -1832,7 +1832,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
   else
     {
       bool ispattern = false;
-      if (opt.ftp_glob)
+      if (glob)
        {
          /* Treat the URL as a pattern if the file name part of the
             URL path contains wildcards. (Don't check for u->file
@@ -1843,7 +1843,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
            file_part = u->path;
          ispattern = has_wildcards_p (file_part);
        }
-      if (ispattern || opt.recursive || opt.timestamping)
+      if (ispattern || recursive || opt.timestamping)
        {
          /* ftp_retrieve_glob is a catch-all function that gets called
             if we need globbing, time-stamping or recursion. Its
diff --git a/src/ftp.h b/src/ftp.h
index eed6bf76..9110d818 100644
--- a/src/ftp.h
+++ b/src/ftp.h
@@ -119,7 +119,7 @@ enum wget_ftp_fstatus
 };
 
 struct fileinfo *ftp_parse_ls (const char *, const enum stype);
-uerr_t ftp_loop (struct url *, int *, struct url *);
+uerr_t ftp_loop (struct url *, int *, struct url *, bool, bool);
 
 uerr_t ftp_index (const char *, struct url *, struct fileinfo *);
 
diff --git a/src/http.c b/src/http.c
index a0e04bfb..54091bf4 100644
--- a/src/http.c
+++ b/src/http.c
@@ -2309,7 +2309,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
       /* Default document type is empty. However, if spider mode is on
          or time-stamping is employed, HEAD_ONLY commands is encoded
          within *dt. */
-      if (opt.spider || (opt.timestamping && !got_head))
+      if ((opt.spider && !opt.recursive) || (opt.timestamping && !got_head))
        *dt |= HEAD_ONLY;
       else
        *dt &= ~HEAD_ONLY;
@@ -2400,20 +2400,26 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
          /* All possibilities should have been exhausted. */
          abort ();
        }
-
+
       if (!(*dt & RETROKF))
        {
+         char *hurl = NULL;
          if (!opt.verbose)
            {
              /* #### Ugly ugly ugly! */
-             char *hurl = url_string (u, true);
+             hurl = url_string (u, true);
              logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
-             xfree (hurl);
+           }
+         if (opt.spider && opt.recursive)
+           {
+             if (!hurl) hurl = url_string (u, true);
+             nonexisting_url (hurl, referer);
            }
          logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
                     tms, hstat.statcode, escnonprint (hstat.error));
          logputs (LOG_VERBOSE, "\n");
          ret = WRONGCODE;
+         xfree_null (hurl);
          goto exit;
        }
 
@@ -2479,7 +2485,7 @@ The sizes do not match (local %s) -- retrieving.\n"),
        }
 
      if ((tmr != (time_t) (-1))
-         && !opt.spider
+         && (!opt.spider || opt.recursive)
          && ((hstat.len == hstat.contlen)
              || ((hstat.res == 0) && (hstat.contlen == -1))))
        {
@@ -2498,7 +2504,7 @@ The sizes do not match (local %s) -- retrieving.\n"),
        }
 
      /* End of time-stamping section. */
-     if (opt.spider)
+     if (opt.spider && !opt.recursive)
        {
          logprintf (LOG_NOTQUIET, "%d %s\n\n",
                     hstat.statcode, escnonprint (hstat.error));
diff --git a/src/main.c b/src/main.c
index 1f3fb25a..9bc979aa 100644
--- a/src/main.c
+++ b/src/main.c
@@ -948,7 +948,7 @@ Can't timestamp and not clobber old files at the same time.\n"));
          && url_scheme (*t) != SCHEME_FTP)
        status = retrieve_tree (*t);
       else
-       status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt);
+       status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
 
       if (opt.delete_after && file_exists_p(filename))
        {
@@ -971,6 +971,13 @@ Can't timestamp and not clobber old files at the same time.\n"));
        logprintf (LOG_NOTQUIET, _("No URLs found in %s.\n"),
                   opt.input_filename);
     }
+
+  /* Print broken links. */
+  if (opt.recursive && opt.spider)
+    {
+      print_broken_links();
+    }
+
   /* Print the downloaded sum. */
   if ((opt.recursive || opt.page_requisites
        || nurl > 1
diff --git a/src/recur.c b/src/recur.c
index 1e277ca3..611e3606 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -246,11 +246,8 @@ retrieve_tree (const char *start_url)
        {
          int dt = 0;
          char *redirected = NULL;
-         bool oldrec = opt.recursive;
 
-         opt.recursive = false;
-         status = retrieve_url (url, &file, &redirected, referer, &dt);
-         opt.recursive = oldrec;
+         status = retrieve_url (url, &file, &redirected, referer, &dt, false);
 
          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
@@ -348,17 +345,21 @@ retrieve_tree (const char *start_url)
            }
        }
 
-      if (opt.delete_after || (file && !acceptable (file)))
+      if (file
+          && (opt.delete_after
+              || opt.spider /* opt.recursive is implicitly true */
+              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
-            otherwise rejected (e.g. by -R) HTML file just so we
-            could harvest its hyperlinks -- in either case, delete
-            the local file. */
+            (otherwise unneeded because of --spider or rejected by -R)
+            HTML file just to harvest its hyperlinks -- in either case,
+            delete the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
-                  "recursive rejection criteria"));
+                  (opt.spider ? "--spider" :
+                  "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
-                    (opt.delete_after
+                    (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
diff --git a/src/res.c b/src/res.c
index 630d74b5..656f2895 100644
--- a/src/res.c
+++ b/src/res.c
@@ -538,7 +538,7 @@ res_retrieve_file (const char *url, char **file)
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
 
   *file = NULL;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL);
+  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
   xfree (robots_url);
 
   if (err != RETROK && *file != NULL)
diff --git a/src/retr.c b/src/retr.c
index 14f4ffab..18a7b323 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -602,7 +602,7 @@ static char *getproxy (struct url *);
 
 uerr_t
 retrieve_url (const char *origurl, char **file, char **newloc,
-             const char *refurl, int *dt)
+             const char *refurl, int *dt, bool recursive)
 {
   uerr_t result;
   char *url;
@@ -684,13 +684,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       /* If this is a redirection, temporarily turn off opt.ftp_glob
         and opt.recursive, both being undesirable when following
         redirects. */
-      bool oldrec = opt.recursive, oldglob = opt.ftp_glob;
+      bool oldrec = recursive, glob = opt.ftp_glob;
       if (redirection_count)
-        opt.recursive = opt.ftp_glob = false;
+        oldrec = glob = false;
 
-      result = ftp_loop (u, dt, proxy_url);
-      opt.recursive = oldrec;
-      opt.ftp_glob = oldglob;
+      result = ftp_loop (u, dt, proxy_url, recursive, glob);
+      recursive = oldrec;
 
      /* There is a possibility of having HTTP being redirected to
        FTP. In these cases we must decide whether the text is HTML
@@ -848,7 +847,7 @@ retrieve_from_file (const char *file, bool html, int *count)
          && cur_url->url->scheme != SCHEME_FTP)
        status = retrieve_tree (cur_url->url->url);
       else
-       status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
+       status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
 
       if (filename && opt.delete_after && file_exists_p (filename))
        {
diff --git a/src/retr.h b/src/retr.h
index a6124587..3928cfd5 100644
--- a/src/retr.h
+++ b/src/retr.h
@@ -50,7 +50,7 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);
 
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
 uerr_t retrieve_from_file (const char *, bool, int *);
 
 const char *retr_rate (wgint, double);
diff --git a/src/wget.h b/src/wget.h
index bfcaf8dd..d0303e40 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -283,18 +283,23 @@ enum
    simplified. */
 typedef enum
 {
+  /* 0 */
   NOCONERROR, HOSTERR, CONSOCKERR, CONERROR, CONSSLERR,
-  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR,
-  CONCLOSED, FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
-  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR,
-  FTPREXC, FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR,
-  FOPENERR, FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR, CONCLOSED,
+  /* 10 */
+  FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
+  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR, FTPREXC,
+  /* 20 */
+  FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR, FOPENERR,
+  FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+  /* 30 */
   HERR, RETROK, RECLEVELEXC, FTPACCDENIED, WRONGCODE,
-  FTPINVPASV, FTPNOPASV,
-  CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED, READERR, TRYLIMEXC,
-  URLBADPATTERN, FILEBADFILE, RANGEERR, RETRBADPATTERN,
-  RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED,
-  QUOTEXC, WRITEFAILED, SSLINITFAILED
+  FTPINVPASV, FTPNOPASV, CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED,
+  /* 40 */
+  READERR, TRYLIMEXC, URLBADPATTERN, FILEBADFILE, RANGEERR,
+  RETRBADPATTERN, RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR,
+  /* 50 */
+  AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED
 } uerr_t;
 
 #endif /* WGET_H */
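
Note on the broken-link bookkeeping added in convert.c: with this patch, a recursive spider run (wget --spider -r URL) fetches pages so that their links can be followed, records every URL that fails together with the page or pages that referred to it via nonexisting_url(), deletes the harvested local files, and prints the collected list through print_broken_links() at the end of the run. The following is a minimal standalone sketch of that URL-to-referrer-list bookkeeping, not wget code: it uses a plain linked list where wget keys the data on its hash_table, and the names record_broken_url and report_broken_urls are illustrative only and do not exist in the wget sources.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One referring page. */
struct referrer
{
  char *url;
  struct referrer *next;
};

/* One broken URL together with the pages that linked to it. */
struct broken_url
{
  char *url;
  struct referrer *referrers;
  struct broken_url *next;
};

static struct broken_url *broken_urls;

/* Record that URL could not be retrieved and that REFERRER linked to it.
   Duplicate referrers for the same URL are ignored. */
static void
record_broken_url (const char *url, const char *referrer)
{
  struct broken_url *b;
  struct referrer *r;

  for (b = broken_urls; b; b = b->next)
    if (strcmp (b->url, url) == 0)
      break;

  if (!b)
    {
      b = calloc (1, sizeof *b);
      b->url = strdup (url);
      b->next = broken_urls;
      broken_urls = b;
    }

  if (!referrer)
    return;

  for (r = b->referrers; r; r = r->next)
    if (strcmp (r->url, referrer) == 0)
      return;

  r = calloc (1, sizeof *r);
  r->url = strdup (referrer);
  r->next = b->referrers;
  b->referrers = r;
}

/* Print every broken URL followed by its referrers, one block per URL,
   mirroring the report format used by print_broken_links() above. */
static void
report_broken_urls (void)
{
  const struct broken_url *b;
  const struct referrer *r;

  for (b = broken_urls; b; b = b->next)
    {
      printf ("%s referred by:\n", b->url);
      for (r = b->referrers; r; r = r->next)
        printf ("    %s\n", r->url);
    }
}

int
main (void)
{
  record_broken_url ("http://example.com/missing.html",
                     "http://example.com/index.html");
  record_broken_url ("http://example.com/missing.html",
                     "http://example.com/other.html");
  report_broken_urls ();
  return 0;
}

In the patch itself the same information is additionally counted (for the "Found %d broken links" summary) and released in convert_cleanup() via nonexisting_urls_free(); both details are visible in the convert.c hunk above.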