mirror of https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00

[svn] Fixed recursive spider mode.

commit 01093c0c33, parent 739cc5ec16
src/ChangeLog

@@ -1,3 +1,27 @@
+2006-05-25  Mauro Tortonesi  <mauro@ferrara.linux.it>
+
+	* convert.c: Added mechanisms to keep track of broken links.
+
+	* convert.h: Ditto.
+
+	* wget.h: Reordered and enumerated uerr_t constants.
+
+	* recur.c: Fixes to support recursive spider mode.
+
+	* http.c: Ditto.
+
+	* main.c: Print broken links in case of recursive spider mode.
+
+	* retr.c: Changed interface of retrieve_url.
+
+	* retr.h: Ditto.
+
+	* ftp.c: Changed interface of ftp_loop.
+
+	* ftp.h: Ditto.
+
+	* res.c: Minor change to reflect changes in interface of retrieve_url.
+
 2006-05-18  Lawrence Jones  <lawrence.jones@ugs.com>
 
 	* ftp-ls.c (ftp_parse_unix_ls): Correct size parsing, add size
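Read together, the entries describe one change threaded through the tree: retrieve_url() and ftp_loop() grow explicit recursion (and, for FTP, globbing) parameters so callers no longer toggle the opt globals around each call; http.c switches from HEAD to full GET requests while spidering recursively and records dead links in a new convert.c registry; and main.c prints the collected broken links once the crawl finishes.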
src/convert.c (116 changed lines)

@@ -53,6 +53,8 @@ struct hash_table *dl_url_file_map;
    conversion after Wget is done.  */
 struct hash_table *downloaded_html_set;
 
+static struct hash_table *nonexisting_urls_hash;
+
 static void convert_links (const char *, struct urlpos *);
 
 /* This function is called when the retrieval is done to convert the
@@ -832,6 +834,7 @@ register_html (const char *url, const char *file)
 }
 
 static void downloaded_files_free (void);
+static void nonexisting_urls_free (void);
 
 /* Cleanup the data structures associated with this file.  */
 
@@ -853,6 +856,7 @@ convert_cleanup (void)
   if (downloaded_html_set)
     string_set_free (downloaded_html_set);
   downloaded_files_free ();
+  nonexisting_urls_free ();
   if (converted_files)
     string_set_free (converted_files);
 }
@@ -952,6 +956,118 @@ downloaded_files_free (void)
       downloaded_files_hash = NULL;
     }
 }
+
+/* Remembers broken links.  */
+
+struct broken_urls_list
+{
+  char *url;
+  struct broken_urls_list *next;
+};
+
+static bool
+in_list (const struct broken_urls_list *list, const char *url)
+{
+  const struct broken_urls_list *ptr;
+
+  for (ptr = list; ptr; ptr = ptr->next)
+    {
+      /* TODO: strcasecmp may not be appropriate to compare URLs */
+      if (strcasecmp (url, ptr->url) == 0) return true;
+    }
+
+  return false;
+}
+
+void
+nonexisting_url (const char *url, const char *referrer)
+{
+  struct broken_urls_list *list;
+
+  if (!nonexisting_urls_hash)
+    nonexisting_urls_hash = make_string_hash_table (0);
+
+  list = hash_table_get (nonexisting_urls_hash, url);
+  if (!list)
+    {
+      list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
+      list->url = referrer ? xstrdup (referrer) : NULL;
+      hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
+    }
+  else if (list && !in_list (list, referrer))
+    {
+      /* Append referrer at the end of the list */
+      struct broken_urls_list *newnode;
+
+      while (list->next) list = list->next;
+
+      newnode = xnew0 (struct broken_urls_list);
+      newnode->url = xstrdup (referrer);
+      list->next = newnode;
+    }
+}
+
+static void
+nonexisting_urls_free (void)
+{
+  if (nonexisting_urls_hash)
+    {
+      hash_table_iterator iter;
+      for (hash_table_iterate (nonexisting_urls_hash, &iter);
+           hash_table_iter_next (&iter);
+           )
+        {
+          xfree (iter.key);
+          xfree (iter.value);
+        }
+      hash_table_destroy (nonexisting_urls_hash);
+      nonexisting_urls_hash = NULL;
+    }
+}
+
+void
+print_broken_links (void)
+{
+  hash_table_iterator iter;
+  int num_elems;
+
+  if (!nonexisting_urls_hash)
+    {
+      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
+      return;
+    }
+
+  num_elems = hash_table_count (nonexisting_urls_hash);
+  assert (num_elems > 0);
+
+  if (num_elems > 1)
+    {
+      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
+                 num_elems);
+    }
+  else
+    {
+      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
+    }
+
+  for (hash_table_iterate (nonexisting_urls_hash, &iter);
+       hash_table_iter_next (&iter);
+       )
+    {
+      struct broken_urls_list *list;
+
+      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *) iter.key);
+
+      for (list = (struct broken_urls_list *) iter.value;
+           list;
+           list = list->next)
+        {
+          logprintf (LOG_NOTQUIET, _("  %s\n"), list->url);
+        }
+    }
+
+  logputs (LOG_NOTQUIET, "\n");
+}
+
 
 /* The function returns the pointer to the malloc-ed quoted version of
    string s.  It will recognize and quote numeric and special graphic
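The registry above keys a hash table by the broken URL and chains referrers in a singly linked list. Below is a minimal, standalone sketch of the same bookkeeping pattern, as an illustration only: wget's hash_table, xstrdup/xnew0 and logprintf are replaced by a plain list and stdio so it compiles in isolation, and unlike the committed code it keeps the first referrer in its own node rather than in the list head.

/* Compile with: cc -std=gnu99 sketch.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct referrer { char *url; struct referrer *next; };
struct broken   { char *url; struct referrer *refs; struct broken *next; };

static struct broken *registry;  /* stands in for nonexisting_urls_hash */

static void
remember_broken (const char *url, const char *referrer)
{
  struct broken *b;
  for (b = registry; b; b = b->next)
    if (strcmp (b->url, url) == 0)
      break;
  if (!b)
    {
      b = calloc (1, sizeof *b);
      b->url = strdup (url);
      b->next = registry;
      registry = b;
    }
  /* Append the referrer unless it is already recorded.  */
  struct referrer **r;
  for (r = &b->refs; *r; r = &(*r)->next)
    if (strcmp ((*r)->url, referrer) == 0)
      return;
  *r = calloc (1, sizeof **r);
  (*r)->url = strdup (referrer);
}

int
main (void)
{
  remember_broken ("http://example.com/missing.html", "http://example.com/");
  remember_broken ("http://example.com/missing.html", "http://example.com/a.html");
  for (struct broken *b = registry; b; b = b->next)
    {
      printf ("%s referred by:\n", b->url);
      for (struct referrer *r = b->refs; r; r = r->next)
        printf ("  %s\n", r->url);
    }
  return 0;
}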
src/convert.h

@@ -104,4 +104,7 @@ void convert_cleanup (void);
 
 char *html_quote_string (const char *);
 
+void nonexisting_url (const char *, const char *);
+void print_broken_links (void);
+
 #endif /* CONVERT_H */
src/ftp.c

@@ -1773,7 +1773,7 @@ ftp_retrieve_glob (struct url *u, ccon *con, int action)
    of URL.  Inherently, its capabilities are limited on what can be
    encoded into a URL.  */
 uerr_t
-ftp_loop (struct url *u, int *dt, struct url *proxy)
+ftp_loop (struct url *u, int *dt, struct url *proxy, bool recursive, bool glob)
 {
   ccon con;			/* FTP connection */
   uerr_t res;
@@ -1791,7 +1791,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
   /* If the file name is empty, the user probably wants a directory
      index.  We'll provide one, properly HTML-ized.  Unless
      opt.htmlify is 0, of course.  :-) */
-  if (!*u->file && !opt.recursive)
+  if (!*u->file && !recursive)
     {
       struct fileinfo *f;
       res = ftp_get_listing (u, &con, &f);
@@ -1832,7 +1832,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
   else
     {
       bool ispattern = false;
-      if (opt.ftp_glob)
+      if (glob)
        {
          /* Treat the URL as a pattern if the file name part of the
             URL path contains wildcards.  (Don't check for u->file
@@ -1843,7 +1843,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
            file_part = u->path;
          ispattern = has_wildcards_p (file_part);
        }
-      if (ispattern || opt.recursive || opt.timestamping)
+      if (ispattern || recursive || opt.timestamping)
        {
          /* ftp_retrieve_glob is a catch-all function that gets called
             if we need globbing, time-stamping or recursion.  Its

src/ftp.h

@@ -119,7 +119,7 @@ enum wget_ftp_fstatus
 };
 
 struct fileinfo *ftp_parse_ls (const char *, const enum stype);
-uerr_t ftp_loop (struct url *, int *, struct url *);
+uerr_t ftp_loop (struct url *, int *, struct url *, bool, bool);
 
 uerr_t ftp_index (const char *, struct url *, struct fileinfo *);
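This signature change is the heart of the commit: recursion and globbing used to be smuggled through the global opt structure and restored afterwards, forcing every caller (see the recur.c and retr.c hunks below) into a save/clear/restore dance. A hypothetical before/after illustration of the pattern; demo_opt and the fetch functions are placeholders, not wget identifiers.

#include <stdbool.h>

static struct { bool recursive; } demo_opt = { true };

/* Before: the callee consults shared state, so callers must mutate
   and restore it around every call.  */
static void fetch_via_global (void)
{
  /* ... transfer code reads demo_opt.recursive ... */
}

static void caller_old (void)
{
  bool saved = demo_opt.recursive;
  demo_opt.recursive = false;   /* e.g. while following a redirect */
  fetch_via_global ();
  demo_opt.recursive = saved;   /* easy to forget on an error path */
}

/* After: the behaviour is an explicit argument; there is nothing to
   restore and unrelated code paths cannot trample each other.  */
static void fetch_with_param (bool recursive)
{
  (void) recursive;             /* ... transfer code uses the argument ... */
}

int main (void)
{
  caller_old ();
  fetch_with_param (false);
  return 0;
}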
src/http.c (18 changed lines)

@@ -2309,7 +2309,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
       /* Default document type is empty.  However, if spider mode is
          on or time-stamping is employed, HEAD_ONLY commands is
          encoded within *dt.  */
-      if (opt.spider || (opt.timestamping && !got_head))
+      if ((opt.spider && !opt.recursive) || (opt.timestamping && !got_head))
        *dt |= HEAD_ONLY;
       else
        *dt &= ~HEAD_ONLY;
@@ -2400,20 +2400,26 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
           /* All possibilities should have been exhausted.  */
           abort ();
         }
 
       if (!(*dt & RETROKF))
         {
+          char *hurl = NULL;
           if (!opt.verbose)
             {
               /* #### Ugly ugly ugly! */
-              char *hurl = url_string (u, true);
+              hurl = url_string (u, true);
               logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
-              xfree (hurl);
             }
+          if (opt.spider && opt.recursive)
+            {
+              if (!hurl) hurl = url_string (u, true);
+              nonexisting_url (hurl, referer);
+            }
           logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
                      tms, hstat.statcode, escnonprint (hstat.error));
           logputs (LOG_VERBOSE, "\n");
           ret = WRONGCODE;
+          xfree_null (hurl);
           goto exit;
         }
 
@@ -2479,7 +2485,7 @@ The sizes do not match (local %s) -- retrieving.\n"),
         }
 
       if ((tmr != (time_t) (-1))
-          && !opt.spider
+          && (!opt.spider || opt.recursive)
          && ((hstat.len == hstat.contlen) ||
              ((hstat.res == 0) && (hstat.contlen == -1))))
         {
@@ -2498,7 +2504,7 @@ The sizes do not match (local %s) -- retrieving.\n"),
         }
       /* End of time-stamping section.  */
 
-      if (opt.spider)
+      if (opt.spider && !opt.recursive)
         {
           logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
                      escnonprint (hstat.error));
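The pattern in all four hunks is the same: a plain --spider run only needs status lines, so HEAD suffices and nothing is stored, but a recursive spider must actually GET the HTML bodies to harvest further links, so each opt.spider test gains an opt.recursive qualifier. A compilable sketch of the request decision follows; the HEAD_ONLY bit value here is illustrative, not wget's actual *dt encoding.

#include <stdbool.h>
#include <stdio.h>

#define HEAD_ONLY 0x0008   /* illustrative bit; not wget's real value */

/* Mirrors the condition in http_loop() above.  */
static int
request_flags (int dt, bool spider, bool recursive,
               bool timestamping, bool got_head)
{
  if ((spider && !recursive) || (timestamping && !got_head))
    dt |= HEAD_ONLY;    /* the status line alone is enough */
  else
    dt &= ~HEAD_ONLY;   /* the body is needed, e.g. to harvest links */
  return dt;
}

int
main (void)
{
  printf ("plain spider sends HEAD: %d\n",
          !!(request_flags (0, true, false, false, false) & HEAD_ONLY));
  printf ("recursive spider sends HEAD: %d\n",
          !!(request_flags (0, true, true, false, false) & HEAD_ONLY));
  return 0;
}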
src/main.c

@@ -948,7 +948,7 @@ Can't timestamp and not clobber old files at the same time.\n"));
           && url_scheme (*t) != SCHEME_FTP)
         status = retrieve_tree (*t);
       else
-        status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt);
+        status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
 
       if (opt.delete_after && file_exists_p(filename))
         {
@@ -971,6 +971,13 @@ Can't timestamp and not clobber old files at the same time.\n"));
       logprintf (LOG_NOTQUIET, _("No URLs found in %s.\n"),
                  opt.input_filename);
     }
+
+  /* Print broken links.  */
+  if (opt.recursive && opt.spider)
+    {
+      print_broken_links ();
+    }
+
   /* Print the downloaded sum.  */
   if ((opt.recursive || opt.page_requisites
        || nurl > 1
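With the pieces above in place, a crawl such as "wget --recursive --spider http://example.com/" follows links without keeping files and, right before the usual download summary, emits the "Found N broken links" report implemented in convert.c.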
src/recur.c (21 changed lines)

@@ -246,11 +246,8 @@ retrieve_tree (const char *start_url)
         {
           int dt = 0;
           char *redirected = NULL;
-          bool oldrec = opt.recursive;
 
-          opt.recursive = false;
-          status = retrieve_url (url, &file, &redirected, referer, &dt);
-          opt.recursive = oldrec;
+          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
 
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@ -348,17 +345,21 @@ retrieve_tree (const char *start_url)
             }
         }
 
-      if (opt.delete_after || (file && !acceptable (file)))
+      if (file
+          && (opt.delete_after
+              || opt.spider /* opt.recursive is implicitely true */
+              || !acceptable (file)))
         {
           /* Either --delete-after was specified, or we loaded this
-             otherwise rejected (e.g. by -R) HTML file just so we
-             could harvest its hyperlinks -- in either case, delete
-             the local file. */
+             (otherwise unneeded because of --spider or rejected by -R)
+             HTML file just to harvest its hyperlinks -- in either case,
+             delete the local file. */
           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                    opt.delete_after ? "--delete-after" :
-                   "recursive rejection criteria"));
+                   (opt.spider ? "--spider" :
+                    "recursive rejection criteria")));
           logprintf (LOG_VERBOSE,
-                     (opt.delete_after
+                     (opt.delete_after || opt.spider
                       ? _("Removing %s.\n")
                       : _("Removing %s since it should be rejected.\n")),
                      file);
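The second recur.c hunk is what keeps --spider from littering the disk during recursion: HTML files still have to be fetched so their links can be followed, so they are now deleted immediately after harvesting, the same way --delete-after downloads and -R rejects already were.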
src/res.c

@@ -538,7 +538,7 @@ res_retrieve_file (const char *url, char **file)
 
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
   *file = NULL;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL);
+  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
   xfree (robots_url);
 
   if (err != RETROK && *file != NULL)
src/retr.c (13 changed lines)

@@ -602,7 +602,7 @@ static char *getproxy (struct url *);
 
 uerr_t
 retrieve_url (const char *origurl, char **file, char **newloc,
-              const char *refurl, int *dt)
+              const char *refurl, int *dt, bool recursive)
 {
   uerr_t result;
   char *url;
@@ -684,13 +684,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       /* If this is a redirection, temporarily turn off opt.ftp_glob
          and opt.recursive, both being undesirable when following
          redirects.  */
-      bool oldrec = opt.recursive, oldglob = opt.ftp_glob;
+      bool oldrec = recursive, glob = opt.ftp_glob;
       if (redirection_count)
-        opt.recursive = opt.ftp_glob = false;
+        oldrec = glob = false;
 
-      result = ftp_loop (u, dt, proxy_url);
-      opt.recursive = oldrec;
-      opt.ftp_glob = oldglob;
+      result = ftp_loop (u, dt, proxy_url, recursive, glob);
+      recursive = oldrec;
 
       /* There is a possibility of having HTTP being redirected to
          FTP.  In these cases we must decide whether the text is HTML
@@ -848,7 +847,7 @@ retrieve_from_file (const char *file, bool html, int *count)
           && cur_url->url->scheme != SCHEME_FTP)
         status = retrieve_tree (cur_url->url->url);
       else
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
+        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
src/retr.h

@@ -50,7 +50,7 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);
 
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
 uerr_t retrieve_from_file (const char *, bool, int *);
 
 const char *retr_rate (wgint, double);
src/wget.h (25 changed lines)

@@ -283,18 +283,23 @@ enum
    simplified.  */
 typedef enum
 {
+  /* 0  */
   NOCONERROR, HOSTERR, CONSOCKERR, CONERROR, CONSSLERR,
-  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR,
-  CONCLOSED, FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
-  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR,
-  FTPREXC, FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR,
-  FOPENERR, FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR, CONCLOSED,
+  /* 10 */
+  FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
+  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR, FTPREXC,
+  /* 20 */
+  FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR, FOPENERR,
+  FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+  /* 30 */
   HERR, RETROK, RECLEVELEXC, FTPACCDENIED, WRONGCODE,
-  FTPINVPASV, FTPNOPASV,
-  CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED, READERR, TRYLIMEXC,
-  URLBADPATTERN, FILEBADFILE, RANGEERR, RETRBADPATTERN,
-  RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED,
-  QUOTEXC, WRITEFAILED, SSLINITFAILED
+  FTPINVPASV, FTPNOPASV, CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED,
+  /* 40 */
+  READERR, TRYLIMEXC, URLBADPATTERN, FILEBADFILE, RANGEERR,
+  RETRBADPATTERN, RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR,
+  /* 50 */
+  AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED
 } uerr_t;
 
 #endif /* WGET_H */
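The reshuffle groups the constants ten per row, so the interleaved comments state the numeric value of each row's first constant, making a raw uerr_t seen in a log or debugger readable at a glance. A standalone check, with the enum body copied verbatim from the hunk above:

#include <assert.h>

typedef enum
{
  /* 0  */
  NOCONERROR, HOSTERR, CONSOCKERR, CONERROR, CONSSLERR,
  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR, CONCLOSED,
  /* 10 */
  FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR, FTPREXC,
  /* 20 */
  FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR, FOPENERR,
  FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
  /* 30 */
  HERR, RETROK, RECLEVELEXC, FTPACCDENIED, WRONGCODE,
  FTPINVPASV, FTPNOPASV, CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED,
  /* 40 */
  READERR, TRYLIMEXC, URLBADPATTERN, FILEBADFILE, RANGEERR,
  RETRBADPATTERN, RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR,
  /* 50 */
  AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED
} uerr_t;

int
main (void)
{
  /* Each decade comment names the value of the constant right after it.  */
  assert (FTPOK == 10 && FTPSRVERR == 20 && HERR == 30
          && READERR == 40 && AUTHFAILED == 50);
  return 0;
}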