Add option to write URL rejections to a tab-delimited CSV log.
* main.c: Add "--rejected-log" option.
* init.c: Add "rejectedlog" command.
* options.h: Add "rejected_log" parameter string.
* wget.texi: Add brief documentation on the new --rejected-log option.
* recur.c: Optionally log details of URLs not traversed.
  Add reject_reason enum.
  (download_child_p -> download_child): Return a reject_reason.
  (descend_redirect_p -> descend_redirect): Return a reject_reason.
  (retrieve_tree): Support logging reasons for rejection.
  Add write_reject_log_header, which writes a CSV-format header to a file.
  Add write_reject_log_url, which writes a url struct to a file in CSV format.
  Add write_reject_log_reason, which writes the URL, its parent URL, and the
  rejection reason to a CSV file.
* Test--rejected-log.px: Add a basic test for the --rejected-log option.
* tests/Makefile.am: Run Test--rejected-log.px.

This allows you to figure out why URLs are being rejected, along with some
context around each rejection. CSV is used as the output format since it is
easily parsed; the log is delimited by tabs instead of commas so that all
(quoted) URL characters may be used, and it includes column names, which may
be used for compatibility with future format changes.
parent 670eb924e7
commit e4db00d74d
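For a rough sense of how the log parses, here is a minimal standalone C sketch
(not part of this commit) that splits a rejected-log line into the columns
named by write_reject_log_header. Because every URL field is passed through
url_escape before being written, a field can never contain a literal tab, so a
plain split on '\t' is safe; note that strtok would be the wrong tool here,
since it collapses the empty fields left by absent parts such as U_QUERY.

    #include <stdio.h>
    #include <string.h>

    /* Print each tab-delimited column of every line read from stdin,
       preserving empty fields. */
    int
    main (void)
    {
      char line[4096];
      while (fgets (line, sizeof line, stdin))
        {
          line[strcspn (line, "\n")] = '\0';  /* strip the newline */
          int col = 0;
          char *p = line;
          for (;;)
            {
              char *tab = strchr (p, '\t');
              if (tab)
                *tab = '\0';
              printf ("col %d: %s\n", col++, p);
              if (!tab)
                break;
              p = tab + 1;
            }
        }
      return 0;
    }

Fed the log.csv produced by the test added below, this prints REASON as
column 0, the rejected URL's fields as columns 1-8, and the parent URL's
fields as columns 9-16.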
doc/wget.texi
@@ -551,6 +551,11 @@ would be resolved to @samp{http://foo/baz/b.html}.
 @cindex specify config
 @item --config=@var{FILE}
 Specify the location of a startup file you wish to use.
+
+@item --rejected-log=@var{logfile}
+Logs all URL rejections to @var{logfile} as comma-separated values.  The values
+include the reason for rejection, the URL, and the parent URL it was found in.
+
 @end table
 
 @node Download Options, Directory Options, Logging and Input File Options, Invoking
src/init.c
@@ -274,6 +274,7 @@ static const struct {
   { "referer",          &opt.referer,           cmd_string },
   { "regextype",        &opt.regex_type,        cmd_spec_regex_type },
   { "reject",           &opt.rejects,           cmd_vector },
+  { "rejectedlog",      &opt.rejected_log,      cmd_file },
   { "rejectregex",      &opt.rejectregex_s,     cmd_string },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
@@ -1856,6 +1857,7 @@ cleanup (void)
   xfree (opt.post_data);
   xfree (opt.body_data);
   xfree (opt.body_file);
+  xfree (opt.rejected_log);
 
 #endif /* DEBUG_MALLOC */
 }
src/main.c
@@ -321,6 +321,7 @@ static struct cmdline_option option_data[] =
     { "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
     { "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
     { "local-encoding", 0, OPT_VALUE, "localencoding", -1 },
+    { "rejected-log", 0, OPT_VALUE, "rejectedlog", -1 },
     { "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
 #ifdef HAVE_METALINK
     { "metalink-over-http", 0, OPT_BOOLEAN, "metalink-over-http", -1 },
@@ -576,6 +577,8 @@ Logging and input file:\n"),
        --config=FILE               specify config file to use\n"),
   N_("\
        --no-config                 do not read any config file\n"),
+  N_("\
+       --rejected-log=FILE         log reasons for URL rejection to FILE\n"),
   "\n",
 
   N_("\
src/options.h
@@ -296,6 +296,8 @@ struct options
                                    name. */
   bool report_bps;              /*Output bandwidth in bits format*/
 
+  char *rejected_log;           /* The file to log rejected URLs to. */
+
 #ifdef HAVE_HSTS
   bool hsts;
   char *hsts_file;
src/recur.c
@@ -182,11 +182,19 @@ static int blacklist_contains (struct hash_table *blacklist, const char *url)
   return ret;
 }
 
-static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *, struct iri *);
-static bool descend_redirect_p (const char *, struct url *, int,
-                                struct url *, struct hash_table *, struct iri *);
+typedef enum
+{
+  SUCCESS, BLACKLIST, NOTHTTPS, NONHTTP, ABSOLUTE, DOMAIN, PARENT, LIST, REGEX,
+  RULES, SPANNEDHOST, ROBOTS
+} reject_reason;
+
+static reject_reason download_child (const struct urlpos *, struct url *, int,
+                                     struct url *, struct hash_table *, struct iri *);
+static reject_reason descend_redirect (const char *, struct url *, int,
+                                       struct url *, struct hash_table *, struct iri *);
+static void write_reject_log_header (FILE *);
+static void write_reject_log_reason (FILE *, reject_reason, struct url *,
+                                     struct url *);
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
    be called "recursive retrieval", because the old function was
@@ -244,6 +252,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                   false);
   blacklist_add (blacklist, start_url_parsed->url);
 
+  FILE *rejectedlog = 0; /* Don't write a rejected log. */
+  if (opt.rejected_log)
+    {
+      rejectedlog = fopen (opt.rejected_log, "w");
+      write_reject_log_header (rejectedlog);
+      if (!rejectedlog)
+        logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
+    }
+
   while (1)
     {
       bool descend = false;
@@ -266,9 +283,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
         break;
 
       /* ...and download it.  Note that this download is in most cases
-         unconditional, as download_child_p already makes sure a file
+         unconditional, as download_child already makes sure a file
          doesn't get enqueued twice -- and yet this check is here, and
-         not in download_child_p.  This is so that if you run `wget -r
+         not in download_child.  This is so that if you run `wget -r
          URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
@@ -337,13 +354,19 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
             want to follow it.  */
          if (descend)
            {
-             if (!descend_redirect_p (redirected, url_parsed, depth,
-                                      start_url_parsed, blacklist, i))
-               descend = false;
+             reject_reason r = descend_redirect (redirected, url_parsed,
+                                                 depth, start_url_parsed, blacklist, i);
+             if (r == SUCCESS)
+               {
+                 /* Make sure that the old pre-redirect form gets
+                    blacklisted. */
+                 blacklist_add (blacklist, url);
+               }
              else
-               /* Make sure that the old pre-redirect form gets
-                  blacklisted. */
-               blacklist_add (blacklist, url);
+               {
+                 write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed);
+                 descend = false;
+               }
            }
 
          xfree (url);
@@ -425,8 +448,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                 continue;
               if (dash_p_leaf_HTML && !child->link_inline_p)
                 continue;
-              if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                                    blacklist, i))
+              reject_reason r = download_child (child, url_parsed, depth,
+                                                start_url_parsed, blacklist, i);
+              if (r == SUCCESS)
                 {
                   ci = iri_new ();
                   set_uri_encoding (ci, i->content_encoding, false);
@@ -439,6 +463,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                      same URL twice.  */
                   blacklist_add (blacklist, child->url->url);
                 }
+              else
+                {
+                  write_reject_log_reason (rejectedlog, r, child->url, url_parsed);
+                }
             }
 
           if (strip_auth)
@@ -478,6 +506,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
       iri_free (i);
     }
 
+  if (rejectedlog)
+    fclose (rejectedlog);
+
   /* If anything is left of the queue due to a premature exit, free it
      now.  */
   {
@@ -513,14 +544,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
    by storing these URLs to BLACKLIST.  This may or may not help.  It
    will help if those URLs are encountered many times.  */
 
-static bool
-download_child_p (const struct urlpos *upos, struct url *parent, int depth,
+static reject_reason
+download_child (const struct urlpos *upos, struct url *parent, int depth,
                   struct url *start_url_parsed, struct hash_table *blacklist,
                   struct iri *iri)
 {
   struct url *u = upos->url;
   const char *url = u->url;
   bool u_scheme_like_http;
+  reject_reason reason = SUCCESS;
 
   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 
@@ -529,11 +561,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       if (opt.spider)
         {
           char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
-          DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
+          DEBUGP (("download_child: parent->url is: %s\n", quote (parent->url)));
           visited_url (url, referrer);
           xfree (referrer);
         }
       DEBUGP (("Already on the black list.\n"));
+      reason = BLACKLIST;
       goto out;
     }
 
@@ -563,6 +596,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (opt.https_only && u->scheme != SCHEME_HTTPS)
     {
       DEBUGP (("Not following non-HTTPS links.\n"));
+      reason = NOTHTTPS;
       goto out;
     }
 #endif
@@ -574,6 +608,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
     {
       DEBUGP (("Not following non-HTTP schemes.\n"));
+      reason = NONHTTP;
       goto out;
     }
 
@@ -583,6 +618,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (opt.relative_only && !upos->link_relative_p)
     {
       DEBUGP (("It doesn't really look like a relative link.\n"));
+      reason = ABSOLUTE;
       goto out;
     }
 
@@ -591,6 +627,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
   if (!accept_domain (u))
     {
       DEBUGP (("The domain was not accepted.\n"));
+      reason = DOMAIN;
       goto out;
     }
 
@@ -610,6 +647,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
         {
           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                    u->dir, start_url_parsed->dir));
+          reason = PARENT;
           goto out;
         }
     }
@@ -622,12 +660,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       if (!accdir (u->dir))
         {
           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
+          reason = LIST;
          goto out;
        }
     }
   if (!accept_url (url))
     {
       DEBUGP (("%s is excluded/not-included through regex.\n", url));
+      reason = REGEX;
       goto out;
     }
 
@@ -652,6 +692,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
         {
           DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                    url, u->file));
+          reason = RULES;
           goto out;
         }
     }
@@ -662,6 +703,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
     {
       DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                u->host, parent->host));
+      reason = SPANNEDHOST;
       goto out;
     }
 
@@ -704,35 +746,36 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
         {
           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
           blacklist_add (blacklist, url);
+          reason = ROBOTS;
           goto out;
         }
     }
 
-  /* The URL has passed all the tests.  It can be placed in the
-     download queue. */
-  DEBUGP (("Decided to load it.\n"));
+ out:
 
-  return true;
+  if (reason == SUCCESS)
+    /* The URL has passed all the tests.  It can be placed in the
+       download queue. */
+    DEBUGP (("Decided to load it.\n"));
+  else
+    DEBUGP (("Decided NOT to load it.\n"));
 
- out:
-  DEBUGP (("Decided NOT to load it.\n"));
+  return reason;
 
-  return false;
 }
 
 /* This function determines whether we will consider downloading the
    children of a URL whose download resulted in a redirection,
    possibly to another host, etc.  It is needed very rarely, and thus
-   it is merely a simple-minded wrapper around download_child_p. */
+   it is merely a simple-minded wrapper around download_child. */
 
-static bool
-descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
+static reject_reason
+descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
                     struct url *start_url_parsed, struct hash_table *blacklist,
                     struct iri *iri)
 {
   struct url *new_parsed;
   struct urlpos *upos;
-  bool success;
+  reject_reason reason;
 
   assert (orig_parsed != NULL);
 
@@ -742,10 +785,10 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
 
-  success = download_child_p (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist, iri);
+  reason = download_child (upos, orig_parsed, depth,
+                           start_url_parsed, blacklist, iri);
 
-  if (success)
+  if (reason == SUCCESS)
     blacklist_add (blacklist, upos->url->url);
   else
     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
@@ -753,7 +796,89 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
   url_free (new_parsed);
   xfree (upos);
 
-  return success;
+  return reason;
 }
 
+
+/* This function writes the rejected log header. */
+static void
+write_reject_log_header (FILE *f)
+{
+  if (!f)
+    return;
+
+  /* Note: Update this header when columns change in any way. */
+  fprintf (f, "REASON\t"
+    "U_URL\tU_SCHEME\tU_HOST\tU_PORT\tU_PATH\tU_PARAMS\tU_QUERY\tU_FRAGMENT\t"
+    "P_URL\tP_SCHEME\tP_HOST\tP_PORT\tP_PATH\tP_PARAMS\tP_QUERY\tP_FRAGMENT\n");
+}
+
+/* This function writes a URL to the reject log.  Internal use only. */
+static void
+write_reject_log_url (FILE *f, struct url *url)
+{
+  if (!f)
+    return;
+
+  char *escaped_str = url_escape (url->url);
+  char const *scheme_str = 0;
+  char empty_str[] = "";
+
+  switch (url->scheme)
+    {
+      case SCHEME_HTTP:    scheme_str = "SCHEME_HTTP";    break;
+#ifdef HAVE_SSL
+      case SCHEME_HTTPS:   scheme_str = "SCHEME_HTTPS";   break;
+#endif
+      case SCHEME_FTP:     scheme_str = "SCHEME_FTP";     break;
+      case SCHEME_INVALID: scheme_str = "SCHEME_INVALID"; break;
+    }
+
+  fprintf (f, "%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
+    escaped_str,
+    scheme_str,
+    url->host,
+    url->port,
+    url->path,
+    url->params ? url->params : empty_str,
+    url->query ? url->query : empty_str,
+    url->fragment ? url->fragment : empty_str);
+
+  free (escaped_str);
+}
+
+/* This function writes out information on why a URL was rejected and its
+   context from download_child, such as the URL being rejected and its
+   parent's URL.  The format it uses is comma separated values but with
+   tabs. */
+static void
+write_reject_log_reason (FILE *f, reject_reason r, struct url *url,
+                         struct url *parent)
+{
+  if (!f)
+    return;
+
+  char const *reason_str = 0;
+  switch (r)
+    {
+      case SUCCESS:     reason_str = "SUCCESS";     break;
+      case BLACKLIST:   reason_str = "BLACKLIST";   break;
+      case NOTHTTPS:    reason_str = "NOTHTTPS";    break;
+      case NONHTTP:     reason_str = "NONHTTP";     break;
+      case ABSOLUTE:    reason_str = "ABSOLUTE";    break;
+      case DOMAIN:      reason_str = "DOMAIN";      break;
+      case PARENT:      reason_str = "PARENT";      break;
+      case LIST:        reason_str = "LIST";        break;
+      case REGEX:       reason_str = "REGEX";       break;
+      case RULES:       reason_str = "RULES";       break;
+      case SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
+      case ROBOTS:      reason_str = "ROBOTS";      break;
+    }
+
+  fprintf (f, "%s\t", reason_str);
+  write_reject_log_url (f, url);
+  fprintf (f, "\t");
+  write_reject_log_url (f, parent);
+  fprintf (f, "\n");
+}
 
 /* vim:set sts=2 sw=2 cino+={s: */
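Since write_reject_log_url stores each URL with url_escape, reserved
characters in logged URLs appear as %XX escapes (the expected test output
below shows, e.g., http%3A//localhost%3A{{port}}/index.html). A consumer that
wants the original URL back has to percent-decode the U_URL and P_URL
columns. A minimal standalone sketch of that step, using a hypothetical
sample string (not part of this patch):

    #include <stdio.h>

    /* Return the value of a hex digit, or -1 if C is not one. */
    static int
    hexval (int c)
    {
      if (c >= '0' && c <= '9') return c - '0';
      if (c >= 'A' && c <= 'F') return c - 'A' + 10;
      if (c >= 'a' && c <= 'f') return c - 'a' + 10;
      return -1;
    }

    int
    main (void)
    {
      /* Hypothetical field taken from a rejected log. */
      const char *field = "http%3A//localhost%3A8080/index.html";
      int hi, lo;

      for (const char *p = field; *p; p++)
        if (*p == '%' && (hi = hexval (p[1])) >= 0 && (lo = hexval (p[2])) >= 0)
          {
            putchar (hi * 16 + lo);
            p += 2;  /* skip the two hex digits */
          }
        else
          putchar (*p);
      putchar ('\n');  /* prints: http://localhost:8080/index.html */
      return 0;
    }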
tests/Makefile.am
@@ -127,6 +127,7 @@ PX_TESTS = \
   Test--start-pos.px \
   Test--start-pos--continue.px \
   Test--httpsonly-r.px \
+  Test--rejected-log.px \
   Test-204.px
 
 EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \
tests/Test--rejected-log.px (new executable file)
@@ -0,0 +1,138 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+
+###############################################################################
+
+my $mainpage = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Recurse to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $secondpage = <<EOF;
+<html>
+<head>
+  <title>Second Page</title>
+</head>
+<body>
+  <p>
+    Recurse to a <a href="http://localhost:{{port}}/thirdpage.html">third page</a>.
+    Try the blacklisted <a href="http://localhost:{{port}}/index.html">main page</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $thirdpage = <<EOF;
+<html>
+<head>
+  <title>Third Page</title>
+</head>
+<body>
+  <p>
+    Try a hidden <a href="http://localhost:{{port}}/dummy.txt">dummy file</a>.
+    Try to leave to <a href="http://no.such.domain/">another domain</a>.
+  </p>
+</body>
+</html>
+EOF
+
+my $robots = <<EOF;
+User-agent: *
+Disallow: /dummy.txt
+EOF
+
+my $log = <<EOF;
+REASON U_URL U_SCHEME U_HOST U_PORT U_PATH U_PARAMS U_QUERY U_FRAGMENT P_URL P_SCHEME P_HOST P_PORT P_PATH P_PARAMS P_QUERY P_FRAGMENT
+BLACKLIST http%3A//localhost%3A{{port}}/index.html SCHEME_HTTP localhost {{port}} index.html http%3A//localhost%3A{{port}}/secondpage.html SCHEME_HTTP localhost {{port}} secondpage.html
+ROBOTS http%3A//localhost%3A{{port}}/dummy.txt SCHEME_HTTP localhost {{port}} dummy.txt http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
+SPANNEDHOST http%3A//no.such.domain/ SCHEME_HTTP no.such.domain 80 http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $mainpage,
+    },
+    '/secondpage.html' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $secondpage,
+    },
+    '/thirdpage.html' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $thirdpage,
+    },
+    '/dummy.txt' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => $robots
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " -nd -r --rejected-log log.csv http://localhost:{{port}}/index.html";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "index.html" => {
+        content => $mainpage,
+    },
+    "secondpage.html" => {
+        content => $secondpage,
+    },
+    "thirdpage.html" => {
+        content => $thirdpage,
+    },
+    "robots.txt" => {
+        content => $robots,
+    },
+    "log.csv" => {
+        content => $log,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (input => \%urls,
+                              cmdline => $cmdline,
+                              errcode => $expected_error_code,
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4