mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Add option to write URL rejections to a tab-delimited CSV log.
* main.c: Add "--rejected-log" option. * init.c: Add "rejectedlog" command. * options.h: Add "rejected_log" parameter string. * wget.texi: Add brief documentation on new --rejected-log option. * recur.c: Optionally log details of URLs not traversed. Add reject_reason enum. (download_child_p -> download_child): Return a reject_reason. (descend_redirect_p -> descend_redirect): Return a reject_reason. (retrieve_tree): Support logging reasons for rejection. Add write_reject_log_header that writes a CSV format header to a file. Add write_reject_log_url that writes a url struct to a file in CSV format. Add write_reject_log_reason that writes the URL and parent URL as well as the rejection reason to a CSV file. * Test--rejected-log.px: Add a basic test for the --rejected-log command. * tests/Makefile.am: Run Test--rejected-log.px. This allows you to figure out why URLs are being rejected and some context around it. CSV is used as the output format since it can be used easily parsed, it's delimited by tabs instead of commas to allow using all (quoted) URL characters and includes column names which may be used for compatibility.
This commit is contained in:
parent
670eb924e7
commit
e4db00d74d
@ -551,6 +551,11 @@ would be resolved to @samp{http://foo/baz/b.html}.
|
||||
@cindex specify config
|
||||
@item --config=@var{FILE}
|
||||
Specify the location of a startup file you wish to use.
|
||||
|
||||
@item --rejected-log=@var{logfile}
|
||||
Logs all URL rejections to @var{logfile} as comma separated values. The values
|
||||
include the reason of rejection, the URL and the parent URL it was found in.
|
||||
|
||||
@end table
|
||||
|
||||
@node Download Options, Directory Options, Logging and Input File Options, Invoking
|
||||
|
@ -274,6 +274,7 @@ static const struct {
|
||||
{ "referer", &opt.referer, cmd_string },
|
||||
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
|
||||
{ "reject", &opt.rejects, cmd_vector },
|
||||
{ "rejectedlog", &opt.rejected_log, cmd_file },
|
||||
{ "rejectregex", &opt.rejectregex_s, cmd_string },
|
||||
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
||||
{ "remoteencoding", &opt.encoding_remote, cmd_string },
|
||||
@ -1856,6 +1857,7 @@ cleanup (void)
|
||||
xfree (opt.post_data);
|
||||
xfree (opt.body_data);
|
||||
xfree (opt.body_file);
|
||||
xfree (opt.rejected_log);
|
||||
|
||||
#endif /* DEBUG_MALLOC */
|
||||
}
|
||||
|
@ -321,6 +321,7 @@ static struct cmdline_option option_data[] =
|
||||
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
|
||||
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
|
||||
{ "local-encoding", 0, OPT_VALUE, "localencoding", -1 },
|
||||
{ "rejected-log", 0, OPT_VALUE, "rejectedlog", -1 },
|
||||
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
|
||||
#ifdef HAVE_METALINK
|
||||
{ "metalink-over-http", 0, OPT_BOOLEAN, "metalink-over-http", -1 },
|
||||
@ -576,6 +577,8 @@ Logging and input file:\n"),
|
||||
--config=FILE specify config file to use\n"),
|
||||
N_("\
|
||||
--no-config do not read any config file\n"),
|
||||
N_("\
|
||||
--rejected-log=FILE log reasons for URL rejection to FILE\n"),
|
||||
"\n",
|
||||
|
||||
N_("\
|
||||
|
@ -296,6 +296,8 @@ struct options
|
||||
name. */
|
||||
bool report_bps; /*Output bandwidth in bits format*/
|
||||
|
||||
char *rejected_log; /* The file to log rejected URLS to. */
|
||||
|
||||
#ifdef HAVE_HSTS
|
||||
bool hsts;
|
||||
char *hsts_file;
|
||||
|
179
src/recur.c
179
src/recur.c
@ -182,11 +182,19 @@ static int blacklist_contains (struct hash_table *blacklist, const char *url)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool download_child_p (const struct urlpos *, struct url *, int,
|
||||
struct url *, struct hash_table *, struct iri *);
|
||||
static bool descend_redirect_p (const char *, struct url *, int,
|
||||
struct url *, struct hash_table *, struct iri *);
|
||||
typedef enum
|
||||
{
|
||||
SUCCESS, BLACKLIST, NOTHTTPS, NONHTTP, ABSOLUTE, DOMAIN, PARENT, LIST, REGEX,
|
||||
RULES, SPANNEDHOST, ROBOTS
|
||||
} reject_reason;
|
||||
|
||||
static reject_reason download_child (const struct urlpos *, struct url *, int,
|
||||
struct url *, struct hash_table *, struct iri *);
|
||||
static reject_reason descend_redirect (const char *, struct url *, int,
|
||||
struct url *, struct hash_table *, struct iri *);
|
||||
static void write_reject_log_header (FILE *);
|
||||
static void write_reject_log_reason (FILE *, reject_reason, struct url *,
|
||||
struct url *);
|
||||
|
||||
/* Retrieve a part of the web beginning with START_URL. This used to
|
||||
be called "recursive retrieval", because the old function was
|
||||
@ -244,6 +252,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
false);
|
||||
blacklist_add (blacklist, start_url_parsed->url);
|
||||
|
||||
FILE *rejectedlog = 0; /* Don't write a rejected log. */
|
||||
if (opt.rejected_log)
|
||||
{
|
||||
rejectedlog = fopen (opt.rejected_log, "w");
|
||||
write_reject_log_header (rejectedlog);
|
||||
if (!rejectedlog)
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
|
||||
}
|
||||
|
||||
while (1)
|
||||
{
|
||||
bool descend = false;
|
||||
@ -266,9 +283,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
break;
|
||||
|
||||
/* ...and download it. Note that this download is in most cases
|
||||
unconditional, as download_child_p already makes sure a file
|
||||
unconditional, as download_child already makes sure a file
|
||||
doesn't get enqueued twice -- and yet this check is here, and
|
||||
not in download_child_p. This is so that if you run `wget -r
|
||||
not in download_child. This is so that if you run `wget -r
|
||||
URL1 URL2', and a random URL is encountered once under URL1
|
||||
and again under URL2, but at a different (possibly smaller)
|
||||
depth, we want the URL's children to be taken into account
|
||||
@ -337,14 +354,20 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
want to follow it. */
|
||||
if (descend)
|
||||
{
|
||||
if (!descend_redirect_p (redirected, url_parsed, depth,
|
||||
start_url_parsed, blacklist, i))
|
||||
descend = false;
|
||||
else
|
||||
reject_reason r = descend_redirect (redirected, url_parsed,
|
||||
depth, start_url_parsed, blacklist, i);
|
||||
if (r == SUCCESS)
|
||||
{
|
||||
/* Make sure that the old pre-redirect form gets
|
||||
blacklisted. */
|
||||
blacklist_add (blacklist, url);
|
||||
}
|
||||
else
|
||||
{
|
||||
write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed);
|
||||
descend = false;
|
||||
}
|
||||
}
|
||||
|
||||
xfree (url);
|
||||
url = redirected;
|
||||
@ -425,8 +448,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
continue;
|
||||
if (dash_p_leaf_HTML && !child->link_inline_p)
|
||||
continue;
|
||||
if (download_child_p (child, url_parsed, depth, start_url_parsed,
|
||||
blacklist, i))
|
||||
reject_reason r = download_child (child, url_parsed, depth,
|
||||
start_url_parsed, blacklist, i);
|
||||
if (r == SUCCESS)
|
||||
{
|
||||
ci = iri_new ();
|
||||
set_uri_encoding (ci, i->content_encoding, false);
|
||||
@ -439,6 +463,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
same URL twice. */
|
||||
blacklist_add (blacklist, child->url->url);
|
||||
}
|
||||
else
|
||||
{
|
||||
write_reject_log_reason (rejectedlog, r, child->url, url_parsed);
|
||||
}
|
||||
}
|
||||
|
||||
if (strip_auth)
|
||||
@ -478,6 +506,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
iri_free (i);
|
||||
}
|
||||
|
||||
if (rejectedlog)
|
||||
fclose (rejectedlog);
|
||||
|
||||
/* If anything is left of the queue due to a premature exit, free it
|
||||
now. */
|
||||
{
|
||||
@ -513,14 +544,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
|
||||
by storing these URLs to BLACKLIST. This may or may not help. It
|
||||
will help if those URLs are encountered many times. */
|
||||
|
||||
static bool
|
||||
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
static reject_reason
|
||||
download_child (const struct urlpos *upos, struct url *parent, int depth,
|
||||
struct url *start_url_parsed, struct hash_table *blacklist,
|
||||
struct iri *iri)
|
||||
{
|
||||
struct url *u = upos->url;
|
||||
const char *url = u->url;
|
||||
bool u_scheme_like_http;
|
||||
reject_reason reason = SUCCESS;
|
||||
|
||||
DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
|
||||
|
||||
@ -529,11 +561,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (opt.spider)
|
||||
{
|
||||
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
|
||||
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
|
||||
DEBUGP (("download_child: parent->url is: %s\n", quote (parent->url)));
|
||||
visited_url (url, referrer);
|
||||
xfree (referrer);
|
||||
}
|
||||
DEBUGP (("Already on the black list.\n"));
|
||||
reason = BLACKLIST;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -563,6 +596,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (opt.https_only && u->scheme != SCHEME_HTTPS)
|
||||
{
|
||||
DEBUGP (("Not following non-HTTPS links.\n"));
|
||||
reason = NOTHTTPS;
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
@ -574,6 +608,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
|
||||
{
|
||||
DEBUGP (("Not following non-HTTP schemes.\n"));
|
||||
reason = NONHTTP;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -583,6 +618,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (opt.relative_only && !upos->link_relative_p)
|
||||
{
|
||||
DEBUGP (("It doesn't really look like a relative link.\n"));
|
||||
reason = ABSOLUTE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -591,6 +627,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (!accept_domain (u))
|
||||
{
|
||||
DEBUGP (("The domain was not accepted.\n"));
|
||||
reason = DOMAIN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -610,6 +647,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
{
|
||||
DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
|
||||
u->dir, start_url_parsed->dir));
|
||||
reason = PARENT;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -622,12 +660,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (!accdir (u->dir))
|
||||
{
|
||||
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
|
||||
reason = LIST;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (!accept_url (url))
|
||||
{
|
||||
DEBUGP (("%s is excluded/not-included through regex.\n", url));
|
||||
reason = REGEX;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -652,6 +692,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
{
|
||||
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
|
||||
url, u->file));
|
||||
reason = RULES;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -662,6 +703,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
{
|
||||
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
|
||||
u->host, parent->host));
|
||||
reason = SPANNEDHOST;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -704,35 +746,36 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
{
|
||||
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
|
||||
blacklist_add (blacklist, url);
|
||||
reason = ROBOTS;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
|
||||
if (reason == SUCCESS)
|
||||
/* The URL has passed all the tests. It can be placed in the
|
||||
download queue. */
|
||||
DEBUGP (("Decided to load it.\n"));
|
||||
|
||||
return true;
|
||||
|
||||
out:
|
||||
else
|
||||
DEBUGP (("Decided NOT to load it.\n"));
|
||||
|
||||
return false;
|
||||
return reason;
|
||||
}
|
||||
|
||||
/* This function determines whether we will consider downloading the
|
||||
children of a URL whose download resulted in a redirection,
|
||||
possibly to another host, etc. It is needed very rarely, and thus
|
||||
it is merely a simple-minded wrapper around download_child_p. */
|
||||
it is merely a simple-minded wrapper around download_child. */
|
||||
|
||||
static bool
|
||||
descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
|
||||
static reject_reason
|
||||
descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
|
||||
struct url *start_url_parsed, struct hash_table *blacklist,
|
||||
struct iri *iri)
|
||||
{
|
||||
struct url *new_parsed;
|
||||
struct urlpos *upos;
|
||||
bool success;
|
||||
reject_reason reason;
|
||||
|
||||
assert (orig_parsed != NULL);
|
||||
|
||||
@ -742,10 +785,10 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
|
||||
upos = xnew0 (struct urlpos);
|
||||
upos->url = new_parsed;
|
||||
|
||||
success = download_child_p (upos, orig_parsed, depth,
|
||||
reason = download_child (upos, orig_parsed, depth,
|
||||
start_url_parsed, blacklist, iri);
|
||||
|
||||
if (success)
|
||||
if (reason == SUCCESS)
|
||||
blacklist_add (blacklist, upos->url->url);
|
||||
else
|
||||
DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
|
||||
@ -753,7 +796,89 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
|
||||
url_free (new_parsed);
|
||||
xfree (upos);
|
||||
|
||||
return success;
|
||||
return reason;
|
||||
}
|
||||
|
||||
|
||||
/* This function writes the rejected log header. */
|
||||
static void
|
||||
write_reject_log_header (FILE *f)
|
||||
{
|
||||
if (!f)
|
||||
return;
|
||||
|
||||
/* Note: Update this header when columns change in any way. */
|
||||
fprintf (f, "REASON\t"
|
||||
"U_URL\tU_SCHEME\tU_HOST\tU_PORT\tU_PATH\tU_PARAMS\tU_QUERY\tU_FRAGMENT\t"
|
||||
"P_URL\tP_SCHEME\tP_HOST\tP_PORT\tP_PATH\tP_PARAMS\tP_QUERY\tP_FRAGMENT\n");
|
||||
}
|
||||
|
||||
/* This function writes a URL to the reject log. Internal use only. */
|
||||
static void
|
||||
write_reject_log_url (FILE *f, struct url *url)
|
||||
{
|
||||
if (!f)
|
||||
return;
|
||||
|
||||
char *escaped_str = url_escape (url->url);
|
||||
char const *scheme_str = 0;
|
||||
char empty_str[] = "";
|
||||
|
||||
switch (url->scheme)
|
||||
{
|
||||
case SCHEME_HTTP: scheme_str = "SCHEME_HTTP"; break;
|
||||
#ifdef HAVE_SSL
|
||||
case SCHEME_HTTPS: scheme_str = "SCHEME_HTTPS"; break;
|
||||
#endif
|
||||
case SCHEME_FTP: scheme_str = "SCHEME_FTP"; break;
|
||||
case SCHEME_INVALID: scheme_str = "SCHEME_INVALID"; break;
|
||||
}
|
||||
|
||||
fprintf (f, "%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
|
||||
escaped_str,
|
||||
scheme_str,
|
||||
url->host,
|
||||
url->port,
|
||||
url->path,
|
||||
url->params ? url->params : empty_str,
|
||||
url->query ? url->query : empty_str,
|
||||
url->fragment ? url->fragment : empty_str);
|
||||
|
||||
free (escaped_str);
|
||||
}
|
||||
|
||||
/* This function writes out information on why a URL was rejected and its
|
||||
context from download_child such as the URL being rejected and it's
|
||||
parent's URL. The format it uses is comma separated values but with tabs. */
|
||||
static void
|
||||
write_reject_log_reason (FILE *f, reject_reason r, struct url *url,
|
||||
struct url *parent)
|
||||
{
|
||||
if (!f)
|
||||
return;
|
||||
|
||||
char const *reason_str = 0;
|
||||
switch (r)
|
||||
{
|
||||
case SUCCESS: reason_str = "SUCCESS"; break;
|
||||
case BLACKLIST: reason_str = "BLACKLIST"; break;
|
||||
case NOTHTTPS: reason_str = "NOTHTTPS"; break;
|
||||
case NONHTTP: reason_str = "NONHTTP"; break;
|
||||
case ABSOLUTE: reason_str = "ABSOLUTE"; break;
|
||||
case DOMAIN: reason_str = "DOMAIN"; break;
|
||||
case PARENT: reason_str = "PARENT"; break;
|
||||
case LIST: reason_str = "LIST"; break;
|
||||
case REGEX: reason_str = "REGEX"; break;
|
||||
case RULES: reason_str = "RULES"; break;
|
||||
case SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
|
||||
case ROBOTS: reason_str = "ROBOTS"; break;
|
||||
}
|
||||
|
||||
fprintf (f, "%s\t", reason_str);
|
||||
write_reject_log_url (f, url);
|
||||
fprintf (f, "\t");
|
||||
write_reject_log_url (f, parent);
|
||||
fprintf (f, "\n");
|
||||
}
|
||||
|
||||
/* vim:set sts=2 sw=2 cino+={s: */
|
||||
|
@ -127,6 +127,7 @@ PX_TESTS = \
|
||||
Test--start-pos.px \
|
||||
Test--start-pos--continue.px \
|
||||
Test--httpsonly-r.px \
|
||||
Test--rejected-log.px \
|
||||
Test-204.px
|
||||
|
||||
EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \
|
||||
|
138
tests/Test--rejected-log.px
Executable file
138
tests/Test--rejected-log.px
Executable file
@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use HTTPTest;
|
||||
|
||||
|
||||
###############################################################################
|
||||
|
||||
my $mainpage = <<EOF;
|
||||
<html>
|
||||
<head>
|
||||
<title>Main Page</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
Recurse to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
|
||||
my $secondpage = <<EOF;
|
||||
<html>
|
||||
<head>
|
||||
<title>Second Page</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
Recurse to a <a href="http://localhost:{{port}}/thirdpage.html">third page</a>.
|
||||
Try the blacklisted <a href="http://localhost:{{port}}/index.html">main page</a>.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
|
||||
my $thirdpage = <<EOF;
|
||||
<html>
|
||||
<head>
|
||||
<title>Third Page</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
Try a hidden <a href="http://localhost:{{port}}/dummy.txt">dummy file</a>.
|
||||
Try to leave to <a href="http://no.such.domain/">another domain</a>.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
|
||||
my $robots = <<EOF;
|
||||
User-agent: *
|
||||
Disallow: /dummy.txt
|
||||
EOF
|
||||
|
||||
my $log = <<EOF;
|
||||
REASON U_URL U_SCHEME U_HOST U_PORT U_PATH U_PARAMS U_QUERY U_FRAGMENT P_URL P_SCHEME P_HOST P_PORT P_PATH P_PARAMS P_QUERY P_FRAGMENT
|
||||
BLACKLIST http%3A//localhost%3A{{port}}/index.html SCHEME_HTTP localhost {{port}} index.html http%3A//localhost%3A{{port}}/secondpage.html SCHEME_HTTP localhost {{port}} secondpage.html
|
||||
ROBOTS http%3A//localhost%3A{{port}}/dummy.txt SCHEME_HTTP localhost {{port}} dummy.txt http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
|
||||
SPANNEDHOST http%3A//no.such.domain/ SCHEME_HTTP no.such.domain 80 http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
|
||||
EOF
|
||||
|
||||
# code, msg, headers, content
|
||||
my %urls = (
|
||||
'/index.html' => {
|
||||
code => "200",
|
||||
msg => "Dontcare",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $mainpage,
|
||||
},
|
||||
'/secondpage.html' => {
|
||||
code => "200",
|
||||
msg => "Dontcare",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $secondpage,
|
||||
},
|
||||
'/thirdpage.html' => {
|
||||
code => "200",
|
||||
msg => "Dontcare",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $thirdpage,
|
||||
},
|
||||
'/dummy.txt' => {
|
||||
code => "200",
|
||||
msg => "Dontcare",
|
||||
headers => {
|
||||
"Content-type" => "text/plain",
|
||||
},
|
||||
content => "",
|
||||
},
|
||||
'/robots.txt' => {
|
||||
code => "200",
|
||||
msg => "Dontcare",
|
||||
headers => {
|
||||
"Content-type" => "text/plain",
|
||||
},
|
||||
content => $robots
|
||||
},
|
||||
);
|
||||
|
||||
my $cmdline = $WgetTest::WGETPATH . " -nd -r --rejected-log log.csv http://localhost:{{port}}/index.html";
|
||||
|
||||
my $expected_error_code = 0;
|
||||
|
||||
my %expected_downloaded_files = (
|
||||
"index.html" => {
|
||||
content => $mainpage,
|
||||
},
|
||||
"secondpage.html" => {
|
||||
content => $secondpage,
|
||||
},
|
||||
"thirdpage.html" => {
|
||||
content => $thirdpage,
|
||||
},
|
||||
"robots.txt" => {
|
||||
content => $robots,
|
||||
},
|
||||
"log.csv" => {
|
||||
content => $log,
|
||||
},
|
||||
);
|
||||
|
||||
###############################################################################
|
||||
|
||||
my $the_test = HTTPTest->new (input => \%urls,
|
||||
cmdline => $cmdline,
|
||||
errcode => $expected_error_code,
|
||||
output => \%expected_downloaded_files);
|
||||
exit $the_test->run();
|
||||
|
||||
# vim: et ts=4 sw=4
|
Loading…
Reference in New Issue
Block a user