
Add option to write URL rejections to a tab-delimited CSV log.

 * main.c: Add "--rejected-log" option.
 * init.c: Add "rejectedlog" command.
 * options.h: Add "rejected_log" parameter string.
 * wget.texi: Add brief documentation on new --rejected-log option.
 * recur.c: Optionally log details of URLs not traversed.
   Add reject_reason enum.
   (download_child_p -> download_child): Return a reject_reason.
   (descend_redirect_p -> descend_redirect): Return a reject_reason.
   (retrieve_tree): Support logging reasons for rejection.
   Add write_reject_log_header that writes a CSV format header to a file.
   Add write_reject_log_url that writes a url struct to a file in CSV format.
   Add write_reject_log_reason that writes the URL and parent URL as well as the
   rejection reason to a CSV file.
 * tests/Test--rejected-log.px: Add a basic test for the --rejected-log option.
 * tests/Makefile.am: Run Test--rejected-log.px.

This allows you to figure out why URLs are being rejected and gives some
context around each rejection. CSV is used as the output format since it can
be easily parsed; it is delimited by tabs instead of commas so that all
(quoted) URL characters can be used, and it includes a header row of column
names that consumers can check for compatibility.
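
For illustration, a recursive crawl along the lines of

    wget -r --rejected-log=rejected.csv http://example.com/

would leave a tab-delimited log with one row per rejected URL, roughly like
this (hypothetical URLs, columns abbreviated; the full set of eight U_* and
eight P_* columns is the one written by write_reject_log_header below):

    REASON  U_URL                              U_SCHEME     ...  P_URL                            P_SCHEME     ...
    ROBOTS  http%3A//example.com/private.html  SCHEME_HTTP  ...  http%3A//example.com/index.html  SCHEME_HTTP  ...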
Jookia 2015-07-31 23:41:36 +10:00 committed by Giuseppe Scrivano
parent 670eb924e7
commit e4db00d74d
7 changed files with 308 additions and 32 deletions

doc/wget.texi

@@ -551,6 +551,11 @@ would be resolved to @samp{http://foo/baz/b.html}.
@cindex specify config
@item --config=@var{FILE}
Specify the location of a startup file you wish to use.
@item --rejected-log=@var{logfile}
Logs all URL rejections to @var{logfile} as comma separated values. The values
include the reason for rejection, the URL, and the parent URL it was found in.
@end table
@node Download Options, Directory Options, Logging and Input File Options, Invoking

src/init.c

@@ -274,6 +274,7 @@ static const struct {
{ "referer", &opt.referer, cmd_string },
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
{ "reject", &opt.rejects, cmd_vector },
{ "rejectedlog", &opt.rejected_log, cmd_file },
{ "rejectregex", &opt.rejectregex_s, cmd_string },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
@@ -1856,6 +1857,7 @@ cleanup (void)
xfree (opt.post_data);
xfree (opt.body_data);
xfree (opt.body_file);
xfree (opt.rejected_log);
#endif /* DEBUG_MALLOC */
}
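
Because "rejectedlog" is registered as an ordinary file-valued command
(cmd_file), the option should also be settable from a startup file. A minimal
sketch, assuming standard wgetrc syntax (command-name lookup in wgetrc
ignores dashes and underscores):

    # hypothetical ~/.wgetrc entry
    rejected_log = /tmp/rejected.csv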

src/main.c

@@ -321,6 +321,7 @@ static struct cmdline_option option_data[] =
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
{ "local-encoding", 0, OPT_VALUE, "localencoding", -1 },
{ "rejected-log", 0, OPT_VALUE, "rejectedlog", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
#ifdef HAVE_METALINK
{ "metalink-over-http", 0, OPT_BOOLEAN, "metalink-over-http", -1 },
@@ -576,6 +577,8 @@ Logging and input file:\n"),
--config=FILE specify config file to use\n"),
N_("\
--no-config do not read any config file\n"),
N_("\
--rejected-log=FILE log reasons for URL rejection to FILE\n"),
"\n",
N_("\

src/options.h

@@ -296,6 +296,8 @@ struct options
name. */
bool report_bps; /*Output bandwidth in bits format*/
char *rejected_log; /* The file to log rejected URLS to. */
#ifdef HAVE_HSTS
bool hsts;
char *hsts_file;

src/recur.c

@@ -182,11 +182,19 @@ static int blacklist_contains (struct hash_table *blacklist, const char *url)
return ret;
}
-static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *, struct iri *);
-static bool descend_redirect_p (const char *, struct url *, int,
-                                struct url *, struct hash_table *, struct iri *);
+typedef enum
+{
+  SUCCESS, BLACKLIST, NOTHTTPS, NONHTTP, ABSOLUTE, DOMAIN, PARENT, LIST, REGEX,
+  RULES, SPANNEDHOST, ROBOTS
+} reject_reason;
+
+static reject_reason download_child (const struct urlpos *, struct url *, int,
+                                     struct url *, struct hash_table *, struct iri *);
+static reject_reason descend_redirect (const char *, struct url *, int,
+                                       struct url *, struct hash_table *, struct iri *);
+static void write_reject_log_header (FILE *);
+static void write_reject_log_reason (FILE *, reject_reason, struct url *,
+                                     struct url *);
/* Retrieve a part of the web beginning with START_URL. This used to
be called "recursive retrieval", because the old function was
@@ -244,6 +252,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
false);
blacklist_add (blacklist, start_url_parsed->url);
FILE *rejectedlog = 0; /* Don't write a rejected log. */
if (opt.rejected_log)
{
rejectedlog = fopen (opt.rejected_log, "w");
write_reject_log_header (rejectedlog);
if (!rejectedlog)
logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
}
while (1)
{
bool descend = false;
@@ -266,9 +283,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
break;
/* ...and download it. Note that this download is in most cases
-unconditional, as download_child_p already makes sure a file
+unconditional, as download_child already makes sure a file
doesn't get enqueued twice -- and yet this check is here, and
-not in download_child_p. This is so that if you run `wget -r
+not in download_child. This is so that if you run `wget -r
URL1 URL2', and a random URL is encountered once under URL1
and again under URL2, but at a different (possibly smaller)
depth, we want the URL's children to be taken into account
@@ -337,13 +354,19 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
want to follow it. */
if (descend)
{
-if (!descend_redirect_p (redirected, url_parsed, depth,
-                         start_url_parsed, blacklist, i))
-  descend = false;
-
-/* Make sure that the old pre-redirect form gets
-   blacklisted. */
-blacklist_add (blacklist, url);
+reject_reason r = descend_redirect (redirected, url_parsed,
+                    depth, start_url_parsed, blacklist, i);
+if (r == SUCCESS)
+  {
+    /* Make sure that the old pre-redirect form gets
+       blacklisted. */
+    blacklist_add (blacklist, url);
+  }
+else
+  {
+    write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed);
+    descend = false;
+  }
}
xfree (url);
@@ -425,8 +448,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
continue;
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
-if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                      blacklist, i))
+reject_reason r = download_child (child, url_parsed, depth,
+                    start_url_parsed, blacklist, i);
+if (r == SUCCESS)
{
ci = iri_new ();
set_uri_encoding (ci, i->content_encoding, false);
@@ -439,6 +463,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
same URL twice. */
blacklist_add (blacklist, child->url->url);
}
else
{
write_reject_log_reason (rejectedlog, r, child->url, url_parsed);
}
}
if (strip_auth)
@@ -478,6 +506,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
iri_free (i);
}
if (rejectedlog)
fclose (rejectedlog);
/* If anything is left of the queue due to a premature exit, free it
now. */
{
@@ -513,14 +544,15 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
by storing these URLs to BLACKLIST. This may or may not help. It
will help if those URLs are encountered many times. */
-static bool
-download_child_p (const struct urlpos *upos, struct url *parent, int depth,
+static reject_reason
+download_child (const struct urlpos *upos, struct url *parent, int depth,
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
bool u_scheme_like_http;
reject_reason reason = SUCCESS;
DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
@@ -529,11 +561,12 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
DEBUGP (("download_child: parent->url is: %s\n", quote (parent->url)));
visited_url (url, referrer);
xfree (referrer);
}
DEBUGP (("Already on the black list.\n"));
reason = BLACKLIST;
goto out;
}
@@ -563,6 +596,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.https_only && u->scheme != SCHEME_HTTPS)
{
DEBUGP (("Not following non-HTTPS links.\n"));
reason = NOTHTTPS;
goto out;
}
#endif
@@ -574,6 +608,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{
DEBUGP (("Not following non-HTTP schemes.\n"));
reason = NONHTTP;
goto out;
}
@@ -583,6 +618,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (opt.relative_only && !upos->link_relative_p)
{
DEBUGP (("It doesn't really look like a relative link.\n"));
reason = ABSOLUTE;
goto out;
}
@@ -591,6 +627,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accept_domain (u))
{
DEBUGP (("The domain was not accepted.\n"));
reason = DOMAIN;
goto out;
}
@@ -610,6 +647,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
u->dir, start_url_parsed->dir));
reason = PARENT;
goto out;
}
}
@@ -622,12 +660,14 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
if (!accdir (u->dir))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
reason = LIST;
goto out;
}
}
if (!accept_url (url))
{
DEBUGP (("%s is excluded/not-included through regex.\n", url));
reason = REGEX;
goto out;
}
@@ -652,6 +692,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file));
reason = RULES;
goto out;
}
}
@@ -662,6 +703,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
u->host, parent->host));
reason = SPANNEDHOST;
goto out;
}
@@ -704,35 +746,36 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
{
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
blacklist_add (blacklist, url);
reason = ROBOTS;
goto out;
}
}
-/* The URL has passed all the tests. It can be placed in the
-   download queue. */
-DEBUGP (("Decided to load it.\n"));
-return true;
-
- out:
-DEBUGP (("Decided NOT to load it.\n"));
-return false;
+ out:
+if (reason == SUCCESS)
+  /* The URL has passed all the tests. It can be placed in the
+     download queue. */
+  DEBUGP (("Decided to load it.\n"));
+else
+  DEBUGP (("Decided NOT to load it.\n"));
+
+return reason;
}
/* This function determines whether we will consider downloading the
children of a URL whose download resulted in a redirection,
possibly to another host, etc. It is needed very rarely, and thus
-it is merely a simple-minded wrapper around download_child_p. */
+it is merely a simple-minded wrapper around download_child. */

-static bool
-descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
+static reject_reason
+descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
struct url *start_url_parsed, struct hash_table *blacklist,
struct iri *iri)
{
struct url *new_parsed;
struct urlpos *upos;
-bool success;
+reject_reason reason;
assert (orig_parsed != NULL);
@@ -742,10 +785,10 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
-success = download_child_p (upos, orig_parsed, depth,
+reason = download_child (upos, orig_parsed, depth,
start_url_parsed, blacklist, iri);

-if (success)
+if (reason == SUCCESS)
blacklist_add (blacklist, upos->url->url);
else
DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
@@ -753,7 +796,89 @@ descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
url_free (new_parsed);
xfree (upos);
-return success;
+return reason;
}
/* This function writes the rejected log header. */
static void
write_reject_log_header (FILE *f)
{
if (!f)
return;
/* Note: Update this header when columns change in any way. */
fprintf (f, "REASON\t"
"U_URL\tU_SCHEME\tU_HOST\tU_PORT\tU_PATH\tU_PARAMS\tU_QUERY\tU_FRAGMENT\t"
"P_URL\tP_SCHEME\tP_HOST\tP_PORT\tP_PATH\tP_PARAMS\tP_QUERY\tP_FRAGMENT\n");
}
/* This function writes a URL to the reject log. Internal use only. */
static void
write_reject_log_url (FILE *f, struct url *url)
{
if (!f)
return;
char *escaped_str = url_escape (url->url);
char const *scheme_str = 0;
char empty_str[] = "";
switch (url->scheme)
{
case SCHEME_HTTP: scheme_str = "SCHEME_HTTP"; break;
#ifdef HAVE_SSL
case SCHEME_HTTPS: scheme_str = "SCHEME_HTTPS"; break;
#endif
case SCHEME_FTP: scheme_str = "SCHEME_FTP"; break;
case SCHEME_INVALID: scheme_str = "SCHEME_INVALID"; break;
}
fprintf (f, "%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
escaped_str,
scheme_str,
url->host,
url->port,
url->path,
url->params ? url->params : empty_str,
url->query ? url->query : empty_str,
url->fragment ? url->fragment : empty_str);
free (escaped_str);
}
/* This function writes out information on why a URL was rejected and its
context from download_child such as the URL being rejected and its
parent's URL. The format it uses is comma separated values but with tabs. */
static void
write_reject_log_reason (FILE *f, reject_reason r, struct url *url,
struct url *parent)
{
if (!f)
return;
char const *reason_str = 0;
switch (r)
{
case SUCCESS: reason_str = "SUCCESS"; break;
case BLACKLIST: reason_str = "BLACKLIST"; break;
case NOTHTTPS: reason_str = "NOTHTTPS"; break;
case NONHTTP: reason_str = "NONHTTP"; break;
case ABSOLUTE: reason_str = "ABSOLUTE"; break;
case DOMAIN: reason_str = "DOMAIN"; break;
case PARENT: reason_str = "PARENT"; break;
case LIST: reason_str = "LIST"; break;
case REGEX: reason_str = "REGEX"; break;
case RULES: reason_str = "RULES"; break;
case SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
case ROBOTS: reason_str = "ROBOTS"; break;
}
fprintf (f, "%s\t", reason_str);
write_reject_log_url (f, url);
fprintf (f, "\t");
write_reject_log_url (f, parent);
fprintf (f, "\n");
}
/* vim:set sts=2 sw=2 cino+={s: */

tests/Makefile.am

@@ -127,6 +127,7 @@ PX_TESTS = \
Test--start-pos.px \
Test--start-pos--continue.px \
Test--httpsonly-r.px \
Test--rejected-log.px \
Test-204.px
EXTRA_DIST = FTPServer.pm FTPTest.pm HTTPServer.pm HTTPTest.pm \

tests/Test--rejected-log.px (new executable file)

@@ -0,0 +1,138 @@
#!/usr/bin/env perl
use strict;
use warnings;
use HTTPTest;
###############################################################################
my $mainpage = <<EOF;
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Recurse to a <a href="http://localhost:{{port}}/secondpage.html">second page</a>.
</p>
</body>
</html>
EOF
my $secondpage = <<EOF;
<html>
<head>
<title>Second Page</title>
</head>
<body>
<p>
Recurse to a <a href="http://localhost:{{port}}/thirdpage.html">third page</a>.
Try the blacklisted <a href="http://localhost:{{port}}/index.html">main page</a>.
</p>
</body>
</html>
EOF
my $thirdpage = <<EOF;
<html>
<head>
<title>Third Page</title>
</head>
<body>
<p>
Try a hidden <a href="http://localhost:{{port}}/dummy.txt">dummy file</a>.
Try to leave to <a href="http://no.such.domain/">another domain</a>.
</p>
</body>
</html>
EOF
my $robots = <<EOF;
User-agent: *
Disallow: /dummy.txt
EOF
my $log = <<EOF;
REASON U_URL U_SCHEME U_HOST U_PORT U_PATH U_PARAMS U_QUERY U_FRAGMENT P_URL P_SCHEME P_HOST P_PORT P_PATH P_PARAMS P_QUERY P_FRAGMENT
BLACKLIST http%3A//localhost%3A{{port}}/index.html SCHEME_HTTP localhost {{port}} index.html http%3A//localhost%3A{{port}}/secondpage.html SCHEME_HTTP localhost {{port}} secondpage.html
ROBOTS http%3A//localhost%3A{{port}}/dummy.txt SCHEME_HTTP localhost {{port}} dummy.txt http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
SPANNEDHOST http%3A//no.such.domain/ SCHEME_HTTP no.such.domain 80 http%3A//localhost%3A{{port}}/thirdpage.html SCHEME_HTTP localhost {{port}} thirdpage.html
EOF
# code, msg, headers, content
my %urls = (
'/index.html' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/html",
},
content => $mainpage,
},
'/secondpage.html' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/html",
},
content => $secondpage,
},
'/thirdpage.html' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/html",
},
content => $thirdpage,
},
'/dummy.txt' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/plain",
},
content => "",
},
'/robots.txt' => {
code => "200",
msg => "Dontcare",
headers => {
"Content-type" => "text/plain",
},
content => $robots
},
);
my $cmdline = $WgetTest::WGETPATH . " -nd -r --rejected-log log.csv http://localhost:{{port}}/index.html";
my $expected_error_code = 0;
my %expected_downloaded_files = (
"index.html" => {
content => $mainpage,
},
"secondpage.html" => {
content => $secondpage,
},
"thirdpage.html" => {
content => $thirdpage,
},
"robots.txt" => {
content => $robots,
},
"log.csv" => {
content => $log,
},
);
###############################################################################
my $the_test = HTTPTest->new (input => \%urls,
cmdline => $cmdline,
errcode => $expected_error_code,
output => \%expected_downloaded_files);
exit $the_test->run();
# vim: et ts=4 sw=4
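
To exercise the new test on its own, something like the following should work
from a built tree, since the .px tests are self-contained Perl scripts driven
by HTTPTest (running the whole suite via make check in tests/ is the safer
route):

    cd tests && ./Test--rejected-log.px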