From 5febddec7087446e0083d5e4455a9c06c391d760 Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Sun, 5 Jul 2009 23:23:17 -0700 Subject: [PATCH] Fix meta name=robots. --- src/ChangeLog | 3 + src/html-url.c | 24 +++++--- tests/ChangeLog | 6 ++ tests/Test-meta-robots.px | 115 ++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 5 files changed, 142 insertions(+), 7 deletions(-) create mode 100755 tests/Test-meta-robots.px diff --git a/src/ChangeLog b/src/ChangeLog index 58ee55ba..8771c014 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,5 +1,8 @@ 2009-07-05 Micah Cowan + * html-url.c (tag_handle_meta): Handle meta name="robots" + properly: deal with whitespace, commas after... + * netrc.c (parse_netrc): Rename local-scope variable "quote" to "qmark", to avoid conflict with the function name. diff --git a/src/html-url.c b/src/html-url.c index 68a8bc3c..218659d4 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -590,15 +590,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) { while (*content) { - /* Find the next occurrence of ',' or the end of - the string. */ - char *end = strchr (content, ','); - if (end) - ++end; - else - end = content + strlen (content); + char *end; + /* Skip any initial whitespace. */ + content += strspn (content, " \f\n\r\t\v"); + /* Find the next occurrence of ',' or whitespace, + * or the end of the string. */ + end = content + strcspn (content, ", \f\n\r\t\v"); if (!strncasecmp (content, "nofollow", end - content)) ctx->nofollow = true; + /* Skip past the next comma, if any. */ + if (*end == ',') + ++end; + else + { + end = strchr (end, ','); + if (end) + ++end; + else + end = content + strlen (content); + } content = end; } } diff --git a/tests/ChangeLog b/tests/ChangeLog index 3e233e05..9ee2bb08 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2009-07-05 Micah Cowan + + * Test-meta-robots.px: Added. + + * run-px: Add Test-meta-robots.px to the list. 
+
 2009-07-03  Micah Cowan
 
 	* Test-ftp-iri-disabled.px, Test-iri-disabled.px:
diff --git a/tests/Test-meta-robots.px b/tests/Test-meta-robots.px
new file mode 100755
index 00000000..2560369a
--- /dev/null
+++ b/tests/Test-meta-robots.px
@@ -0,0 +1,115 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# This test checks that Wget parses "nofollow" when it appears in <meta
+# name="robots"> tags, regardless of where in a list of comma-separated
+# values it appears, and regardless of letter case.
+#
+# Four different files contain links to the file "bombshell.html", each
+# with "nofollow" set, at various positions in a list of values for a
+# <meta name="robots"> tag, and with various degrees of separating
+# whitespace. If bombshell.html is downloaded, the test
+# has failed.
+
+###############################################################################
+
+my $nofollow_start = <<EOF;
+<meta name="roBoTS" content="noFolLow ,  foo, bar ">
+<a href="/bombshell.html">Don't follow me!</a>
+EOF
+
+my $nofollow_mid = <<EOF;
+<meta name="rObOts" content=" foo  ,  NOfOllow ,  bar ">
+<a href="/bombshell.html">Don't follow me!</a>
+EOF
+
+my $nofollow_end = <<EOF;
+<meta name="RoBotS" content="foo,BAr,   nofOLLOw    ">
+<a href="/bombshell.html">Don't follow me!</a>
+EOF
+
+my $nofollow_solo = <<EOF;
+<meta name="robots" content="nofollow">
+<a href="/bombshell.html">Don't follow me!</a>
+EOF
+
+# Pages served by the test HTTP server: code, msg, headers, content
+my %urls = (
+    '/start.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $nofollow_start,
+    },
+    '/mid.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $nofollow_mid,
+    },
+    '/end.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $nofollow_end,
+    },
+    '/solo.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $nofollow_solo,
+    },
+    '/bombshell.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => 'Hello',
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " -r -nd "
+    . join(' ',(map "http://localhost:{{port}}/$_.html",
+                qw(start mid end solo)));
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'start.html' => {
+        content => $nofollow_start,
+    },
+    'mid.html' => {
+        content => $nofollow_mid,
+    },
+    'end.html' => {
+        content => $nofollow_end,
+    },
+    'solo.html' => {
+        content => $nofollow_solo,
+    }
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-meta-robots",
+                              input => \%urls,
+                              cmdline => $cmdline,
+                              errcode => $expected_error_code,
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 5faccb48..b480f141 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -41,6 +41,7 @@ my @tests = (
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',
     'Test-iri-list.px',
+    'Test-meta-robots.px',
     'Test-N-current.px',
     'Test-N-smaller.px',
     'Test-N-no-info.px',