mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Fix meta name=robots.
This commit is contained in:
parent
bc7ea11d8e
commit
5febddec70
@ -1,5 +1,8 @@
|
||||
2009-07-05 Micah Cowan <micah@cowan.name>
|
||||
|
||||
* html-url.c (tag_handle_meta): Handle meta name="robots"
|
||||
properly: deal with whitespace, commas after...
|
||||
|
||||
* netrc.c (parse_netrc): Rename local-scope variable "quote" to
|
||||
"qmark", to avoid conflict with the function name.
|
||||
|
||||
|
@ -590,15 +590,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
{
|
||||
while (*content)
|
||||
{
|
||||
/* Find the next occurrence of ',' or the end of
|
||||
the string. */
|
||||
char *end = strchr (content, ',');
|
||||
if (end)
|
||||
++end;
|
||||
else
|
||||
end = content + strlen (content);
|
||||
char *end;
|
||||
/* Skip any initial whitespace. */
|
||||
content += strspn (content, " \f\n\r\t\v");
|
||||
/* Find the next occurrence of ',' or whitespace,
|
||||
* or the end of the string. */
|
||||
end = content + strcspn (content, ", \f\n\r\t\v");
|
||||
if (!strncasecmp (content, "nofollow", end - content))
|
||||
ctx->nofollow = true;
|
||||
/* Skip past the next comma, if any. */
|
||||
if (*end == ',')
|
||||
++end;
|
||||
else
|
||||
{
|
||||
end = strchr (end, ',');
|
||||
if (end)
|
||||
++end;
|
||||
else
|
||||
end = content + strlen (content);
|
||||
}
|
||||
content = end;
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,9 @@
|
||||
2009-07-05 Micah Cowan <micah@cowan.name>
|
||||
|
||||
* Test-meta-robots.px: Added.
|
||||
|
||||
* run-px: Add Test-meta-robots.px to the list.
|
||||
|
||||
2009-07-03 Micah Cowan <micah@cowan.name>
|
||||
|
||||
* Test-ftp-iri-disabled.px, Test-iri-disabled.px:
|
||||
|
115
tests/Test-meta-robots.px
Executable file
115
tests/Test-meta-robots.px
Executable file
@ -0,0 +1,115 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use HTTPTest;
|
||||
|
||||
# This test checks that Wget parses "nofollow" when it appears in <meta
|
||||
# name="robots"> tags, regardless of where in a list of comma-separated
|
||||
# values it appears, and regardless of letter case.
|
||||
#
|
||||
# Four different files contain links to the file "bombshell.html", each
|
||||
# with "nofollow" set, at various positions in a list of values for a
|
||||
# <meta name="robots"> tag, and with various degrees of separating
|
||||
# whitespace. If bombshell.html is downloaded, the test
|
||||
# has failed.
|
||||
|
||||
###############################################################################
|
||||
|
||||
my $nofollow_start = <<EOF;
|
||||
<meta name="roBoTS" content="noFolLow , foo, bar ">
|
||||
<a href="/bombshell.html">Don't follow me!</a>
|
||||
EOF
|
||||
|
||||
my $nofollow_mid = <<EOF;
|
||||
<meta name="rObOts" content=" foo , NOfOllow , bar ">
|
||||
<a href="/bombshell.html">Don't follow me!</a>
|
||||
EOF
|
||||
|
||||
my $nofollow_end = <<EOF;
|
||||
<meta name="RoBotS" content="foo,BAr, nofOLLOw ">
|
||||
<a href="/bombshell.html">Don't follow me!</a>
|
||||
EOF
|
||||
|
||||
my $nofollow_solo = <<EOF;
|
||||
<meta name="robots" content="nofollow">
|
||||
<a href="/bombshell.html">Don't follow me!</a>
|
||||
EOF
|
||||
|
||||
# code, msg, headers, content
|
||||
my %urls = (
|
||||
'/start.html' => {
|
||||
code => "200",
|
||||
msg => "Ok",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $nofollow_start,
|
||||
},
|
||||
'/mid.html' => {
|
||||
code => "200",
|
||||
msg => "Ok",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $nofollow_mid,
|
||||
},
|
||||
'/end.html' => {
|
||||
code => "200",
|
||||
msg => "Ok",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $nofollow_end,
|
||||
},
|
||||
'/solo.html' => {
|
||||
code => "200",
|
||||
msg => "Ok",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => $nofollow_solo,
|
||||
},
|
||||
'/bombshell.html' => {
|
||||
code => "200",
|
||||
msg => "Ok",
|
||||
headers => {
|
||||
"Content-type" => "text/html",
|
||||
},
|
||||
content => 'Hello',
|
||||
},
|
||||
);
|
||||
|
||||
my $cmdline = $WgetTest::WGETPATH . " -r -nd "
|
||||
. join(' ',(map "http://localhost:{{port}}/$_.html",
|
||||
qw(start mid end solo)));
|
||||
|
||||
my $expected_error_code = 0;
|
||||
|
||||
my %expected_downloaded_files = (
|
||||
'start.html' => {
|
||||
content => $nofollow_start,
|
||||
},
|
||||
'mid.html' => {
|
||||
content => $nofollow_mid,
|
||||
},
|
||||
'end.html' => {
|
||||
content => $nofollow_end,
|
||||
},
|
||||
'solo.html' => {
|
||||
content => $nofollow_solo,
|
||||
}
|
||||
);
|
||||
|
||||
###############################################################################
|
||||
|
||||
my $the_test = HTTPTest->new (name => "Test-meta-robots",
|
||||
input => \%urls,
|
||||
cmdline => $cmdline,
|
||||
errcode => $expected_error_code,
|
||||
output => \%expected_downloaded_files);
|
||||
exit $the_test->run();
|
||||
|
||||
# vim: et ts=4 sw=4
|
||||
|
@ -41,6 +41,7 @@ my @tests = (
|
||||
'Test-iri-disabled.px',
|
||||
'Test-iri-forced-remote.px',
|
||||
'Test-iri-list.px',
|
||||
'Test-meta-robots.px',
|
||||
'Test-N-current.px',
|
||||
'Test-N-smaller.px',
|
||||
'Test-N-no-info.px',
|
||||
|
Loading…
Reference in New Issue
Block a user