curl/perl/contrib/checklinks.pl.in

#!@PERL@
#
# checklinks.pl
#
# This script extracts all links from a HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.7 Sept 30, 1998
#
# HISTORY
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with error codes 3XX better and follows the Location:
#       properly.
#     - Added the -x flag that only checks http:// -links
#
# 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path
#       but a document. Now it does.
#
#

$in="";

 argv:
if($ARGV[0] eq "-v" ) {
    $verbose = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-i" ) {
    $usestdin = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-h" ) {
    $help = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-x" ) {
    $external = 1;
    shift @ARGV;
    goto argv;
}

$geturl = $ARGV[0];

if(($geturl eq "") || $help) {
    print  "Usage: $0 [-hilvx] <full URL>\n",
    " Use a traling slash for directory URLs!\n",
    " -h  This help text\n",
    " -i  Read the initial page from stdin\n",
    " -l  Line number report for BAD links\n",
    " -v  Verbose mode\n",
    " -x  Check non-local (external?) links only\n";
    exit;
}

if($ARGV[1] eq "-") {
    print "We use stdin!\n";
    $usestdin = 1;
}

# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";

# Parse the input URL and split it into the relevant parts:

sub SplitURL {
    my $inurl = $_[0];

    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
	$getprotocol = $1;
	$getserver = $2;
	$getpath = $3;
	$getdocument = $4;
    }
    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
	$getprotocol = $1;
	$getserver = $2;
	$getpath = $3;
	$getdocument = "";
	
	if($getpath !~ /\//) {
	    $getpath ="";
	    $getdocument = $3;
	}
    
    }
    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
	$getprotocol = $1;
	$getserver = $2;
	$getpath = "";
	$getdocument = "";
    }
    else {
	print "Couldn't parse the specified URL, retry please!\n";
	exit;
    }
}

&SplitURL($geturl);

#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;

if(!$usestdin) {
    open(HEADGET, "$linkcheck $geturl|") ||
	die "Couldn't get web page for some reason";
  headget:
    while(<HEADGET>) {
#	print $_;
	if($_ =~ /HTTP\/.*3\d\d /) {
	    $pagemoved=1;
	}
	elsif($pagemoved &&
	       ($_ =~ /^Location: (.*)/)) {
	    $geturl = $1;

	    &SplitURL($geturl);

	    $pagemoved++;
	    last headget;
	}
    }
    close(HEADGET);

    if($pagemoved == 1) {
	print "Page is moved but we don't know where. Did you forget the ",
	"traling slash?\n";
	exit;
    }

    open(WEBGET, "$htmlget $geturl|") ||
	die "Couldn't get web page for some reason";

    while(<WEBGET>) {
	$line = $_;
	push @indoc, $line;
	$line=~ s/\n//g;
	$line=~ s/\r//g;
#    print $line."\n";
	$in=$in.$line;
    }

    close(WEBGET);
}
else {
    while(<STDIN>) {
	$line = $_;
	push @indoc, $line;
	$line=~ s/\n//g;
	$line=~ s/\r//g;
	$in=$in.$line;
    }
}

#print length($in)."\n";

sub LinkWorks {
    my $check = $_[0];

#   URL encode:
#    $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

    @doc = `$linkcheck \"$check\"`;

    $head = 1;

#    print "COMMAND: $linkcheck \"$check\"\n";
#    print $doc[0]."\n";

  boo:
    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
	$error = $1;

	if($error < 400 ) {
	    return "GOOD";
	}
	else {
	    
	    if($head && ($error >= 500)) {
		# This server doesn't like HEAD!
		@doc = `$linkcheckfull \"$check\"`;
		$head = 0;
		goto boo;
	    }
	    return "BAD";
	}
    }
    return "BAD";
}


sub GetLinks {
    my $in = $_[0];
    my @result;

  getlinkloop:
    while($in =~ /[^<]*(<[^>]+>)/g ) {
	# we have a tag in $1
	$tag = $1;
	
	if($tag =~ /^<!--/) {
	    # this is a comment tag, ignore it 
	}
	else {
	    if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
   	        $url=$2;
		if($url =~ /^\"(.*)\"$/) {
		    # this was a "string" now $1 has removed the quotes:
		    $url=$1;
		}


		$url =~ s/([^\#]*)\#.*/$1/g;

		if($url eq "") {
		    # if the link was nothing than a #-link it may now have
		    # been emptied completely so then we skip the rest
		    next getlinkloop;		    
		}

		if($done{$url}) {
		    # if this url already is done, do next
		    $done{$url}++;
		    next getlinkloop;
		}

		$done{$url} = 1; # this is "done"

	        push @result, $url;
		if($tag =~ /< *([^ ]+)/) {
#		    print "TAG: $1\n";
		    $tagtype{$url}=$1;
		}
	    }
        }
    }
    return @result;
}

@links = &GetLinks($in);

 linkloop:
for(@links) {
    $url = $_;

    if($url =~ /^([^:]+):/) {
	$prot = $1;
#	if($prot !~ /(http|ftp)/i) {
	if($prot !~ /http/i) {
	    # this is an unsupported protocol, we ignore this
	    next linkloop;
	}
	$link = $url;
    }
    else {
	if($external) {
	    next linkloop;
	}

	# this is a link on the save server:
	if($url =~ /^\//) {
	    # from root
	    $link = "$getprotocol://$getserver$url";
	}
	else {
	    # from the scanned page's dir
	    $nyurl=$url;

	    if(length($getpath) &&
	       ($getpath !~ /\/$/) &&
	       ($nyurl !~ /^\//)) {
		# lacks ending slash, add one to the document part:
		$nyurl = "/".$nyurl;
	    }
	    $link = "$getprotocol://$getserver/$getpath$nyurl";
	}
    }

#print "test $link\n";
#$success = "GOOD";

    $success = &LinkWorks($link);

    $count = $done{$url};

    $allcount += $count;

    print "$success $count <".$tagtype{$url}."> $link $url\n";

# If bad and -l, present the line numbers of the usage
    if("BAD" eq $success) {
	$badlinks++;
	if($linenumber) {
	    $line =1;
	    for(@indoc) {
		if($_ =~ /$url/) {
		    print " line $line\n";
		}
		$line++;
	    }
	}
    }

}

if($verbose) {
    print "$allcount links were checked";
    if($badlinks > 0) {
	print ", $badlinks were found bad";
    }
    print "\n";
}
moved here from ../ 2001-03-16 08:04:57 -05:00			`#!@PERL@`
			`#`
			`# checklinks.pl`
			`#`
			`# This script extracts all links from a HTML page and checks their validity.`
			`# Written to use 'curl' for URL checking.`
			`#`
			`# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>`
			`# Version: 0.7 Sept 30, 1998`
			`#`
			`# HISTORY`
			`#`
			`# 0.5 - Cuts off the #-part from links before checking.`
			`#`
			`# 0.6 - Now deals with error codes 3XX better and follows the Location:`
			`# properly.`
			`# - Added the -x flag that only checks http:// -links`
			`#`
			`# 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path`
			`# but a document. Now it does.`
			`#`
			`#`

			`$in="";`

			`argv:`
			`if($ARGV[0] eq "-v" ) {`
			`$verbose = 1;`
			`shift @ARGV;`
			`goto argv;`
			`}`
			`elsif($ARGV[0] eq "-i" ) {`
			`$usestdin = 1;`
			`shift @ARGV;`
			`goto argv;`
			`}`
			`elsif($ARGV[0] eq "-l" ) {`
			`$linenumber = 1;`
			`shift @ARGV;`
			`goto argv;`
			`}`
			`elsif($ARGV[0] eq "-h" ) {`
			`$help = 1;`
			`shift @ARGV;`
			`goto argv;`
			`}`
			`elsif($ARGV[0] eq "-x" ) {`
			`$external = 1;`
			`shift @ARGV;`
			`goto argv;`
			`}`

			`$geturl = $ARGV[0];`

			`if(($geturl eq "") \|\| $help) {`
			`print "Usage: $0 [-hilvx] <full URL>\n",`
			`" Use a traling slash for directory URLs!\n",`
			`" -h This help text\n",`
			`" -i Read the initial page from stdin\n",`
			`" -l Line number report for BAD links\n",`
			`" -v Verbose mode\n",`
			`" -x Check non-local (external?) links only\n";`
			`exit;`
			`}`

			`if($ARGV[1] eq "-") {`
			`print "We use stdin!\n";`
			`$usestdin = 1;`
			`}`

			`# This is necessary from where I tried this:`
			`#$proxy =" -x 194.237.142.41:80";`

			`# linkchecker, URL will be appended to the right of this command line`
			`# this is the one using HEAD:`
			`$linkcheck = "curl -s -m 20 -I$proxy";`

			`# as a second attempt, this will be used. This is not using HEAD but will`
			`# get the whole frigging document!`
			`$linkcheckfull = "curl -s -m 20 -i$proxy";`

			`# htmlget, URL will be appended to the right of this command line`
			`$htmlget = "curl -s$proxy";`

			`# Parse the input URL and split it into the relevant parts:`

			`sub SplitURL {`
			`my $inurl = $_[0];`

			`if($inurl=~ /^([^:]+):\/\/([^\/])\/(.)\/(.*)/ ) {`
			`$getprotocol = $1;`
			`$getserver = $2;`
			`$getpath = $3;`
			`$getdocument = $4;`
			`}`
			`elsif ($inurl=~ /^([^:]+):\/\/([^\/])\/(.)/ ) {`
			`$getprotocol = $1;`
			`$getserver = $2;`
			`$getpath = $3;`
			`$getdocument = "";`

			`if($getpath !~ /\//) {`
			`$getpath ="";`
			`$getdocument = $3;`
			`}`

			`}`
			`elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {`
			`$getprotocol = $1;`
			`$getserver = $2;`
			`$getpath = "";`
			`$getdocument = "";`
			`}`
			`else {`
			`print "Couldn't parse the specified URL, retry please!\n";`
			`exit;`
			`}`
			`}`

			`&SplitURL($geturl);`

			`#print "protocol = $getprotocol\n";`
			`#print "server = $getserver\n";`
			`#print "path = $getpath\n";`
			`#print "document = $getdocument\n";`
			`#exit;`

			`if(!$usestdin) {`
			`open(HEADGET, "$linkcheck $geturl\|") \|\|`
			`die "Couldn't get web page for some reason";`
			`headget:`
			`while(<HEADGET>) {`
			`# print $_;`
			`if($_ =~ /HTTP\/.*3\d\d /) {`
			`$pagemoved=1;`
			`}`
			`elsif($pagemoved &&`
			`($_ =~ /^Location: (.*)/)) {`
			`$geturl = $1;`

			`&SplitURL($geturl);`

			`$pagemoved++;`
			`last headget;`
			`}`
			`}`
			`close(HEADGET);`

			`if($pagemoved == 1) {`
			`print "Page is moved but we don't know where. Did you forget the ",`
			`"traling slash?\n";`
			`exit;`
			`}`

			`open(WEBGET, "$htmlget $geturl\|") \|\|`
			`die "Couldn't get web page for some reason";`

			`while(<WEBGET>) {`
			`$line = $_;`
			`push @indoc, $line;`
			`$line=~ s/\n//g;`
			`$line=~ s/\r//g;`
			`# print $line."\n";`
			`$in=$in.$line;`
			`}`

			`close(WEBGET);`
			`}`
			`else {`
			`while(<STDIN>) {`
			`$line = $_;`
			`push @indoc, $line;`
			`$line=~ s/\n//g;`
			`$line=~ s/\r//g;`
			`$in=$in.$line;`
			`}`
			`}`

			`#print length($in)."\n";`

			`sub LinkWorks {`
			`my $check = $_[0];`

			`# URL encode:`
			`# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;`

			@doc = `$linkcheck \"$check\"`;

			`$head = 1;`

			`# print "COMMAND: $linkcheck \"$check\"\n";`
			`# print $doc[0]."\n";`

			`boo:`
			`if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {`
			`$error = $1;`

			`if($error < 400 ) {`
			`return "GOOD";`
			`}`
			`else {`

			`if($head && ($error >= 500)) {`
			`# This server doesn't like HEAD!`
			@doc = `$linkcheckfull \"$check\"`;
			`$head = 0;`
			`goto boo;`
			`}`
			`return "BAD";`
			`}`
			`}`
			`return "BAD";`
			`}`


			`sub GetLinks {`
			`my $in = $_[0];`
			`my @result;`

			`getlinkloop:`
			`while($in =~ /[^<]*(<[^>]+>)/g ) {`
			`# we have a tag in $1`
			`$tag = $1;`

			`if($tag =~ /^<!--/) {`
			`# this is a comment tag, ignore it`
			`}`
			`else {`
			`if($tag =~ /(src\|href\|background\|archive) = (\"[^\"]\"\|[^ )>]*)/i) {`
			`$url=$2;`
			`if($url =~ /^\"(.*)\"$/) {`
			`# this was a "string" now $1 has removed the quotes:`
			`$url=$1;`
			`}`


			`$url =~ s/([^\#])\#./$1/g;`

			`if($url eq "") {`
			`# if the link was nothing than a #-link it may now have`
			`# been emptied completely so then we skip the rest`
			`next getlinkloop;`
			`}`

			`if($done{$url}) {`
			`# if this url already is done, do next`
			`$done{$url}++;`
			`next getlinkloop;`
			`}`

			`$done{$url} = 1; # this is "done"`

			`push @result, $url;`
			`if($tag =~ /< *([^ ]+)/) {`
			`# print "TAG: $1\n";`
			`$tagtype{$url}=$1;`
			`}`
			`}`
			`}`
			`}`
			`return @result;`
			`}`

			`@links = &GetLinks($in);`

			`linkloop:`
			`for(@links) {`
			`$url = $_;`

			`if($url =~ /^([^:]+):/) {`
			`$prot = $1;`
David Shaw finally removed all traces of Gopher and we are now officially not supporting it. It hasn't been functioning for years anyway, so this is just finally stating what already was true. And a cleanup at the same time. 2006-01-16 17:14:37 -05:00			`# if($prot !~ /(http\|ftp)/i) {`
moved here from ../ 2001-03-16 08:04:57 -05:00			`if($prot !~ /http/i) {`
			`# this is an unsupported protocol, we ignore this`
			`next linkloop;`
			`}`
			`$link = $url;`
			`}`
			`else {`
			`if($external) {`
			`next linkloop;`
			`}`

			`# this is a link on the save server:`
			`if($url =~ /^\//) {`
			`# from root`
			`$link = "$getprotocol://$getserver$url";`
			`}`
			`else {`
			`# from the scanned page's dir`
			`$nyurl=$url;`

			`if(length($getpath) &&`
			`($getpath !~ /\/$/) &&`
			`($nyurl !~ /^\//)) {`
			`# lacks ending slash, add one to the document part:`
			`$nyurl = "/".$nyurl;`
			`}`
			`$link = "$getprotocol://$getserver/$getpath$nyurl";`
			`}`
			`}`

			`#print "test $link\n";`
			`#$success = "GOOD";`

			`$success = &LinkWorks($link);`

			`$count = $done{$url};`

			`$allcount += $count;`

			`print "$success $count <".$tagtype{$url}."> $link $url\n";`

			`# If bad and -l, present the line numbers of the usage`
			`if("BAD" eq $success) {`
			`$badlinks++;`
			`if($linenumber) {`
			`$line =1;`
			`for(@indoc) {`
			`if($_ =~ /$url/) {`
			`print " line $line\n";`
			`}`
			`$line++;`
			`}`
			`}`
			`}`

			`}`

			`if($verbose) {`
			`print "$allcount links were checked";`
			`if($badlinks > 0) {`
			`print ", $badlinks were found bad";`
			`}`
			`print "\n";`
			`}`