From 0d12c567386d54474ababcde3678a0cc5e1fb6b8 Mon Sep 17 00:00:00 2001 From: Daniel Stenberg Date: Wed, 3 Jan 2001 08:35:16 +0000 Subject: [PATCH] Added -i to allow ignore-patterns to get added --- perl/crawlink.pl | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/perl/crawlink.pl b/perl/crawlink.pl index d7855c383..53be500cc 100755 --- a/perl/crawlink.pl +++ b/perl/crawlink.pl @@ -9,10 +9,14 @@ # Written to use 'curl' for URL checking. # # Author: Daniel Stenberg -# Version: 0.2 Dec 19, 2000 +# Version: 0.3 Jan 3, 2001 # # HISTORY # +# 0.3 - The -i now adds regexes that if a full URL link matches one of those, +# it is not followed. This can then be used to prevent this script from +# following '.*\.cgi', specific pages or whatever. +# # 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot # faster to skip large non HTML files such as pdfs or big RFCs! ;-) # Added a -c option that allows me to pass options to curl. @@ -32,6 +36,8 @@ my $help; my $external; my $curlopts; +my @ignorelist; + argv: if($ARGV[0] eq "-v" ) { $verbose++; @@ -44,6 +50,12 @@ elsif($ARGV[0] eq "-c" ) { shift @ARGV; goto argv; } +elsif($ARGV[0] eq "-i" ) { + push @ignorelist, $ARGV[1]; + shift @ARGV; + shift @ARGV; + goto argv; +} elsif($ARGV[0] eq "-l" ) { $linenumber = 1; shift @ARGV; @@ -72,10 +84,12 @@ $rooturls{$ARGV[0]}=1; if(($geturl eq "") || $help) { print "Usage: $0 [-hilvx] \n", " Use a traling slash for directory URLs!\n", - " -h This help text\n", - " -l Line number report for BAD links\n", - " -v Verbose mode\n", - " -x Check non-local (external?) links only\n"; + " -c [data] Pass [data] as argument to every curl invoke\n", + " -h This help text\n", + " -i [regex] Ignore root links that match this pattern\n", + " -l Line number report for BAD links\n", + " -v Verbose mode\n", + " -x Check non-local (external?) 
links only\n"; exit; } @@ -303,9 +317,6 @@ while(1) { if($geturl == -1) { last; } - if($verbose) { - print "ROOT: $geturl\n"; - } # # Splits the URL in its different parts @@ -332,6 +343,8 @@ while(1) { next; } + print " ==== $geturl ====\n"; + if($verbose == 2) { printf("Error code $error, Content-Type: $ctype, got %d bytes\n", length($in)); @@ -405,8 +418,17 @@ while(1) { } } else { - # the link works, add it! - $rooturls{$link}++; # check this if not checked already + # the link works, add it if it isn't in the ignore list + my $ignore=0; + for(@ignorelist) { + if($link =~ /$_/) { + $ignore=1; + } + } + if(!$ignore) { + # not ignored, add + $rooturls{$link}++; # check this if not checked already + } } }