Merge pull request #253 from linkmauve/improve-checkdeadlinks

Improve checkdeadlinks.py
This commit is contained in:
Sam Whited 2016-10-12 10:07:38 -07:00 committed by GitHub
commit 914273677c
1 changed file with 64 additions and 68 deletions

View File

@ -3,7 +3,7 @@
# File: checkdeadlinks.py # File: checkdeadlinks.py
# Version: 0.1 # Version: 0.1
# Description: a script for checking XEPs for dead links # Description: a script for checking XEPs for dead links
# Last Modified: 2009-04-06 # Last Modified: 2016-10-03
# Author: Tobias Markmann (tm@ayena.de) # Author: Tobias Markmann (tm@ayena.de)
# License: public domain # License: public domain
# HowTo: ./checkdeadlinks.py --xep=xepnum # HowTo: ./checkdeadlinks.py --xep=xepnum
@ -32,78 +32,74 @@
# #
## END LICENSE ## ## END LICENSE ##
import glob '''
import os A script for checking XEPs for dead links.
from select import select '''
import socket
import getopt from __future__ import print_function
from string import split,strip,join,find
from argparse import ArgumentParser
import sys import sys
import time
import re import re
import urllib
import urllib2
from xml.dom.minidom import parse,parseString,Document from xml.dom.minidom import parse
def usage(): try:
print "checkdeadlinks.py" from urllib.request import Request, urlopen
print "" except ImportError:
print "-h, --help Print this help message" # We are on python2
print "-x, --xep [number] Defines the number of the XEP to check" from urllib2 import Request, urlopen
print "-v, --verbose Enables more verbosity"
def main(argv): def is_dead(url):
if re.match("^(http|https)", url):
if verbose:
print(url + ' :', end=' ')
try: try:
opts, args = getopt.gnu_getopt(argv, "hv:x", ["help", "verbose", "xep="]) request = Request(url)
except getopt.GetoptError: request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
usage() urlopen(request).read()
sys.exit(2) except Exception as e:
reason = str(e)
if verbose:
print("XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]")
return True
else:
if verbose:
print('OK')
return False
else:
return False
def get_deadlinks(xep, is_verbose=False):
global xepnum
xepnum = '%04d' % xep
global verbose global verbose
verbose = 0 verbose = is_verbose
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-x", "--xep"):
global xepnum
xepnum = arg
elif opt in ("-v", "--verbose"):
verbose = 1
xepfile = 'xep-' + xepnum + '.xml' xepfile = 'xep-' + xepnum + '.xml'
thexep = parse(xepfile) thexep = parse(xepfile)
links = thexep.getElementsByTagName("link") urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")]
deadlinks = 0 urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")]
if verbose:
print 'Checking XEP-' + xepnum + ':'
for link in links:
url = link.getAttribute("url")
if re.match("^(http|https)", url):
if verbose: if verbose:
print url + ' :', print('Checking XEP-%s (%d links):' % (xepnum, len(urls)))
page = 0
try:
request = urllib2.Request(url)
request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
opener = urllib2.build_opener()
page = opener.open(request).read()
except Exception, e:
reason = str(e)
if verbose:
print "DEAD"
else:
print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]"
deadlinks = deadlinks + 1
else:
if verbose:
print 'OK'
#if deadlinks = 0: return [url for url in set(urls) if is_dead(url)]
#print "all http/https links are good"
def main():
parser = ArgumentParser(description=__doc__)
parser.add_argument('-v', '--verbose', action='store_true', help='Enables more verbosity')
parser.add_argument('-x', '--xep', type=int, help='Defines the number of the XEP to check')
args = parser.parse_args()
deadlinks = get_deadlinks(args.xep, args.verbose)
if deadlinks:
for url in deadlinks:
print(url)
sys.exit(1)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) main()