1
0
mirror of https://github.com/moparisthebest/xeps synced 2024-11-28 12:12:22 -05:00

Merge pull request #253 from linkmauve/improve-checkdeadlinks

Improve checkdeadlinks.py
This commit is contained in:
Sam Whited 2016-10-12 10:07:38 -07:00 committed by GitHub
commit 914273677c

View File

@ -3,7 +3,7 @@
# File: checkdeadlinks.py # File: checkdeadlinks.py
# Version: 0.1 # Version: 0.1
# Description: a script for checking XEPs for dead links # Description: a script for checking XEPs for dead links
# Last Modified: 2009-04-06 # Last Modified: 2016-10-03
# Author: Tobias Markmann (tm@ayena.de) # Author: Tobias Markmann (tm@ayena.de)
# License: public domain # License: public domain
# HowTo: ./checkdeadlinks.py --xep=xepnum # HowTo: ./checkdeadlinks.py --xep=xepnum
@ -32,78 +32,74 @@
# #
## END LICENSE ## ## END LICENSE ##
import glob '''
import os A script for checking XEPs for dead links.
from select import select '''
import socket
import getopt from __future__ import print_function
from string import split,strip,join,find
from argparse import ArgumentParser
import sys import sys
import time
import re import re
import urllib
import urllib2
from xml.dom.minidom import parse,parseString,Document from xml.dom.minidom import parse
def usage(): try:
print "checkdeadlinks.py" from urllib.request import Request, urlopen
print "" except ImportError:
print "-h, --help Print this help message" # We are on python2
print "-x, --xep [number] Defines the number of the XEP to check" from urllib2 import Request, urlopen
print "-v, --verbose Enables more verbosity"
def main(argv): def is_dead(url):
try: if re.match("^(http|https)", url):
opts, args = getopt.gnu_getopt(argv, "hv:x", ["help", "verbose", "xep="]) if verbose:
except getopt.GetoptError: print(url + ' :', end=' ')
usage() try:
sys.exit(2) request = Request(url)
request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
urlopen(request).read()
except Exception as e:
reason = str(e)
if verbose:
print("XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]")
return True
else:
if verbose:
print('OK')
return False
else:
return False
global verbose def get_deadlinks(xep, is_verbose=False):
verbose = 0 global xepnum
for opt, arg in opts: xepnum = '%04d' % xep
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-x", "--xep"):
global xepnum
xepnum = arg
elif opt in ("-v", "--verbose"):
verbose = 1
xepfile = 'xep-' + xepnum + '.xml' global verbose
thexep = parse(xepfile) verbose = is_verbose
links = thexep.getElementsByTagName("link") xepfile = 'xep-' + xepnum + '.xml'
deadlinks = 0 thexep = parse(xepfile)
if verbose:
print 'Checking XEP-' + xepnum + ':'
for link in links: urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")]
url = link.getAttribute("url") urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")]
if re.match("^(http|https)", url):
if verbose:
print url + ' :',
page = 0
try:
request = urllib2.Request(url)
request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
opener = urllib2.build_opener()
page = opener.open(request).read()
except Exception, e:
reason = str(e)
if verbose:
print "DEAD"
else:
print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]"
deadlinks = deadlinks + 1
else:
if verbose:
print 'OK'
#if deadlinks = 0: if verbose:
#print "all http/https links are good" print('Checking XEP-%s (%d links):' % (xepnum, len(urls)))
return [url for url in set(urls) if is_dead(url)]
def main():
parser = ArgumentParser(description=__doc__)
parser.add_argument('-v', '--verbose', action='store_true', help='Enables more verbosity')
parser.add_argument('-x', '--xep', type=int, help='Defines the number of the XEP to check')
args = parser.parse_args()
deadlinks = get_deadlinks(args.xep, args.verbose)
if deadlinks:
for url in deadlinks:
print(url)
sys.exit(1)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) main()