1
0
mirror of https://github.com/moparisthebest/xeps synced 2024-11-28 12:12:22 -05:00

Merge pull request #253 from linkmauve/improve-checkdeadlinks

Improve checkdeadlinks.py
This commit is contained in:
Sam Whited 2016-10-12 10:07:38 -07:00 committed by GitHub
commit 914273677c

View File

@ -3,7 +3,7 @@
# File: checkdeadlinks.py # File: checkdeadlinks.py
# Version: 0.1 # Version: 0.1
# Description: a script for checking XEPs for dead links # Description: a script for checking XEPs for dead links
# Last Modified: 2009-04-06 # Last Modified: 2016-10-03
# Author: Tobias Markmann (tm@ayena.de) # Author: Tobias Markmann (tm@ayena.de)
# License: public domain # License: public domain
# HowTo: ./checkdeadlinks.py --xep=xepnum # HowTo: ./checkdeadlinks.py --xep=xepnum
@ -32,78 +32,74 @@
# #
## END LICENSE ## ## END LICENSE ##
import glob '''
import os A script for checking XEPs for dead links.
from select import select '''
import socket
import getopt from __future__ import print_function
from string import split,strip,join,find
from argparse import ArgumentParser
import sys import sys
import time
import re import re
import urllib
import urllib2
from xml.dom.minidom import parse,parseString,Document from xml.dom.minidom import parse
def usage():
print "checkdeadlinks.py"
print ""
print "-h, --help Print this help message"
print "-x, --xep [number] Defines the number of the XEP to check"
print "-v, --verbose Enables more verbosity"
def main(argv):
try: try:
opts, args = getopt.gnu_getopt(argv, "hv:x", ["help", "verbose", "xep="]) from urllib.request import Request, urlopen
except getopt.GetoptError: except ImportError:
usage() # We are on python2
sys.exit(2) from urllib2 import Request, urlopen
def is_dead(url):
if re.match("^(http|https)", url):
if verbose:
print(url + ' :', end=' ')
try:
request = Request(url)
request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
urlopen(request).read()
except Exception as e:
reason = str(e)
if verbose:
print("XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]")
return True
else:
if verbose:
print('OK')
return False
else:
return False
def get_deadlinks(xep, is_verbose=False):
global xepnum
xepnum = '%04d' % xep
global verbose global verbose
verbose = 0 verbose = is_verbose
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-x", "--xep"):
global xepnum
xepnum = arg
elif opt in ("-v", "--verbose"):
verbose = 1
xepfile = 'xep-' + xepnum + '.xml' xepfile = 'xep-' + xepnum + '.xml'
thexep = parse(xepfile) thexep = parse(xepfile)
links = thexep.getElementsByTagName("link") urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")]
deadlinks = 0 urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")]
if verbose:
print 'Checking XEP-' + xepnum + ':'
for link in links:
url = link.getAttribute("url")
if re.match("^(http|https)", url):
if verbose: if verbose:
print url + ' :', print('Checking XEP-%s (%d links):' % (xepnum, len(urls)))
page = 0
try:
request = urllib2.Request(url)
request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
opener = urllib2.build_opener()
page = opener.open(request).read()
except Exception, e:
reason = str(e)
if verbose:
print "DEAD"
else:
print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]"
deadlinks = deadlinks + 1
else:
if verbose:
print 'OK'
#if deadlinks = 0: return [url for url in set(urls) if is_dead(url)]
#print "all http/https links are good"
def main():
parser = ArgumentParser(description=__doc__)
parser.add_argument('-v', '--verbose', action='store_true', help='Enables more verbosity')
parser.add_argument('-x', '--xep', type=int, help='Defines the number of the XEP to check')
args = parser.parse_args()
deadlinks = get_deadlinks(args.xep, args.verbose)
if deadlinks:
for url in deadlinks:
print(url)
sys.exit(1)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) main()