From 9dbd567676c18967d490841a0691a0e9fd5e2c8b Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 22:22:03 +0100 Subject: [PATCH 01/10] checkdeadlinks: Remove trailing whitespace and use four spaces instead of tabs. --- checkdeadlinks.py | 110 +++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index 39bb386e..7d74248d 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -47,63 +47,63 @@ import urllib2 from xml.dom.minidom import parse,parseString,Document def usage(): - print "checkdeadlinks.py" - print "" - print "-h, --help Print this help message" - print "-x, --xep [number] Defines the number of the XEP to check" - print "-v, --verbose Enables more verbosity" + print "checkdeadlinks.py" + print "" + print "-h, --help Print this help message" + print "-x, --xep [number] Defines the number of the XEP to check" + print "-v, --verbose Enables more verbosity" def main(argv): - try: - opts, args = getopt.gnu_getopt(argv, "hv:x", ["help", "verbose", "xep="]) - except getopt.GetoptError: - usage() - sys.exit(2) + try: + opts, args = getopt.gnu_getopt(argv, "hv:x", ["help", "verbose", "xep="]) + except getopt.GetoptError: + usage() + sys.exit(2) - global verbose - verbose = 0 - for opt, arg in opts: - if opt in ("-h", "--help"): - usage() - sys.exit() - elif opt in ("-x", "--xep"): - global xepnum - xepnum = arg - elif opt in ("-v", "--verbose"): - verbose = 1 - - xepfile = 'xep-' + xepnum + '.xml' - thexep = parse(xepfile) - - links = thexep.getElementsByTagName("link") - deadlinks = 0 - if verbose: - print 'Checking XEP-' + xepnum + ':' - - for link in links: - url = link.getAttribute("url") - if re.match("^(http|https)", url): - if verbose: - print url + ' :', - page = 0 - try: - request = urllib2.Request(url) - request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") - opener = urllib2.build_opener() - 
page = opener.open(request).read() - except Exception, e: - reason = str(e) - if verbose: - print "DEAD" - else: - print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]" - deadlinks = deadlinks + 1 - else: - if verbose: - print 'OK' - - #if deadlinks = 0: - #print "all http/https links are good" + global verbose + verbose = 0 + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("-x", "--xep"): + global xepnum + xepnum = arg + elif opt in ("-v", "--verbose"): + verbose = 1 + + xepfile = 'xep-' + xepnum + '.xml' + thexep = parse(xepfile) + + links = thexep.getElementsByTagName("link") + deadlinks = 0 + if verbose: + print 'Checking XEP-' + xepnum + ':' + + for link in links: + url = link.getAttribute("url") + if re.match("^(http|https)", url): + if verbose: + print url + ' :', + page = 0 + try: + request = urllib2.Request(url) + request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") + opener = urllib2.build_opener() + page = opener.open(request).read() + except Exception, e: + reason = str(e) + if verbose: + print "DEAD" + else: + print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]" + deadlinks = deadlinks + 1 + else: + if verbose: + print 'OK' + + #if deadlinks = 0: + #print "all http/https links are good" if __name__ == "__main__": - main(sys.argv[1:]) + main(sys.argv[1:]) From 52e64d545b793a6a7d89b5283d113802cf20f590 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 22:34:37 +0100 Subject: [PATCH 02/10] checkdeadlinks: Replace getopt with argparse to simplify argument parsing. --- checkdeadlinks.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index 7d74248d..e6e11b12 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -32,11 +32,15 @@ # ## END LICENSE ## +''' +A script for checking XEPs for dead links. 
+''' + import glob import os from select import select import socket -import getopt +from argparse import ArgumentParser from string import split,strip,join,find import sys import time @@ -46,31 +50,14 @@ import urllib2 from xml.dom.minidom import parse,parseString,Document -def usage(): - print "checkdeadlinks.py" - print "" - print "-h, --help Print this help message" - print "-x, --xep [number] Defines the number of the XEP to check" - print "-v, --verbose Enables more verbosity" +def main(): + parser = ArgumentParser(description=__doc__) + parser.add_argument('-v', '--verbose', action='store_true', help='Enables more verbosity') + parser.add_argument('-x', '--xep', type=int, help='Defines the number of the XEP to check') + args = parser.parse_args() -def main(argv): - try: - opts, args = getopt.gnu_getopt(argv, "hv:x", ["help", "verbose", "xep="]) - except getopt.GetoptError: - usage() - sys.exit(2) - - global verbose - verbose = 0 - for opt, arg in opts: - if opt in ("-h", "--help"): - usage() - sys.exit() - elif opt in ("-x", "--xep"): - global xepnum - xepnum = arg - elif opt in ("-v", "--verbose"): - verbose = 1 + xepnum = '%04d' % args.xep + verbose = args.verbose xepfile = 'xep-' + xepnum + '.xml' thexep = parse(xepfile) @@ -106,4 +93,4 @@ def main(argv): #print "all http/https links are good" if __name__ == "__main__": - main(sys.argv[1:]) + main() From 3acdb8f2d15dd26195fd5c4a896107a41193ff39 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 22:25:16 +0100 Subject: [PATCH 03/10] checkdeadlinks: Return 0 on no-deadlink, 1 on deadlinks. 
--- checkdeadlinks.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index e6e11b12..bce434eb 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -81,16 +81,14 @@ def main(): except Exception, e: reason = str(e) if verbose: - print "DEAD" - else: print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]" deadlinks = deadlinks + 1 else: if verbose: print 'OK' - #if deadlinks = 0: - #print "all http/https links are good" + if deadlinks > 0: + sys.exit(1) if __name__ == "__main__": main() From 615d0918773b7ee026e814a349886f8c8ab96363 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 22:43:47 +0100 Subject: [PATCH 04/10] checkdeadlinks: Move the checking code into its own is_dead() function. --- checkdeadlinks.py | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index bce434eb..f8dfef02 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -50,13 +50,38 @@ import urllib2 from xml.dom.minidom import parse,parseString,Document +def is_dead(url): + if re.match("^(http|https)", url): + if verbose: + print url + ' :', + page = 0 + try: + request = urllib2.Request(url) + request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") + opener = urllib2.build_opener() + page = opener.open(request).read() + except Exception, e: + reason = str(e) + if verbose: + print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]" + return True + else: + if verbose: + print 'OK' + return False + else: + return False + def main(): parser = ArgumentParser(description=__doc__) parser.add_argument('-v', '--verbose', action='store_true', help='Enables more verbosity') parser.add_argument('-x', '--xep', type=int, help='Defines the number of the XEP to check') args = parser.parse_args() + global xepnum xepnum = '%04d' % args.xep + + global verbose verbose = 
args.verbose xepfile = 'xep-' + xepnum + '.xml' @@ -69,23 +94,8 @@ def main(): for link in links: url = link.getAttribute("url") - if re.match("^(http|https)", url): - if verbose: - print url + ' :', - page = 0 - try: - request = urllib2.Request(url) - request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") - opener = urllib2.build_opener() - page = opener.open(request).read() - except Exception, e: - reason = str(e) - if verbose: - print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]" - deadlinks = deadlinks + 1 - else: - if verbose: - print 'OK' + if is_dead(url): + deadlinks += 1 if deadlinks > 0: sys.exit(1) From 358fce44437e769a97c7527f71d4adca42147eab Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 22:48:12 +0100 Subject: [PATCH 05/10] checkdeadlinks: Also check for images sources, and check each unique URL only once. --- checkdeadlinks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index f8dfef02..73ade3cf 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -87,13 +87,14 @@ def main(): xepfile = 'xep-' + xepnum + '.xml' thexep = parse(xepfile) - links = thexep.getElementsByTagName("link") deadlinks = 0 if verbose: print 'Checking XEP-' + xepnum + ':' - for link in links: - url = link.getAttribute("url") + urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")] + urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")] + + for url in set(urls): if is_dead(url): deadlinks += 1 From 230bab3bf9510040452249bbeb72ca984aa4cc2d Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 22:56:39 +0100 Subject: [PATCH 06/10] checkdeadlinks: Remove unused imports. 
--- checkdeadlinks.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index 73ade3cf..efe89fb8 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -36,19 +36,12 @@ A script for checking XEPs for dead links. ''' -import glob -import os -from select import select -import socket from argparse import ArgumentParser -from string import split,strip,join,find import sys -import time import re -import urllib import urllib2 -from xml.dom.minidom import parse,parseString,Document +from xml.dom.minidom import parse def is_dead(url): if re.match("^(http|https)", url): From 62348c310b30baeafc4190531ce643b22bc2e72b Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 23:02:11 +0100 Subject: [PATCH 07/10] checkdeadlinks: Output the list of dead links on exit. --- checkdeadlinks.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index efe89fb8..cd30168e 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -80,18 +80,17 @@ def main(): xepfile = 'xep-' + xepnum + '.xml' thexep = parse(xepfile) - deadlinks = 0 if verbose: print 'Checking XEP-' + xepnum + ':' urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")] urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")] - for url in set(urls): - if is_dead(url): - deadlinks += 1 + deadlinks = [url for url in set(urls) if is_dead(url)] - if deadlinks > 0: + if deadlinks: + for url in deadlinks: + print url sys.exit(1) if __name__ == "__main__": From 137e41fbba25d03364531e8f5ba5cd2adbf95610 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 23:11:15 +0100 Subject: [PATCH 08/10] checkdeadlinks: Make the script compatible with both python2 and python3. 
--- checkdeadlinks.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index cd30168e..21407eab 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -36,31 +36,36 @@ A script for checking XEPs for dead links. ''' +from __future__ import print_function + from argparse import ArgumentParser import sys import re -import urllib2 from xml.dom.minidom import parse +try: + from urllib.request import Request, urlopen +except ImportError: + # We are on python2 + from urllib2 import Request, urlopen + def is_dead(url): if re.match("^(http|https)", url): if verbose: - print url + ' :', - page = 0 + print(url + ' :', end=' ') try: - request = urllib2.Request(url) + request = Request(url) request.add_header('User-Agent', "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") - opener = urllib2.build_opener() - page = opener.open(request).read() - except Exception, e: + urlopen(request).read() + except Exception as e: reason = str(e) if verbose: - print "XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]" + print("XEP-" + xepnum + " - DEAD: " + url + " [" + reason + "]") return True else: if verbose: - print 'OK' + print('OK') return False else: return False @@ -81,7 +86,7 @@ def main(): thexep = parse(xepfile) if verbose: - print 'Checking XEP-' + xepnum + ':' + print('Checking XEP-' + xepnum + ':') urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")] urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")] @@ -90,7 +95,7 @@ def main(): if deadlinks: for url in deadlinks: - print url + print(url) sys.exit(1) if __name__ == "__main__": From e626ca87aadb55f3c87bfcef97c48de3ad283c11 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 23:17:23 +0100 Subject: [PATCH 09/10] checkdeadlinks: Update the last modified header. 
--- checkdeadlinks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index 21407eab..e58ed1d3 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -3,7 +3,7 @@ # File: checkdeadlinks.py # Version: 0.1 # Description: a script for checking XEPs for dead links -# Last Modified: 2009-04-06 +# Last Modified: 2016-10-03 # Author: Tobias Markmann (tm@ayena.de) # License: public domain # HowTo: ./checkdeadlinks.py --xep=xepnum From 8496245e85a92176af6466f31d3553bf1c0b7af4 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 3 Oct 2016 23:32:43 +0100 Subject: [PATCH 10/10] checkdeadlinks: Move the main functionality into a separate function. --- checkdeadlinks.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/checkdeadlinks.py b/checkdeadlinks.py index e58ed1d3..bae091b4 100755 --- a/checkdeadlinks.py +++ b/checkdeadlinks.py @@ -70,28 +70,31 @@ def is_dead(url): else: return False +def get_deadlinks(xep, is_verbose=False): + global xepnum + xepnum = '%04d' % xep + + global verbose + verbose = is_verbose + + xepfile = 'xep-' + xepnum + '.xml' + thexep = parse(xepfile) + + urls = [link.getAttribute("url") for link in thexep.getElementsByTagName("link")] + urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")] + + if verbose: + print('Checking XEP-%s (%d links):' % (xepnum, len(urls))) + + return [url for url in set(urls) if is_dead(url)] + def main(): parser = ArgumentParser(description=__doc__) parser.add_argument('-v', '--verbose', action='store_true', help='Enables more verbosity') parser.add_argument('-x', '--xep', type=int, help='Defines the number of the XEP to check') args = parser.parse_args() - global xepnum - xepnum = '%04d' % args.xep - - global verbose - verbose = args.verbose - - xepfile = 'xep-' + xepnum + '.xml' - thexep = parse(xepfile) - - if verbose: - print('Checking XEP-' + xepnum + ':') - - urls = 
[link.getAttribute("url") for link in thexep.getElementsByTagName("link")] - urls += [image.getAttribute("src") for image in thexep.getElementsByTagName("img")] - - deadlinks = [url for url in set(urls) if is_dead(url)] + deadlinks = get_deadlinks(args.xep, args.verbose) if deadlinks: for url in deadlinks: