SickRage/lib/html5lib/tests/tokenizertotree.py

import sys
import os
import json
import re

import html5lib
import support
import test_parser
import test_tokenizer

# Module-level parser instance shared by every make_test() call; make_test()
# parses each test input with it and serializes the result through
# p.tree.testSerializer.
p = html5lib.HTMLParser()

# Bound ``sub`` method of a multi-line regex that matches a line starting with
# "|", optional whitespace, and then a "<html ...>" tag.  make_test() calls it
# with the replacement r"\1<\2>", which keeps the leading "|" prefix and drops
# the "html " text from inside the tag.
unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub

def main(out_path):
    """Run run_file() on every tokenizer test file, writing into out_path.

    The files processed are those yielded by
    ``support.html5lib_test_files('tokenizer', '*.test')``.

    out_path -- directory that receives the generated files.  It must already
    exist; otherwise an error is written to stderr and the process exits with
    status 1.
    """
    if not os.path.exists(out_path):
        # Terminate the message with a newline (as run_file's error message
        # does) so the shell prompt is not glued onto the end of it.
        sys.stderr.write("Path %s does not exist\n" % out_path)
        sys.exit(1)

    for filename in support.html5lib_test_files('tokenizer', '*.test'):
        run_file(filename, out_path)

def run_file(filename, out_path):
    """Convert one JSON test file into ``tokenizer_<name>.dat`` in out_path.

    filename -- path of the JSON file to read; <name> is its base name without
                the extension.
    out_path -- directory in which the output file is created.

    If the file does not contain valid JSON, a message is written to stderr
    and the function returns without creating any output.  Otherwise the
    output file is always created, and one make_test() entry is written for
    every "Data state" listed in each test's "initialStates" (tests without
    that key are treated as ["Data state"]).
    """
    try:
        # Use open() in a with-block: the input handle is closed promptly, and
        # the code no longer depends on the Python 2-only file() builtin.
        with open(filename) as test_file:
            tests_data = json.load(test_file)
    except ValueError:
        sys.stderr.write("Failed to load %s\n" % filename)
        return

    name = os.path.splitext(os.path.split(filename)[1])[0]
    output_name = os.path.join(out_path, "tokenizer_%s.dat" % name)

    # The with-block guarantees the output file is closed even when
    # make_test() raises part-way through.
    with open(output_name, "w") as output_file:
        if 'tests' in tests_data:
            for test_data in tests_data['tests']:
                # Stores the default on the test itself, as the original did.
                initial_states = test_data.setdefault("initialStates",
                                                      ["Data state"])
                for initial_state in initial_states:
                    if initial_state != "Data state":
                        # Other initial states are not supported yet.
                        continue
                    output_file.write(make_test(test_data))

def make_test(test_data):
    """Build the text of one "#data" / "#errors" entry for a tokenizer test.

    test_data -- dict for a single test; its "input" value is parsed with the
    module-level parser ``p``.  When the dict has a 'doubleEscaped' key it is
    first passed through test_tokenizer.unescape_test().

    Returns the entry as a newline-joined string: "#data", the UTF-8 encoded
    input, "#errors", the UTF-8 encoded serialized tree, and a trailing empty
    line.
    """
    if 'doubleEscaped' in test_data:
        test_data = test_tokenizer.unescape_test(test_data)

    # Encode before parsing so the order of operations matches the sections
    # of the entry being built.
    encoded_input = test_data["input"].encode("utf8")

    document = p.parse(test_data["input"])
    serialized = p.tree.testSerializer(document)

    # Lines beginning with "|" plus two spaces lose one of those spaces;
    # every other line is kept unchanged.
    adjusted_lines = []
    for line in serialized.split("\n"):
        if line.startswith("|  "):
            line = "| " + line[3:]
        adjusted_lines.append(line)

    expected = unnamespaceExpected(r"\1<\2>", "\n".join(adjusted_lines))

    sections = [
        "#data",
        encoded_input,
        "#errors",
        expected.encode("utf8"),
        "",
    ]
    return "\n".join(sections)

if __name__ == "__main__":
    # The output directory is the single required argument.  Without this
    # guard a missing argument surfaces as a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <output directory>\n" % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])
Welcome to our SickBeard-TVRage Edition. This version of SickBeard uses both TVDB and TVRage to search for and gather its series data, giving you access to shows that you could not download before because you were locked into only what TheTVDB had to offer. This edition is also based on the code we used in our XEM edition, so it comes with scene numbering support as well as all the other features our XEM edition has to offer. Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files, such as cache.db and failed.db, if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk! Enjoy! 2014-03-10 01:18:05 -04:00			`import sys`
			`import os`
			`import json`
			`import re`

			`import html5lib`
			`import support`
			`import test_parser`
			`import test_tokenizer`

			`p = html5lib.HTMLParser()`

			`unnamespaceExpected = re.compile(r"^(\\|\s*)<html ([^>]+)>", re.M).sub`

			`def main(out_path):`
			`if not os.path.exists(out_path):`
			`sys.stderr.write("Path %s does not exist"%out_path)`
			`sys.exit(1)`

			`for filename in support.html5lib_test_files('tokenizer', '*.test'):`
			`run_file(filename, out_path)`

			`def run_file(filename, out_path):`
			`try:`
			`tests_data = json.load(file(filename))`
			`except ValueError:`
			`sys.stderr.write("Failed to load %s\n"%filename)`
			`return`
			`name = os.path.splitext(os.path.split(filename)[1])[0]`
			`output_file = open(os.path.join(out_path, "tokenizer_%s.dat"%name), "w")`

			`if 'tests' in tests_data:`
			`for test_data in tests_data['tests']:`
			`if 'initialStates' not in test_data:`
			`test_data["initialStates"] = ["Data state"]`

			`for initial_state in test_data["initialStates"]:`
			`if initial_state != "Data state":`
			`#don't support this yet`
			`continue`
			`test = make_test(test_data)`
			`output_file.write(test)`

			`output_file.close()`

			`def make_test(test_data):`
			`if 'doubleEscaped' in test_data:`
			`test_data = test_tokenizer.unescape_test(test_data)`

			`rv = []`
			`rv.append("#data")`
			`rv.append(test_data["input"].encode("utf8"))`
			`rv.append("#errors")`
			`tree = p.parse(test_data["input"])`
			`output = p.tree.testSerializer(tree)`
			`output = "\n".join(("\| "+ line[3:]) if line.startswith("\| ") else line`
			`for line in output.split("\n"))`
			`output = unnamespaceExpected(r"\1<\2>", output)`
			`rv.append(output.encode("utf8"))`
			`rv.append("")`
			`return "\n".join(rv)`

			`if __name__ == "__main__":`
			`main(sys.argv[1])`