xeps/tools/extract-metadata.py

#!/usr/bin/env python3
import pathlib
import sys
import xml.dom.minidom

import xml.etree.ElementTree as etree

from xeplib import (
    minidom_find_child,
    minidom_find_header,
    minidom_get_text,
    minidom_children,
)


# Texts handed to argparse.ArgumentParser in main() as ``description`` and
# ``epilog`` respectively. EPILOG is currently the empty string.
DESCRIPTION = """\
Extract a list of XEPs with metadata from the xeps repository."""

EPILOG = """"""


def open_xml(f):
    """Parse XML from *f* and return the resulting minidom document.

    *f* is passed unchanged to ``xml.dom.minidom.parse``; any exception
    raised by the parser propagates to the caller.
    """
    document = xml.dom.minidom.parse(f)
    return document


def extract_revision_text(remark_el):
    """Turn a revision ``<remark/>`` element into plain text.

    The result depends on what ``minidom_children(remark_el)`` returns:

    * no children: the text of the remark itself, or ``None`` if that text
      is empty;
    * exactly one ``<p/>`` child: the text of that paragraph;
    * otherwise: ``None`` if ``minidom_get_text(remark_el)`` contains
      anything other than whitespace (presumably text mixed in between the
      child elements -- confirm against xeplib); else one line per ``<p/>``
      and one ``"* "``-prefixed line per ``<li/>`` of each ``<ul/>``,
      joined with newlines. Children with any other tag are skipped.
    """
    children = minidom_children(remark_el)
    child_count = len(children)

    if child_count == 0:
        return minidom_get_text(remark_el) or None

    if child_count == 1 and children[0].tagName == "p":
        return minidom_get_text(children[0])

    if minidom_get_text(remark_el).strip():
        return None

    pieces = []
    for node in children:
        tag = node.tagName
        if tag == "p":
            pieces.append(minidom_get_text(node))
        elif tag == "ul":
            # Each list item becomes its own bullet line.
            pieces.extend(
                "* {}".format(minidom_get_text(item))
                for item in minidom_children(node)
                if item.tagName == "li"
            )

    return "\n".join(pieces)


def extract_xep_metadata(document):
    """Collect the header metadata of a parsed XEP document into a dict.

    The header is located with ``minidom_find_header``. The returned dict
    has the keys ``last_revision`` (a nested dict with ``version``,
    ``date``, ``initials`` and ``remark``), ``status``, ``type``, ``sig``,
    ``abstract``, ``shortname``, ``title``, ``approver`` and ``last_call``.

    * All ``last_revision`` values are ``None`` when the header has no
      ``<revision/>`` child. The initials are only looked up when a remark
      text could be extracted.
    * ``sig`` and ``last_call`` are ``None`` when their elements are absent.
    * ``shortname`` is ``None`` when it holds one of the known placeholder
      strings.
    * ``approver`` falls back to ``"Board"`` for type ``"Procedural"`` and
      to ``"Council"`` otherwise.
    * ``abstract`` has all runs of whitespace collapsed to single spaces.
    """
    header = minidom_find_header(document)

    def child_text(parent, tag):
        # Text of a child element; the lookup result is passed on unchecked.
        return minidom_get_text(minidom_find_child(parent, tag))

    def optional_child_text(parent, tag):
        # Text of a child element, or None when the child does not exist.
        node = minidom_find_child(parent, tag)
        return None if node is None else minidom_get_text(node)

    revision_info = {
        "version": None,
        "date": None,
        "initials": None,
        "remark": None,
    }
    revision_el = minidom_find_child(header, "revision")
    if revision_el is not None:
        revision_info["version"] = child_text(revision_el, "version")
        revision_info["date"] = child_text(revision_el, "date")

        remark_el = minidom_find_child(revision_el, "remark")
        if remark_el is not None:
            revision_info["remark"] = extract_revision_text(remark_el)

        if revision_info["remark"] is not None:
            initials_el = minidom_find_child(revision_el, "initials")
            revision_info["initials"] = (
                minidom_get_text(initials_el) if initials_el else initials_el
            )

    status = child_text(header, "status")
    type_ = child_text(header, "type")
    abstract = " ".join(child_text(header, "abstract").split())
    sig = optional_child_text(header, "sig")

    shortname = child_text(header, "shortname")
    placeholders = [
        "not yet assigned", "n/a", "none", "to be assigned",
        "to be issued",
    ]
    normalized = shortname.replace("-", " ").replace("_", " ").lower()
    if normalized in placeholders:
        shortname = None

    title = child_text(header, "title")

    approver_el = minidom_find_child(header, "approver")
    if approver_el is None:
        if type_ == "Procedural":
            approver = "Board"
        else:
            approver = "Council"
    else:
        approver = minidom_get_text(approver_el)

    last_call = optional_child_text(header, "lastcall")

    return {
        "last_revision": revision_info,
        "status": status,
        "type": type_,
        "sig": sig,
        "abstract": abstract,
        "shortname": shortname,
        "title": title,
        "approver": approver,
        "last_call": last_call,
    }


def text_element(tag, text):
    """Return a new ElementTree element named *tag* whose text is *text*."""
    node = etree.Element(tag)
    node.text = text
    return node


def make_metadata_element(number, metadata, accepted, *, protoname=None):
    """Build the ``<xep/>`` element describing a single document.

    *number* becomes the text of ``<number/>``. *metadata* is a dict with
    the keys ``title``, ``abstract``, ``type``, ``status``, ``approver``,
    ``shortname``, ``last_revision``, ``sig`` and ``last_call``.

    ``<shortname/>``, ``<sig/>`` and ``<lastcall/>`` are only emitted when
    their value is not ``None``. ``<last-revision/>`` is only emitted when
    the revision has a version; inside it, ``<initials/>`` and
    ``<remark/>`` are only emitted when truthy. The ``accepted`` attribute
    is set to ``"true"`` or ``"false"`` depending on the truthiness of
    *accepted*, and ``<proto-name/>`` is appended when *protoname* is
    given.
    """
    xep_el = etree.Element("xep")
    xep_el.append(text_element("number", number))
    for key in ("title", "abstract", "type", "status", "approver"):
        xep_el.append(text_element(key, metadata[key]))

    shortname = metadata["shortname"]
    if shortname is not None:
        xep_el.append(text_element("shortname", shortname))

    revision = metadata["last_revision"]
    if revision["version"] is not None:
        revision_el = etree.Element("last-revision")
        # Date and version are always present once there is a revision ...
        for key in ("date", "version"):
            revision_el.append(text_element(key, revision[key]))
        # ... while initials and remark are dropped when empty or missing.
        for key in ("initials", "remark"):
            value = revision[key]
            if value:
                revision_el.append(text_element(key, value))
        xep_el.append(revision_el)

    for key, tag in (("sig", "sig"), ("last_call", "lastcall")):
        value = metadata[key]
        if value is not None:
            xep_el.append(text_element(tag, value))

    xep_el.set("accepted", "true" if accepted else "false")

    if protoname is not None:
        xep_el.append(text_element("proto-name", protoname))

    return xep_el


def parse_checked_and_print_error(xepfile):
    """Parse *xepfile* as XML, reporting malformed documents on stderr.

    *xepfile* must offer ``open("rb")`` as a context manager (main() passes
    ``pathlib.Path`` objects). Returns whatever ``open_xml`` returns on
    success. If the parser raises ``ExpatError``, a ``"<file>: <error>"``
    line is printed to stderr and ``None`` is returned. Any other
    exception, such as an ``OSError`` from opening the file, propagates
    unchanged.
    """
    # Import the exception class explicitly. The previous spelling,
    # ``xml.parsers.expat.ExpatError``, only resolved because the parser had
    # already imported that submodule as a side effect; when opening the file
    # failed first, evaluating the ``except`` clause could itself raise
    # AttributeError and hide the real error.
    from xml.parsers.expat import ExpatError

    try:
        with xepfile.open("rb") as f:
            return open_xml(f)
    except ExpatError as exc:
        print("{}: {}".format(xepfile, exc), file=sys.stderr)
        return None


def main():
    """Command-line entry point: write XEP metadata as XML to stdout.

    Takes one optional positional argument, the directory holding the XEP
    XML files (defaults to the current working directory). Every
    ``xep-NNNN.xml`` file in that directory is emitted as an accepted XEP,
    and every ``*.xml`` file in its ``inbox`` subdirectory as a
    not-yet-accepted proposal with number ``"xxxx"`` and the file stem as
    proto-name.

    If any file fails to parse, the process exits with status 2 and no XML
    is written; the individual errors have already been printed to stderr.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        epilog=EPILOG,
    )
    parser.add_argument(
        "xepdir",
        nargs="?",
        type=pathlib.Path,
        default=pathlib.Path.cwd(),
        help="Directory where the XEP XMLs are. Defaults to current directory."
    )

    args = parser.parse_args()

    tree = etree.Element("xep-infos")

    has_error = False

    # glob() yields entries in arbitrary filesystem order; sort so that the
    # generated document is reproducible between runs and machines.
    for xepfile in sorted(args.xepdir.glob("xep-[0-9][0-9][0-9][0-9].xml")):
        number = xepfile.name.split("-", 1)[1].split(".", 1)[0]
        try:
            # Normalise away leading zeros ("0045" -> "45").
            number = str(int(number))
        except ValueError:
            continue

        parsed = parse_checked_and_print_error(xepfile)
        if parsed is None:
            has_error = True
            continue

        tree.append(make_metadata_element(
            number,
            extract_xep_metadata(parsed),
            True,
        ))

    for xepfile in sorted((args.xepdir / "inbox").glob("*.xml")):
        protoname = xepfile.name.rsplit(".", 1)[0]

        parsed = parse_checked_and_print_error(xepfile)
        if parsed is None:
            has_error = True
            continue

        tree.append(make_metadata_element(
            "xxxx",
            extract_xep_metadata(parsed),
            False,
            protoname=protoname
        ))

    if has_error:
        sys.exit(2)

    # Write through the buffered binary layer rather than the raw stream: a
    # raw write() may accept only part of the data, and not every stdout
    # replacement exposes ``.raw``.
    out = sys.stdout.buffer
    out.write(etree.tostring(tree))
    out.flush()


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`#!/usr/bin/env python3`
			`import pathlib`
tooling: print metadata extraction errors to stderr 2017-08-23 10:33:28 -04:00			`import sys`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`import xml.dom.minidom`

			`import xml.etree.ElementTree as etree`

tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`from xeplib import (`
			`minidom_find_child,`
			`minidom_find_header,`
			`minidom_get_text,`
			`minidom_children,`
			`)`

tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00
			`DESCRIPTION = """\`
			`Extract a list of XEPs with metadata from the xeps repository."""`

			`EPILOG = """"""`


			`def open_xml(f):`
			`return xml.dom.minidom.parse(f)`


Support for multi-<p/> revision block remarks 2017-11-29 05:55:26 -05:00			`def extract_revision_text(remark_el):`
			`remark_children = minidom_children(remark_el)`
			`if len(remark_children) == 1 and remark_children[0].tagName == "p":`
			`return minidom_get_text(remark_children[0])`
			`if len(remark_children) == 0:`
			`return minidom_get_text(remark_el) or None`
			`if minidom_get_text(remark_el).strip():`
			`return None`
			`lines = []`
			`for child in remark_children:`
			`if child.tagName == "p":`
			`lines.append(minidom_get_text(child))`
tools: Support for <ul/> in revision history 2019-01-13 06:20:01 -05:00			`elif child.tagName == "ul":`
			`for ul_child in minidom_children(child):`
			`if ul_child.tagName == "li":`
			`lines.append("* {}".format(minidom_get_text(ul_child)))`
Support for multi-<p/> revision block remarks 2017-11-29 05:55:26 -05:00
			`return "\n".join(lines)`


tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`def extract_xep_metadata(document):`
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`header = minidom_find_header(document)`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`latest_revision = minidom_find_child(header, "revision")`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`if latest_revision is not None:`
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`last_revision_version = minidom_get_text(`
			`minidom_find_child(latest_revision, "version")`
			`)`
			`last_revision_date = minidom_get_text(`
			`minidom_find_child(latest_revision, "date")`
			`)`
			`remark_el = minidom_find_child(latest_revision, "remark")`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`last_revision_remark = None`
			`if remark_el is not None:`
Support for multi-<p/> revision block remarks 2017-11-29 05:55:26 -05:00			`last_revision_remark = extract_revision_text(remark_el)`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00
			`if last_revision_remark is not None:`
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`initials_el = minidom_find_child(latest_revision, "initials")`
			`last_revision_initials = initials_el and minidom_get_text(`
			`initials_el`
			`)`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`else:`
			`last_revision_initials = None`
			`else:`
			`last_revision_version = None`
			`last_revision_date = None`
			`last_revision_remark = None`
			`last_revision_initials = None`

tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`status = minidom_get_text(minidom_find_child(header, "status"))`
			`type_ = minidom_get_text(minidom_find_child(header, "type"))`
			`abstract = " ".join(minidom_get_text(`
			`minidom_find_child(header, "abstract")`
			`).split())`
			`sig_el = minidom_find_child(header, "sig")`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`if sig_el is None:`
			`sig = None`
			`else:`
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`sig = minidom_get_text(sig_el)`
			`shortname = minidom_get_text(minidom_find_child(header, "shortname"))`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`if shortname.replace("-", " ").replace("_", " ").lower() in [`
			`"not yet assigned", "n/a", "none", "to be assigned",`
			`"to be issued"]:`
			`shortname = None`
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`title = minidom_get_text(minidom_find_child(header, "title"))`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`approver_el = minidom_find_child(header, "approver")`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`if approver_el is not None:`
tooling: refactor commonly used stuff into xeplib.py 2017-08-23 07:47:09 -04:00			`approver = minidom_get_text(approver_el)`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`else:`
			`approver = "Board" if type_ == "Procedural" else "Council"`

tools: Update tooling to support Last Calls 2017-10-16 14:39:21 -04:00			`last_call_el = minidom_find_child(header, "lastcall")`
			`if last_call_el is not None:`
			`last_call = minidom_get_text(last_call_el)`
			`else:`
			`last_call = None`

tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`return {`
			`"last_revision": {`
			`"version": last_revision_version,`
			`"date": last_revision_date,`
			`"initials": last_revision_initials,`
			`"remark": last_revision_remark,`
			`},`
			`"status": status,`
			`"type": type_,`
			`"sig": sig,`
			`"abstract": abstract,`
			`"shortname": shortname,`
			`"title": title,`
			`"approver": approver,`
tools: Update tooling to support Last Calls 2017-10-16 14:39:21 -04:00			`"last_call": last_call,`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`}`


			`def text_element(tag, text):`
			`el = etree.Element(tag)`
			`el.text = text`
			`return el`


			`def make_metadata_element(number, metadata, accepted, *, protoname=None):`
			`result = etree.Element("xep")`
			`result.append(text_element("number", number))`
			`result.append(text_element("title", metadata["title"]))`
			`result.append(text_element("abstract", metadata["abstract"]))`
			`result.append(text_element("type", metadata["type"]))`
			`result.append(text_element("status", metadata["status"]))`
			`result.append(text_element("approver", metadata["approver"]))`

			`if metadata["shortname"] is not None:`
			`result.append(text_element("shortname", metadata["shortname"]))`

			`if metadata["last_revision"]["version"] is not None:`
			`last_revision = metadata["last_revision"]`
			`revision_el = etree.Element("last-revision")`
			`revision_el.append(text_element("date", last_revision["date"]))`
			`revision_el.append(text_element("version", last_revision["version"]))`
			`if last_revision["initials"]:`
			`revision_el.append(text_element("initials",`
			`last_revision["initials"]))`
			`if last_revision["remark"]:`
			`revision_el.append(text_element("remark",`
			`last_revision["remark"]))`
			`result.append(revision_el)`

			`if metadata["sig"] is not None:`
			`result.append(`
			`text_element("sig", metadata["sig"])`
			`)`

tools: Update tooling to support Last Calls 2017-10-16 14:39:21 -04:00			`if metadata["last_call"] is not None:`
			`result.append(`
			`text_element("lastcall", metadata["last_call"])`
			`)`

tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`if accepted:`
			`result.set("accepted", "true")`
			`else:`
			`result.set("accepted", "false")`

			`if protoname is not None:`
			`result.append(text_element("proto-name", protoname))`

			`return result`


extract-metadata: make error handling cleaner 2017-08-23 11:00:55 -04:00			`def parse_checked_and_print_error(xepfile):`
			`try:`
			`with xepfile.open("rb") as f:`
			`return open_xml(f)`
			`except xml.parsers.expat.ExpatError as exc:`
			`print("{}: {}".format(xepfile, exc), file=sys.stderr)`
			`return None`


tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`def main():`
			`import argparse`
			`import sys`

			`parser = argparse.ArgumentParser(`
			`description=DESCRIPTION,`
			`epilog=EPILOG,`
			`)`
			`parser.add_argument(`
			`"xepdir",`
			`nargs="?",`
			`type=pathlib.Path,`
			`default=pathlib.Path.cwd(),`
			`help="Directory where the XEP XMLs are. Defaults to current directory."`
			`)`

			`args = parser.parse_args()`

			`tree = etree.Element("xep-infos")`

tooling: print metadata extraction errors to stderr 2017-08-23 10:33:28 -04:00			`has_error = False`

tools: be more specific about XEP filenames Sometimes the main working directory may have other XML files such as the temporary files generated by TeXML (xep-*.tex.xml) which may be picked up by the metadata script. Use a more specific pattern so that it only extracts metadata from real XEP files. 2019-12-19 18:04:42 -05:00			`for xepfile in args.xepdir.glob("xep-[0-9][0-9][0-9][0-9].xml"):`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00			`number = xepfile.name.split("-", 1)[1].split(".", 1)[0]`
			`try:`
			`number = str(int(number))`
			`except ValueError:`
			`continue`

extract-metadata: make error handling cleaner 2017-08-23 11:00:55 -04:00			`parsed = parse_checked_and_print_error(xepfile)`
			`if parsed is None:`
tooling: print metadata extraction errors to stderr 2017-08-23 10:33:28 -04:00			`has_error = True`
extract-metadata: make error handling cleaner 2017-08-23 11:00:55 -04:00			`continue`

			`tree.append(make_metadata_element(`
			`number,`
			`extract_xep_metadata(parsed),`
			`True,`
			`))`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00
			`for xepfile in (args.xepdir / "inbox").glob("*.xml"):`
			`protoname = xepfile.name.rsplit(".", 1)[0]`

extract-metadata: make error handling cleaner 2017-08-23 11:00:55 -04:00			`parsed = parse_checked_and_print_error(xepfile)`
			`if parsed is None:`
tooling: print metadata extraction errors to stderr 2017-08-23 10:33:28 -04:00			`has_error = True`
extract-metadata: make error handling cleaner 2017-08-23 11:00:55 -04:00			`continue`

			`tree.append(make_metadata_element(`
			`"xxxx",`
			`extract_xep_metadata(parsed),`
			`False,`
			`protoname=protoname`
			`))`
tooling: print metadata extraction errors to stderr 2017-08-23 10:33:28 -04:00
			`if has_error:`
			`sys.exit(2)`
tooling: Tool to extract a XEP metadata list from the repository 2017-08-23 03:26:05 -04:00
			`sys.stdout.buffer.raw.write(etree.tostring(tree))`


			`if __name__ == "__main__":`
			`main()`