# release11-tools/Backend-libXML/Parser.py

from lxml import etree, html
from io import BytesIO


def convertHTML(source: str, sourceFrom: str):
    """Convert a document between its XML and HTML serializations.

    :param source: text of the document to convert
    :param sourceFrom: format of ``source``; "xml" converts XML to HTML,
    "html" converts HTML to XML
    :return: converted document as a string, or None when ``sourceFrom``
    is neither "xml" nor "html"
    """
    if sourceFrom == "xml":
        # Comments and ignorable whitespace are dropped while parsing.
        xml_parser = etree.XMLParser(remove_comments=True, remove_blank_text=True)
        xml_tree = etree.parse(BytesIO(source.encode("utf-8")), xml_parser)
        html_bytes = html.tostring(
            xml_tree,
            method="html",
            pretty_print=True,
            doctype="<!DOCTYPE html>",
        )
        return html_bytes.decode()

    if sourceFrom == "html":
        html_parser = html.HTMLParser(remove_comments=True, remove_blank_text=True)
        html_tree = html.parse(BytesIO(source.encode("utf-8")), html_parser)
        # NOTE(review): the empty doctype is presumably passed to keep an
        # HTML DOCTYPE out of the XML output - confirm before changing it.
        xml_bytes = etree.tostring(
            html_tree,
            method="xml",
            pretty_print=True,
            doctype="",
            xml_declaration=True,
            encoding="utf-8",
        )
        return xml_bytes.decode()

    # Unknown source format: nothing to convert.
    return None


def formatHTML(source: str, prettify: bool) -> str:
    """Method used to format HTML

    Comments, processing instructions and ignorable whitespace are
    dropped by the parser before the document is serialized again.

    :param source: HTML to format
    :param prettify: True to get indented output, False to get the
    document with line breaks removed
    :return: formatted HTML
    """
    html_parser = html.HTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True)
    document = html.parse(BytesIO(source.encode("utf-8")), parser=html_parser)

    if prettify:
        # Pretty output goes through the generic etree serializer.
        return etree.tostring(document, encoding='unicode', pretty_print=True)

    # Compact output: serialize as HTML, then strip every newline and
    # any four-space run that directly follows a closing angle bracket.
    serialized = html.tostring(document).decode()
    without_newlines = serialized.replace("\n", "")
    return without_newlines.replace(">    ", ">")

def formatXML(source: str, prettify: bool) -> str:
    """Method used to format XML

    :param source: XML to format
    :param prettify: sets if XML must be prettified
    (added indentations etc.) or not; when False, every run of
    whitespace in the serialized document is collapsed to one space
    :return: formatted XML, preceded by the XML declaration of the
    input when it had one
    """

    # The XML declaration is not part of the serialized output,
    # so it is cut off the input here and put back in front of the result.
    # Only a declaration at the very start of the document qualifies:
    # any other processing instruction (for example one inside the root
    # element) has to stay in the source, otherwise everything before it
    # would be thrown away and parsing would fail.
    prolog = ""
    leading_trimmed = source.lstrip("\ufeff \t\r\n")

    # "<?xml" must be followed by whitespace to be the declaration;
    # this keeps targets such as "<?xml-stylesheet" out of the prolog.
    if leading_trimmed.startswith("<?xml") and leading_trimmed[5:6].isspace():
        declaration_end = leading_trimmed.find("?>")
        # An unterminated declaration is left in place so that the
        # parser reports the syntax error.
        if declaration_end != -1:
            declaration_end += 2
            prolog = leading_trimmed[:declaration_end]
            source = leading_trimmed[declaration_end:].strip()

    byte_input = BytesIO(source.encode("utf-8"))
    parser = etree.XMLParser(remove_blank_text=True)
    xml = etree.parse(byte_input, parser=parser)

    if prettify:
        # The separating newline is emitted even without a declaration,
        # which keeps the output identical to what callers got before.
        pretty_xml = etree.tostring(xml, pretty_print=True, encoding="utf-8").decode("utf-8")
        return prolog + "\n" + pretty_xml

    raw_xml = etree.tostring(xml, encoding="utf-8").decode("utf-8")
    # Collapse all whitespace runs (including those in text content).
    raw_xml = " ".join(raw_xml.split())

    return prolog + raw_xml

def xpath(source: str, xpath: str) -> "tuple[str, str]":
    """
    Method used to get nodes from XML string using XPath

    :param source: XML string used for selection
    :param xpath: XPath query used for selection
    :return: Tuple of two strings: the result rendered as text and the
    kind of result. The kind is "node" when the query produced a list,
    otherwise it is the type name of the returned value.
    """

    byte_input = BytesIO(source.encode("utf-8"))
    root = etree.parse(byte_input).getroot()

    # LXML doesn't accept an empty (None) namespace prefix in XPath,
    # so the default namespace is left out of the prefix map.
    nsmap = {prefix: uri for prefix, uri in root.nsmap.items() if prefix is not None}

    result = root.xpath(xpath, namespaces=nsmap)

    # root.xpath can return 4 types: float, string, bool and list.
    # List is the only one that can't be simply converted to str
    if not isinstance(result, list):
        return str(result), type(result).__name__

    # A result list does not hold elements only: text nodes and attribute
    # values come back as strings and namespace nodes as tuples. Those
    # cannot be passed to etree.tostring, so they are rendered with str().
    rendered = [
        etree.tostring(item, pretty_print=True).decode() if etree.iselement(item) else str(item)
        for item in result
    ]
    return "".join(entry + "\n" for entry in rendered), "node"


def xsd(source: str, xsd: str) -> str:
    """
    Method used to validate XML string against XSD schema

    Errors raised while parsing either input or while building the
    schema are not handled here and propagate to the caller.

    :param source: XML string used for validation
    :param xsd: XSD schema to validate XML against
    :return: "XML is valid" when validation succeeds, otherwise the
    message of the validation error
    """

    schema_input = BytesIO(xsd.encode("utf-8"))
    xml_schema = etree.XMLSchema(etree.parse(schema_input).getroot())

    document_input = BytesIO(source.encode("utf-8"))
    xml = etree.parse(document_input).getroot()

    # Only the validating call is guarded; the success message is
    # returned outside of the try block.
    try:
        xml_schema.assertValid(xml)
    except etree.DocumentInvalid as e:
        return str(e)
    return "XML is valid"

def xslt(source: str, xslt: str) -> str:
    """
    Method used to apply an XSLT stylesheet to an XML string

    :param source: XML string to transform
    :param xslt: XSLT stylesheet, given as a string
    :return: Result of the transformation, prettified by formatXML
    """
    # Build the transformer from the stylesheet first.
    stylesheet_tree = etree.parse(BytesIO(xslt.encode("utf-8")))
    transformer = etree.XSLT(stylesheet_tree)

    # The transformation is run on the root element of the document.
    document_root = etree.parse(BytesIO(source.encode("utf-8"))).getroot()

    result_text = str(transformer(document_root))
    return formatXML(result_text, True)