Files
release11-tools/Backend-libXML/Parser.py
2025-02-10 13:24:49 +01:00

124 lines
4.2 KiB
Python

from lxml import etree, html
from io import BytesIO
def convertHTML(source: str, sourceFrom: str):
    """Convert a document between its XML and HTML serialisations.

    :param source: text of the document to convert
    :param sourceFrom: format of ``source``; "xml" converts XML to HTML,
    "html" converts HTML to XML
    :return: converted document as a string, or None when ``sourceFrom``
    is neither "xml" nor "html"
    """
    # Only the parser needed for the requested direction is created;
    # an unsupported format returns before any parsing work is done.
    if sourceFrom == "xml":
        xml_parser = etree.XMLParser(remove_comments=True, remove_blank_text=True)
        xml_doc = etree.parse(BytesIO(source.encode("utf-8")), xml_parser)
        return html.tostring(
            xml_doc,
            method="html",
            pretty_print=True,
            doctype="<!DOCTYPE html>",
        ).decode()
    if sourceFrom == "html":
        html_parser = html.HTMLParser(remove_comments=True, remove_blank_text=True)
        html_doc = html.parse(BytesIO(source.encode("utf-8")), html_parser)
        return etree.tostring(
            html_doc,
            method="xml",
            pretty_print=True,
            doctype="",
            xml_declaration=True,
            encoding="utf-8",
        ).decode()
    return None
def formatHTML(source: str, prettify: bool) -> str:
    """Method used to format HTML

    :param source: HTML to format
    :param prettify: when true the document is pretty-printed,
    otherwise it is returned in a compact single-line form
    :return: formatted HTML
    """
    # Comments, processing instructions and blank text are dropped by the parser
    html_parser = html.HTMLParser(
        remove_blank_text=True,
        remove_comments=True,
        remove_pis=True,
    )
    document = html.parse(BytesIO(source.encode("utf-8")), parser=html_parser)
    if prettify:
        return etree.tostring(document, encoding='unicode', pretty_print=True)
    # Compact form: strip line breaks, then the space that follows a closing ">"
    serialized = html.tostring(document).decode()
    single_line = serialized.replace("\n", "")
    return single_line.replace("> ", ">")
def formatXML(source: str, prettify: bool) -> str:
    """Method used to format XML

    :param source: XML to format
    :param prettify: sets if XML must be prettified
    (added indentations etc.) or not
    :return: formatted XML, preceded by the prolog copied from ``source``
    (if there was one)
    """
    # Prolog is removed when XML is parsed
    # so program has to copy it.
    # Only a "<?...?>" at the very beginning of the document (after an
    # optional BOM and whitespace) is treated as the prolog. A processing
    # instruction further inside the document must stay where it is;
    # cutting the source at that point would drop everything before it.
    prolog = ""
    leading = source.lstrip("\ufeff").lstrip()
    if leading.startswith("<?"):
        terminator = leading.find("?>")
        # An unterminated "<?" is left for the parser to report
        if terminator != -1:
            prolog_end = terminator + 2
            prolog = leading[:prolog_end]
            source = leading[prolog_end:].strip()

    byte_input = BytesIO(source.encode("utf-8"))
    parser = etree.XMLParser(remove_blank_text=True)
    xml = etree.parse(byte_input, parser=parser)

    if prettify:
        # The newline is emitted even when there is no prolog
        # (existing behaviour, kept for backward compatibility)
        pretty_xml = etree.tostring(xml, pretty_print=True, encoding="utf-8").decode("utf-8")
        return prolog + "\n" + pretty_xml

    # Compact form: every run of whitespace collapses to a single space
    raw_xml = etree.tostring(xml, encoding="utf-8").decode("utf-8")
    return prolog + " ".join(raw_xml.split())
def xpath(source: str, xpath: str) -> tuple:
    """
    Method used to get nodes from XML string using XPath

    :param source: XML string used for selection
    :param xpath: XPath query used for selection
    :return: Tuple of (result as text, result type name). For list results
    the type name is "node" and every item is followed by a newline.
    """
    byte_input = BytesIO(source.encode("utf-8"))
    root = etree.parse(byte_input).getroot()

    # LXML doesn't accept empty (None) namespace prefix,
    # so it needs to be left out if it exists
    nsmap = {
        prefix: uri
        for prefix, uri in root.nsmap.items()
        if prefix is not None
    }

    result = root.xpath(xpath, namespaces=nsmap)

    # root.xpath can return 4 types: float, string, bool and list.
    # List is the only one that can't be simply converted to str
    if not isinstance(result, list):
        return str(result), type(result).__name__

    # A list may hold elements, but also plain values (e.g. results of
    # "//text()" or "//@attr"); only elements can go through
    # etree.tostring, the rest is converted with str().
    pieces = []
    for item in result:
        if etree.iselement(item):
            pieces.append(etree.tostring(item, pretty_print=True).decode())
        else:
            pieces.append(str(item))
    return "".join(piece + "\n" for piece in pieces), "node"
def xsd(source: str, xsd: str) -> str:
    """
    Method used to validate XML string against XSD schema

    :param source: XML string used for validation
    :param xsd: XSD schema to validate XML against
    :return: "XML is valid" when validation succeeds, otherwise the
    text of the validation error
    """
    schema_input = BytesIO(xsd.encode("utf-8"))
    xml_schema = etree.XMLSchema(etree.parse(schema_input).getroot())

    document_input = BytesIO(source.encode("utf-8"))
    xml = etree.parse(document_input).getroot()

    # Only the validation call itself is guarded; parsing problems in
    # the schema or the document propagate to the caller as before.
    try:
        xml_schema.assertValid(xml)
    except etree.DocumentInvalid as e:
        return str(e)
    return "XML is valid"
def xslt(source: str, xslt: str) -> str:
    """
    Method used to transform XML string using XSLT

    :param source: XML string to transform
    :param xslt: XSLT string used to transform XML
    :return: Result of transformation, prettified when it is
    well-formed XML and returned unchanged otherwise
    """
    # NOTE(review): if source/xslt can come from untrusted users, consider
    # restricting what the stylesheet may access — confirm against deployment.
    xslt_input = BytesIO(xslt.encode("utf-8"))
    xslt_transform = etree.XSLT(etree.parse(xslt_input))

    document_input = BytesIO(source.encode("utf-8"))
    xml = etree.parse(document_input).getroot()

    transformed = str(xslt_transform(xml))
    try:
        return formatXML(transformed, True)
    except etree.XMLSyntaxError:
        # A stylesheet may legitimately produce output that is not a
        # well-formed XML document (plain text, HTML, nothing at all).
        # Such a result cannot be re-parsed for prettifying, so it is
        # handed back exactly as the transformation produced it.
        return transformed