124 lines
4.2 KiB
Python
124 lines
4.2 KiB
Python
from lxml import etree, html
|
|
from io import BytesIO
|
|
|
|
|
|
def convertHTML(source: str, sourceFrom: str):
    """Convert a document between XML and HTML serialisation.

    Comments and whitespace-only text nodes are dropped while parsing.

    :param source: document text to convert
    :param sourceFrom: format of ``source``; ``"xml"`` re-serialises the
                       document as HTML (with an HTML5 doctype), ``"html"``
                       re-serialises it as XML (with an XML declaration)
    :return: converted document, or ``None`` when ``sourceFrom`` is neither
             ``"xml"`` nor ``"html"``
    """
    # The text is always handed to lxml as UTF-8 bytes, so the parsers are
    # told to decode it as UTF-8. Otherwise an encoding named inside the
    # document (XML declaration, <meta charset>) or libxml2's own guess
    # would be applied to bytes that are not in that encoding.
    document_input = BytesIO(source.encode("utf-8"))

    if sourceFrom == "xml":
        xmlParser = etree.XMLParser(
            remove_comments=True, remove_blank_text=True, encoding="utf-8"
        )
        xmldoc = etree.parse(document_input, xmlParser)
        return html.tostring(
            xmldoc, method="html", pretty_print=True, doctype="<!DOCTYPE html>"
        ).decode()

    if sourceFrom == "html":
        htmlParser = html.HTMLParser(
            remove_comments=True, remove_blank_text=True, encoding="utf-8"
        )
        htmldoc = html.parse(document_input, htmlParser)
        return etree.tostring(
            htmldoc,
            method="xml",
            pretty_print=True,
            doctype="",
            xml_declaration=True,
            encoding="utf-8",
        ).decode()

    # Unknown source format: nothing to convert.
    return None
|
|
|
|
|
|
def formatHTML(source: str, prettify: bool) -> str:
    """Format an HTML document.

    Comments, processing instructions and whitespace-only text nodes are
    dropped while parsing.

    :param source: HTML to format
    :param prettify: if true, return an indented document; otherwise return
                     the document with newlines and the space following a
                     ``>`` removed
    :return: formatted HTML
    """
    # The text is handed to lxml as UTF-8 bytes, so force UTF-8 decoding;
    # without it the HTML parser has to guess the charset (or trusts a
    # <meta charset> that no longer matches) and may garble non-ASCII text.
    parser = html.HTMLParser(
        remove_blank_text=True,
        remove_comments=True,
        remove_pis=True,
        encoding="utf-8",
    )
    htmlDoc = html.parse(BytesIO(source.encode("utf-8")), parser=parser)

    if prettify:
        # NOTE(review): etree.tostring is called without method="html", so the
        # prettified output uses lxml's default serialisation - confirm that
        # this is the intended output format.
        return etree.tostring(htmlDoc, encoding="unicode", pretty_print=True)

    minified = html.tostring(htmlDoc).decode()
    return minified.replace("\n", "").replace("> ", ">")
|
|
|
|
def formatXML(source: str, prettify: bool) -> str:
    """Method used to format XML

    :param source: XML to format
    :param prettify: sets if XML must be prettified
                     (added indentations etc.) or not
    :return: formatted XML
    """

    # Prolog is removed when XML is parsed
    # so program has to copy it.
    # Only a "<?...?>" that opens the document is treated as the prolog:
    # a processing instruction found later must stay where it is, otherwise
    # everything in front of it would be cut off before parsing.
    prolog = ""
    document = source.lstrip("\ufeff \t\r\n")

    if document.startswith("<?"):
        prolog_end = document.find("?>")
        # An unterminated prolog is left in place so the parser reports it.
        if prolog_end != -1:
            prolog_end += 2
            prolog = document[:prolog_end]
            document = document[prolog_end:].strip()

    byte_input = BytesIO(document.encode("utf-8"))
    parser = etree.XMLParser(remove_blank_text=True)
    xml = etree.parse(byte_input, parser=parser)

    if prettify:
        pretty_xml = etree.tostring(
            xml, pretty_print=True, encoding="utf-8"
        ).decode("utf-8")
        # Separate prolog and body with a newline, but do not start the
        # output with an empty line when there is no prolog.
        if prolog:
            return prolog + "\n" + pretty_xml
        return pretty_xml

    raw_xml = etree.tostring(xml, encoding="utf-8").decode("utf-8")
    # Collapse every run of whitespace (including inside text) to one space.
    raw_xml = " ".join(raw_xml.split())

    return prolog + raw_xml
|
|
|
|
def xpath(source: str, xpath: str) -> "tuple[str, str]":
    """
    Method used to get nodes from XML string using XPath

    :param source: XML string used for selection
    :param xpath: XPath query used for selection
    :return: Pair ``(result, kind)``: the selection rendered as text and
             either the Python type name of a scalar result or ``"node"``
             when the query produced a list
    """

    # The text is handed to lxml as UTF-8 bytes, so force UTF-8 decoding
    # even if the XML declaration names another encoding.
    parser = etree.XMLParser(encoding="utf-8")
    root = etree.parse(BytesIO(source.encode("utf-8")), parser).getroot()

    # LXML doesn't accept empty (None) namespace prefix,
    # so it needs to be left out if it exists
    nsmap = {
        prefix: uri for prefix, uri in root.nsmap.items() if prefix is not None
    }

    result = root.xpath(xpath, namespaces=nsmap)

    # root.xpath can return 4 types: float, string, bool and list.
    # List is the only one that can't be simply converted to str
    if not isinstance(result, list):
        return str(result), type(result).__name__

    rendered = []
    for item in result:
        if etree.iselement(item):
            rendered.append(etree.tostring(item, pretty_print=True).decode())
        else:
            # Queries such as //text() or //@attr put plain values in the
            # list; etree.tostring() cannot serialise those.
            rendered.append(str(item))

    return "".join(text + "\n" for text in rendered), "node"
|
|
|
|
|
|
def xsd(source: str, xsd: str) -> str:
    """
    Method used to validate XML string against XSD schema

    :param source: XML string used for validation
    :param xsd: XSD schema to validate XML against
    :return: Message saying, if the validation was successful or not:
             ``"XML is valid"`` on success, otherwise the validation
             error text
    """

    # Both texts are handed to lxml as UTF-8 bytes, so force UTF-8 decoding
    # even if an XML declaration names another encoding.
    parser = etree.XMLParser(encoding="utf-8")

    schema_input = BytesIO(xsd.encode("utf-8"))
    xml_schema = etree.XMLSchema(etree.parse(schema_input, parser).getroot())

    document_input = BytesIO(source.encode("utf-8"))
    xml = etree.parse(document_input, parser).getroot()

    try:
        xml_schema.assertValid(xml)
    except etree.DocumentInvalid as e:
        return str(e)

    return "XML is valid"
|
|
|
|
def xslt(source: str, xslt: str) -> str:
    """
    Method used to transform XML string using XSLT

    The transformation result is passed through ``formatXML`` with
    prettifying enabled, which parses it as XML again.

    :param source: XML string to transform
    :param xslt: XSLT string used to transform XML
    :return: Result of transformation
    """
    # Both texts are handed to lxml as UTF-8 bytes, so force UTF-8 decoding
    # even if an XML declaration names another encoding.
    parser = etree.XMLParser(encoding="utf-8")

    xslt_input = BytesIO(xslt.encode("utf-8"))
    xslt_transform = etree.XSLT(etree.parse(xslt_input, parser))

    document_input = BytesIO(source.encode("utf-8"))
    xml = etree.parse(document_input, parser).getroot()

    transformed = str(xslt_transform(xml))
    return formatXML(transformed, True)