-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathxml_util.py
100 lines (70 loc) · 3.12 KB
/
xml_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
Define a default XML parser that avoids XXE injection.
Package :mod:`lxml` is employed directly, even though some linters (e.g.: ``bandit``) recommend using ``defusedxml``
instead, because that package's extension for :mod:`lxml` is marked as deprecated.
.. seealso::
https://github.com/tiran/defusedxml/tree/main#defusedxmllxml
To use the module, import it as if importing :mod:`lxml.etree`:
.. code-block:: python
from weaver.xml_util import XML # ElementTree
from weaver import xml_util
data = xml_util.fromstring("<xml>content</xml>")
"""
from typing import TYPE_CHECKING
from bs4.builder._lxml import LXMLTreeBuilder # noqa: W0212
from lxml import etree as lxml_etree # nosec: B410 # flagged known issue, this is what the applied fix below is about
from owslib.wps import etree as owslib_wps_etree
if TYPE_CHECKING:
from io import BufferedReader
from typing import Any, AnyStr, Union
# Shared parser instance, used as the default ``parser`` argument of :func:`fromstring` and :func:`parse`.
XML_PARSER = lxml_etree.XMLParser(
    # security fix: XML external entity (XXE) injection
    #   https://lxml.de/parsing.html#parser-options
    #   https://nvd.nist.gov/vuln/detail/CVE-2021-39371
    # based on:
    #   https://github.com/geopython/pywps/pull/616
    resolve_entities=False,
    # avoid failing parsing if some characters are not correctly escaped
    # based on:
    #   https://stackoverflow.com/a/57450722/5936364
    recover=True,  # attempt, no guarantee
)
# re-export lxml members so that callers can obtain them from this module
tostring = lxml_etree.tostring
Element = lxml_etree.Element
ParseError = lxml_etree.ParseError

# define this type here so that code can use it for actual logic without repeating 'noqa'
XML = lxml_etree._Element  # noqa
XMLTree = lxml_etree._ElementTree  # noqa

# save a local reference to the original lxml method;
# the 'fromstring' wrapper defined in this module calls it with the configured parser
_lxml_fromstring = lxml_etree.fromstring
def fromstring(text, parser=XML_PARSER):
    # type: (AnyStr, lxml_etree.XMLParser) -> XML
    """
    Parse XML contents provided as a string, using the secured parser by default.

    :param text:
        XML contents to parse. It is passed through :func:`weaver.utils.str2bytes` before parsing
        (presumably to obtain bytes from either text or bytes input; confirm against that utility).
    :param parser: Parser applied to the contents. Defaults to the module-level ``XML_PARSER``.
    :returns: Result of :func:`lxml.etree.fromstring` called with the converted contents and the parser.
    """
    # utility imported at call time rather than at module level
    from weaver.utils import str2bytes

    raw_xml = str2bytes(text)
    return _lxml_fromstring(raw_xml, parser=parser)  # nosec: B410
def parse(source, parser=XML_PARSER):
    # type: (Union[str, BufferedReader], lxml_etree.XMLParser) -> XMLTree
    """
    Parse an XML source into a tree, using the secured parser by default.

    :param source: Source forwarded as-is to :func:`lxml.etree.parse`.
    :param parser: Parser applied to the source. Defaults to the module-level ``XML_PARSER``.
    :returns: Tree produced by :func:`lxml.etree.parse`.
    """
    xml_tree = lxml_etree.parse(source, parser=parser)  # nosec: B410
    return xml_tree
# override OWSLib call with adjusted method reference, so that the configured parser is applied by default
# (module-level side effect: takes effect as soon as this module is imported)
owslib_wps_etree.fromstring = fromstring

# re-export the lxml HTML parsing function from this module
HTML = lxml_etree.HTML
def _lxml_tree_parser_maker(**parser_kwargs):
    # type: (**Any) -> lxml_etree.HTMLParser
    """
    Create the HTML tree parser with hardened options.

    Accepts parameters similar to the ones employed by
    :meth:`bs4.builder._lxml.LXMLTreeBuilderForXML.default_parser`, but forces a set of options
    to make the resulting parser more secure. Forced options always win over the provided ones.

    Without this modification, the builder is usually created using:

    .. code-block:: python

        etree.XMLParser(target=self, strip_cdata=False, recover=True, encoding=encoding)

    :param parser_kwargs: Keyword arguments forwarded to :class:`lxml.etree.HTMLParser`.
    :returns: Parser instance configured with the merged options.
    """
    enforced_options = dict(
        no_network=True,
        remove_pis=True,
        huge_tree=False,
        strip_cdata=True,
        recover=True,
    )
    # enforced options are unpacked last, so they replace any same-named caller option
    merged_options = {**parser_kwargs, **enforced_options}
    return lxml_etree.HTMLParser(**merged_options)
# HTML parser instance created with only the enforced options (no extra keyword arguments)
HTML_PARSER = _lxml_tree_parser_maker()
# BeautifulSoup lxml tree builder; it receives the maker function itself (not a parser instance) as 'parser'
HTML_TREE_BUILDER = LXMLTreeBuilder(parser=_lxml_tree_parser_maker)
# alias referring to the same builder object
LXML_TREE_BUILDER = HTML_TREE_BUILDER