diff --git a/html5lib/constants.py b/html5lib/constants.py index 1866dd78..431c2c12 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -4,6 +4,9 @@ import gettext _ = gettext.gettext +from itertools import chain + + EOF = None E = { @@ -3078,6 +3081,19 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math" +invisibleChars = frozenset(chain( + # ASCII control chars + range(0x0, 0x9), range(0xB, 0xD), range(0xE, 0x20), + # Other control chars + # fixed-width spaces, zero-width marks, bidi marks + range(0x2000, 0x2010), + # LS, PS, bidi control codes + range(0x2028, 0x2030), + # nbsp, mathsp, ideosp, WJ, interlinear + [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB] +)) + + class DataLossWarning(UserWarning): pass diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 18344aed..e6056f0c 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -94,6 +94,7 @@ class HTMLSerializer(object): # escaping options escape_lt_in_attrs = False escape_rcdata = False + escape_invisible = False resolve_entities = True # miscellaneous options @@ -105,7 +106,8 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "sanitize") + "escape_rcdata", "escape_invisible", "resolve_entities", + "sanitize") def __init__(self, **kwargs): """Initialize HTMLSerializer. @@ -127,6 +129,10 @@ def __init__(self, **kwargs): escape_rcdata=False|True Whether to escape characters that need to be escaped within normal elements within rcdata elements such as style. + escape_invisible=False|True|'numeric'|'named' + Whether to escape invisible characters (such as nbsp, fixed-width + spaces, and control codes). Uses named HTML escapes if 'named' + is specified, otherwise uses numeric codes. 
resolve_entities=True|False Whether to resolve named character entities that appear in the source tree. The XML predefined entities < > & " ' @@ -160,6 +166,8 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) + if self.escape_invisible: + string = utils.escapeInvisible(string, self.escape_invisible == 'named') if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: diff --git a/html5lib/utils.py b/html5lib/utils.py index 9841aebf..ae0d9fbc 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -2,6 +2,10 @@ from types import ModuleType +from six import text_type + +from .constants import invisibleChars + class MethodDispatcher(dict): """Dict with 2 special properties: @@ -71,3 +75,27 @@ def moduleFactory(baseModule, *args, **kwargs): return mod return moduleFactory + + +def escapeInvisible(text, useNamedEntities=False): + """Escape invisible characters other than Tab, LF, CR, and ASCII space + """ + assert type(text) == text_type + # This algorithm is O(MN) for M len(text) and N num escapable + # But it doesn't modify the text when N is zero (common case) and + # N is expected to be small (usually 1 or 2) in most other cases. + escapable = set() + for c in text: + if ord(c) in invisibleChars: + escapable.add(c) + if useNamedEntities: + # for c in escapable: + # name = codepoint2name.get(ord(c)) + # escape = "&%s;" % name if name else "&#x%X;" % ord(c) + # text = text.replace(c, escape) + raise NotImplementedError("This doesn't work on Python 3") + else: + for c in escapable: + text = text.replace(c, "&#x%X;" % ord(c)) + + return text pFad - Phonifier reborn

pFad - The Proxy. © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy