From f4f1fb8cf044fd1616648821d13247450901f62a Mon Sep 17 00:00:00 2001 From: fantasai Date: Tue, 27 Jul 2010 21:30:17 +0100 Subject: [PATCH 1/2] Google Code Issue 157: Add "escape invisible characters" option Vaguely updated, but basically working. --- html5lib/constants.py | 16 ++++++++++++++++ html5lib/serializer/htmlserializer.py | 10 +++++++++- html5lib/utils.py | 26 ++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index 1866dd78..431c2c12 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -4,6 +4,9 @@ import gettext _ = gettext.gettext +from itertools import chain + + EOF = None E = { @@ -3078,6 +3081,19 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math" +invisibleChars = frozenset(chain( + # ASCII control chars + range(0x0, 0x9), range(0xB, 0xD), range(0xE, 0x20), + # Other control chars + # fixed-width spaces, zero-width marks, bidi marks + range(0x2000, 0x2010), + # LS, PS, bidi control codes + range(0x2028, 0x2030), + # nbsp, mathsp, ideosp, WJ, interlinear + [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB] +)) + + class DataLossWarning(UserWarning): pass diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 18344aed..adc1bf59 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -94,6 +94,7 @@ class HTMLSerializer(object): # escaping options escape_lt_in_attrs = False escape_rcdata = False + escape_invisible = False resolve_entities = True # miscellaneous options @@ -105,7 +106,8 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "sanitize") + "escape_rcdata", "escape_invisible", "resolve_entities", + "sanitize") def __init__(self, **kwargs): """Initialize HTMLSerializer. @@ -127,6 +129,10 @@ def __init__(self, **kwargs): escape_rcdata=False|True Whether to escape characters that need to be escaped within normal elements within rcdata elements such as style. + escape_invisible=False|True|'numeric'|'named' + Whether to escape invisible characters (such as nbsp, fixed-width + spaces, and control codes). Uses named HTML escapes if 'named' + is specified, otherwise uses numeric codes. resolve_entities=True|False Whether to resolve named character entities that appear in the source tree. The XML predefined entities < > & " ' @@ -160,6 +166,8 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) + if self.escape_invisible: + text = utils.escapeInvisible(text, self.escape_invisible == 'named') if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: diff --git a/html5lib/utils.py b/html5lib/utils.py index 9841aebf..3f3fee01 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -2,6 +2,8 @@ from types import ModuleType +from .constants import invisibleChars + class MethodDispatcher(dict): """Dict with 2 special properties: @@ -71,3 +73,27 @@ def moduleFactory(baseModule, *args, **kwargs): return mod return moduleFactory + + +def escapeInvisible(text, useNamedEntities=False): + """Escape invisible characters other than Tab, LF, CR, and ASCII space + """ + assert type(text) == text_type + # This algorithm is O(MN) for M len(text) and N num escapable + # But it doesn't modify the text when N is zero (common case) and + # N is expected to be small (usually 1 or 2) in most other cases. + escapable = set() + for c in text: + if ord(c) in invisibleChars: + escapable.add(c) + if useNamedEntities: + raise NotImplementedError("This doesn't work on Python 3") + for c in escapable: + name = codepoint2name.get(ord(c)) + escape = "&%s;" % name if name else "&#x%X;" % ord(c) + text = text.replace(c, escape) + else: + for c in escapable: + text = text.replace(c, "&#x%X;" % ord(c)) + + return text From 93440015e6e41f1bab0162ce27f323b65f4cd6e8 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 4 May 2013 13:56:03 +0100 Subject: [PATCH 2/2] fixup! Google Code Issue 157: Add "escape invisible characters" option --- html5lib/serializer/htmlserializer.py | 2 +- html5lib/utils.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index adc1bf59..e6056f0c 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -167,7 +167,7 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) if self.escape_invisible: - text = utils.escapeInvisible(text, self.escape_invisible == 'named') + string = utils.escapeInvisible(string, self.escape_invisible == 'named') if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: diff --git a/html5lib/utils.py b/html5lib/utils.py index 3f3fee01..ae0d9fbc 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -2,6 +2,8 @@ from types import ModuleType +from six import text_type + from .constants import invisibleChars @@ -87,11 +89,11 @@ def escapeInvisible(text, useNamedEntities=False): if ord(c) in invisibleChars: escapable.add(c) if useNamedEntities: + # for c in escapable: + # name = codepoint2name.get(ord(c)) + # escape = "&%s;" % name if name else "&#x%X;" % ord(c) + # text = text.replace(c, escape) raise NotImplementedError("This doesn't work on Python 3") - for c in escapable: - name = codepoint2name.get(ord(c)) - escape = "&%s;" % name if name else "&#x%X;" % ord(c) - text = text.replace(c, escape) else: for c in escapable: text = text.replace(c, "&#x%X;" % ord(c)) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy