From 21bf1ad6ff3d7079a515be642866e72253d33583 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 20 Jul 2010 16:51:34 +0200 Subject: [PATCH 1/4] Add patch from issue 150 by fantasai --HG-- branch : csswg-testsuite --- html5lib/serializer/htmlserializer.py | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 45f1d066..dd232255 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -95,6 +95,44 @@ class HTMLSerializer(object): "escape_rcdata", "resolve_entities", "sanitize") def __init__(self, **kwargs): + """Initialize HTMLSerializer. + + Keyword options (default given first unless specified) include: + + inject_meta_charset=True|False + ..? + quote_attr_values=True|False + Whether to quote attribute values that don't require quoting + per HTML5 parsing rules. + quote_char=u'"'|u"'" + Use given quote character for attribute quoting. Default is to + use double quote unless attribute value contains a double quote, + in which case single quotes are used instead. + escape_lt_in_attrs=False|True + Whether to escape < in attribute values. + escape_rc_data=False|True + ..? + resolve_entities=True|False + Whether to resolve named character entities that appear in the + source tree. The XML predified entities < > & " ' + are unaffected by this setting. + strip_whitespace=False|True + ..? + minimize_boolean_attributes=True|false + Shortens boolean attributes to give just the attribute value, + for example becomes . + use_trailing_solidus + Includes a close-tag slash at the end of the start tag of void + elements (empty elements whose end tag is forbidden). E.g.
<hr/>. + space_before_trailing_solidus + Places a space immediately before the closing slash in a tag + using a trailing solidus. E.g. <hr />
. Requires use_trailing_solidus. + sanitize + Strip all unsafe or unknown constructs from output. + See `html5lib user documentation`_ + + .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation + """ if kwargs.has_key('quote_char'): self.use_best_quote_char = False for attr in self.options: From 0e3932482678185507011b233e80b438348c2b85 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 20 Jul 2010 16:51:49 +0200 Subject: [PATCH 2/4] Add patch from issue 152 by fantasai --HG-- branch : csswg-testsuite --- html5lib/serializer/htmlserializer.py | 118 ++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index dd232255..dbd7206f 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -12,6 +12,8 @@ from html5lib import utils from xml.sax.saxutils import escape +import re + spaceCharacters = u"".join(spaceCharacters) try: @@ -84,6 +86,7 @@ class HTMLSerializer(object): resolve_entities = True # miscellaneous options + emit_doctype = 'preserve' inject_meta_charset = True strip_whitespace = False sanitize = False @@ -92,13 +95,23 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "sanitize") + "escape_rcdata", "resolve_entities", "emit_doctype", "sanitize") def __init__(self, **kwargs): """Initialize HTMLSerializer. Keyword options (default given first unless specified) include: + emit_doctype='html'|'xhtml'|'html5'|'preserve' + Whether to output a doctype. 
+ * emit_doctype='xhtml' preserves unknown doctypes and valid + XHTML doctypes, converts valid HTML doctypes to their XHTML + counterparts, and drops + * emit_doctype='html' preserves unknown doctypes and valid + HTML doctypes, converts valid XHTML doctypes to their HTML + counterparts, and uses for missing doctypes + * emit_doctype='html5' Uses as the doctype + * emit_doctype='preserve' preserves the doctype, if any, unchanged inject_meta_charset=True|False ..? quote_attr_values=True|False @@ -140,6 +153,86 @@ def __init__(self, **kwargs): self.errors = [] self.strict = False + def calc_doctype(self, token=None): + if self.emit_doctype == 'html5' or \ + not token and self.emit_doctype == 'html': + if token: + return u'' + else: + return u'\n' + + rootElement = token["name"] + publicID = token["publicId"] + systemID = token["systemId"] + + if re.match(u'html', rootElement, re.IGNORECASE): + if self.emit_doctype == u'html': + # XHTML 1.1 + if publicID == u"-//W3C//DTD XHTML 1.1//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"): + publicID = u"-//W3C//DTD HTML 4.01//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/strict.dtd" + # XHTML 1.0 Strict + elif publicID == u"-//W3C//DTD XHTML 1.0 Strict//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"): + publicID = u"-//W3C//DTD HTML 4.01//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/strict.dtd" + # XHTML 1.0 Transitional + elif publicID == u"-//W3C//DTD XHTML 1.0 Transitional//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"): + publicID = u"-//W3C//DTD HTML 4.01 Transitional//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/loose.dtd" + # XHTML 1.0 Frameset + elif publicID == u"-//W3C//DTD XHTML 1.0 Frameset//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"): + publicID = u"-//W3C//DTD HTML 4.01 
Frameset//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/frameset.dtd" + elif self.emit_doctype == u'xhtml': + # HTML 4.01 Strict + if re.match(u"-//W3C//DTD HTML 4.0(1)?//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Strict//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" + # HTML4.01 Transitional + elif re.match(u"-//W3C//DTD HTML 4.0(1)? Transitional//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" + # HTML 4.01 Frameset + elif re.match(u"-//W3C//DTD HTML 4.0(1)? Frameset//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Frameset//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" + # HTML 3.2 + elif re.match(u"-//W3C//DTD HTML 3.2( Final)?//EN", publicID) and not systemID: + publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN" + + doctype = u"= 0: + if systemID.find(u"'") >= 0: + self.serializeError(_("System identifer contains both single and double quote characters")) + quote_char = u"'" + else: + quote_char = u'"' + doctype += u" %s%s%s" % (quote_char, systemID, quote_char) + doctype += u">" + return doctype + def serialize(self, treewalker, encoding=None): in_cdata = False self.errors = [] @@ -157,26 +250,12 @@ def serialize(self, treewalker, encoding=None): if self.omit_optional_tags: from html5lib.filters.optionaltags import Filter treewalker = Filter(treewalker) + posted_doctype = False for token in treewalker: type = token["type"] if type == "Doctype": - doctype = u"= 0: - if token["systemId"].find(u"'") >= 0: - self.serializeError(_("System 
identifer contains both single and double quote characters")) - quote_char = u"'" - else: - quote_char = u'"' - doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char) - - doctype += u">" - + posted_doctype = True + doctype = self.calc_doctype(token) if encoding: yield doctype.encode(encoding) else: @@ -196,6 +275,9 @@ def serialize(self, treewalker, encoding=None): yield escape(token["data"]) elif type in ("StartTag", "EmptyTag"): + if not posted_doctype: + posted_doctype = True + yield self.calc_doctype() name = token["name"] if name in rcdataElements and not self.escape_rcdata: in_cdata = True From f0a8c6e7c32fa8db0efec05f5ec1674f82a6be9e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 20 Jul 2010 16:52:14 +0200 Subject: [PATCH 3/4] Add patch from issue 153 by fantasai --HG-- branch : csswg-testsuite --- html5lib/serializer/htmlserializer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index dbd7206f..af58c3ec 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -286,7 +286,6 @@ def serialize(self, treewalker, encoding=None): attrs = token["data"] if hasattr(attrs, "items"): attrs = attrs.items() - attrs.sort() attributes = [] for k,v in attrs: if encoding: From 956d9b847673db7236c525933ff4f69386009428 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 20 Jul 2010 16:52:31 +0200 Subject: [PATCH 4/4] Add patch from issue 154 by fantasai --HG-- branch : csswg-testsuite --- html5lib/serializer/htmlserializer.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index af58c3ec..260ed7dd 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -88,6 +88,7 @@ class HTMLSerializer(object): # miscellaneous options emit_doctype = 'preserve' inject_meta_charset = 
True + lang_attr = 'preserve' strip_whitespace = False sanitize = False @@ -95,7 +96,8 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "emit_doctype", "sanitize") + "escape_rcdata", "resolve_entities", "emit_doctype", "lang_attr", + "sanitize") def __init__(self, **kwargs): """Initialize HTMLSerializer. @@ -114,6 +116,11 @@ def __init__(self, **kwargs): * emit_doctype='preserve' preserves the doctype, if any, unchanged inject_meta_charset=True|False ..? + lang_attr='preserve'|'xml'|'html' + Whether to translate 'lang' attributes. + * lang_attr='preserve' does no translation + * lang_attr='xml' translates 'lang' to 'xml:lang' + * lang_attr='html' translates 'xml:lang' to 'lang' quote_attr_values=True|False Whether to quote attribute values that don't require quoting per HTML5 parsing rules. @@ -288,6 +295,18 @@ def serialize(self, treewalker, encoding=None): attrs = attrs.items() attributes = [] for k,v in attrs: + + # clean up xml:lang + if k == '{http://www.w3.org/XML/1998/namespace}lang': + k = 'xml:lang' + if self.lang_attr == 'xml': + if k == 'lang' and not ('xml:lang' in attrs or + '{http://www.w3.org/XML/1998/namespace}lang' in attrs): + k = 'xml:lang' + elif self.lang_attr == 'html': + if k == 'xml:lang' and not ('lang' in attrs): + k = 'lang' + if encoding: k = k.encode(encoding, "strict") attributes.append(' ') pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy