diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 45f1d066..260ed7dd 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -12,6 +12,8 @@ from html5lib import utils from xml.sax.saxutils import escape +import re + spaceCharacters = u"".join(spaceCharacters) try: @@ -84,7 +86,9 @@ class HTMLSerializer(object): resolve_entities = True # miscellaneous options + emit_doctype = 'preserve' inject_meta_charset = True + lang_attr = 'preserve' strip_whitespace = False sanitize = False @@ -92,9 +96,63 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "sanitize") + "escape_rcdata", "resolve_entities", "emit_doctype", "lang_attr", + "sanitize") def __init__(self, **kwargs): + """Initialize HTMLSerializer. + + Keyword options (default given first unless specified) include: + + emit_doctype='html'|'xhtml'|'html5'|'preserve' + Whether to output a doctype. + * emit_doctype='xhtml' preserves unknown doctypes and valid + XHTML doctypes, converts valid HTML doctypes to their XHTML + counterparts, and drops <!DOCTYPE html> + * emit_doctype='html' preserves unknown doctypes and valid + HTML doctypes, converts valid XHTML doctypes to their HTML + counterparts, and uses <!DOCTYPE html> for missing doctypes + * emit_doctype='html5' Uses <!DOCTYPE html> as the doctype + * emit_doctype='preserve' preserves the doctype, if any, unchanged + inject_meta_charset=True|False + ..? + lang_attr='preserve'|'xml'|'html' + Whether to translate 'lang' attributes. + * lang_attr='preserve' does no translation + * lang_attr='xml' translates 'lang' to 'xml:lang' + * lang_attr='html' translates 'xml:lang' to 'lang' + quote_attr_values=True|False + Whether to quote attribute values that don't require quoting + per HTML5 parsing rules. 
+ quote_char=u'"'|u"'" + Use given quote character for attribute quoting. Default is to + use double quote unless attribute value contains a double quote, + in which case single quotes are used instead. + escape_lt_in_attrs=False|True + Whether to escape < in attribute values. + escape_rcdata=False|True + ..? + resolve_entities=True|False + Whether to resolve named character entities that appear in the + source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos; + are unaffected by this setting. + strip_whitespace=False|True + ..? + minimize_boolean_attributes=True|False + Shortens boolean attributes to give just the attribute value, + for example <input disabled="disabled"> becomes <input disabled>. + use_trailing_solidus + Includes a close-tag slash at the end of the start tag of void + elements (empty elements whose end tag is forbidden). E.g. <hr/>. + space_before_trailing_solidus + Places a space immediately before the closing slash in a tag + using a trailing solidus. E.g. <hr />
. Requires use_trailing_solidus. + sanitize + Strip all unsafe or unknown constructs from output. + See `html5lib user documentation`_ + + .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation + """ if kwargs.has_key('quote_char'): self.use_best_quote_char = False for attr in self.options: @@ -102,6 +160,86 @@ def __init__(self, **kwargs): self.errors = [] self.strict = False + def calc_doctype(self, token=None): + if self.emit_doctype == 'html5' or \ + not token and self.emit_doctype == 'html': + if token: + return u'' + else: + return u'\n' + + rootElement = token["name"] + publicID = token["publicId"] + systemID = token["systemId"] + + if re.match(u'html', rootElement, re.IGNORECASE): + if self.emit_doctype == u'html': + # XHTML 1.1 + if publicID == u"-//W3C//DTD XHTML 1.1//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"): + publicID = u"-//W3C//DTD HTML 4.01//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/strict.dtd" + # XHTML 1.0 Strict + elif publicID == u"-//W3C//DTD XHTML 1.0 Strict//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"): + publicID = u"-//W3C//DTD HTML 4.01//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/strict.dtd" + # XHTML 1.0 Transitional + elif publicID == u"-//W3C//DTD XHTML 1.0 Transitional//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"): + publicID = u"-//W3C//DTD HTML 4.01 Transitional//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/loose.dtd" + # XHTML 1.0 Frameset + elif publicID == u"-//W3C//DTD XHTML 1.0 Frameset//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"): + publicID = u"-//W3C//DTD HTML 4.01 Frameset//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/frameset.dtd" + elif self.emit_doctype == u'xhtml': + # HTML 4.01 Strict + if re.match(u"-//W3C//DTD 
HTML 4.0(1)?//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Strict//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" + # HTML4.01 Transitional + elif re.match(u"-//W3C//DTD HTML 4.0(1)? Transitional//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" + # HTML 4.01 Frameset + elif re.match(u"-//W3C//DTD HTML 4.0(1)? Frameset//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Frameset//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" + # HTML 3.2 + elif re.match(u"-//W3C//DTD HTML 3.2( Final)?//EN", publicID) and not systemID: + publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN" + + doctype = u"= 0: + if systemID.find(u"'") >= 0: + self.serializeError(_("System identifer contains both single and double quote characters")) + quote_char = u"'" + else: + quote_char = u'"' + doctype += u" %s%s%s" % (quote_char, systemID, quote_char) + doctype += u">" + return doctype + def serialize(self, treewalker, encoding=None): in_cdata = False self.errors = [] @@ -119,26 +257,12 @@ def serialize(self, treewalker, encoding=None): if self.omit_optional_tags: from html5lib.filters.optionaltags import Filter treewalker = Filter(treewalker) + posted_doctype = False for token in treewalker: type = token["type"] if type == "Doctype": - doctype = u"= 0: - if token["systemId"].find(u"'") >= 0: - self.serializeError(_("System identifer contains both single and double quote characters")) - quote_char = u"'" - else: - quote_char = u'"' - doctype += u" %s%s%s" % (quote_char, token["systemId"], 
quote_char) - - doctype += u">" - + posted_doctype = True + doctype = self.calc_doctype(token) if encoding: yield doctype.encode(encoding) else: @@ -158,6 +282,9 @@ def serialize(self, treewalker, encoding=None): yield escape(token["data"]) elif type in ("StartTag", "EmptyTag"): + if not posted_doctype: + posted_doctype = True + yield self.calc_doctype() name = token["name"] if name in rcdataElements and not self.escape_rcdata: in_cdata = True @@ -166,9 +293,20 @@ def serialize(self, treewalker, encoding=None): attrs = token["data"] if hasattr(attrs, "items"): attrs = attrs.items() - attrs.sort() attributes = [] for k,v in attrs: + + # clean up xml:lang + if k == '{http://www.w3.org/XML/1998/namespace}lang': + k = 'xml:lang' + if self.lang_attr == 'xml': + if k == 'lang' and not ('xml:lang' in attrs or + '{http://www.w3.org/XML/1998/namespace}lang' in attrs): + k = 'xml:lang' + elif self.lang_attr == 'html': + if k == 'xml:lang' and not ('lang' in attrs): + k = 'lang' + if encoding: k = k.encode(encoding, "strict") attributes.append(' ') pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies: