Skip to content

Commit 0e39324

Browse files
committed
Add patch from issue 152 by fantasai
--HG-- branch : csswg-testsuite
1 parent 21bf1ad commit 0e39324

File tree

1 file changed

+100
-18
lines changed

1 file changed

+100
-18
lines changed

html5lib/serializer/htmlserializer.py

Lines changed: 100 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from html5lib import utils
1313
from xml.sax.saxutils import escape
1414

15+
import re
16+
1517
spaceCharacters = u"".join(spaceCharacters)
1618

1719
try:
@@ -84,6 +86,7 @@ class HTMLSerializer(object):
8486
resolve_entities = True
8587

8688
# miscellaneous options
89+
emit_doctype = 'preserve'
8790
inject_meta_charset = True
8891
strip_whitespace = False
8992
sanitize = False
@@ -92,13 +95,23 @@ class HTMLSerializer(object):
9295
"minimize_boolean_attributes", "use_trailing_solidus",
9396
"space_before_trailing_solidus", "omit_optional_tags",
9497
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
95-
"escape_rcdata", "resolve_entities", "sanitize")
98+
"escape_rcdata", "resolve_entities", "emit_doctype", "sanitize")
9699

97100
def __init__(self, **kwargs):
98101
"""Initialize HTMLSerializer.
99102
100103
Keyword options (default given first unless specified) include:
101104
105+
emit_doctype='html'|'xhtml'|'html5'|'preserve'
106+
Whether to output a doctype.
107+
* emit_doctype='xhtml' preserves unknown doctypes and valid
108+
XHTML doctypes, converts valid HTML doctypes to their XHTML
109+
counterparts, and drops <!DOCTYPE html>
110+
* emit_doctype='html' preserves unknown doctypes and valid
111+
HTML doctypes, converts valid XHTML doctypes to their HTML
112+
counterparts, and uses <!DOCTYPE html> for missing doctypes
113+
* emit_doctype='html5' Uses <!DOCTYPE html> as the doctype
114+
* emit_doctype='preserve' preserves the doctype, if any, unchanged
102115
inject_meta_charset=True|False
103116
..?
104117
quote_attr_values=True|False
@@ -140,6 +153,86 @@ def __init__(self, **kwargs):
140153
self.errors = []
141154
self.strict = False
142155

156+
def calc_doctype(self, token=None):
    """Return the doctype declaration to emit, according to emit_doctype.

    token is the "Doctype" token mapping (keys "name", "publicId" and
    "systemId"), or None when no doctype token was seen.

    Behaviour per self.emit_doctype:

    * 'html5': always u'<!DOCTYPE html>'.
    * 'html': known XHTML doctypes are converted to their HTML 4.01
      counterparts; a missing doctype becomes u'<!DOCTYPE html>'.
    * 'xhtml': known HTML 4.0/4.01/3.2 doctypes are converted to their
      XHTML 1.0 counterparts; a bare <!DOCTYPE html> is dropped.
    * any other value (e.g. 'preserve'): the doctype is rebuilt from the
      token unchanged.

    A doctype synthesized for a missing token carries a trailing newline.
    Returns u"" when there is nothing to emit (no token outside the
    'html'/'html5' modes, or a dropped doctype), so callers can always
    treat the result as a string.
    """
    if self.emit_doctype == 'html5' or \
       not token and self.emit_doctype == 'html':
        if token:
            return u'<!DOCTYPE html>'
        return u'<!DOCTYPE html>\n'

    if not token:
        # Nothing to preserve or convert.  Previously this fell through to
        # token["name"] and raised TypeError for the default 'preserve'.
        return u''

    rootElement = token["name"]
    publicID = token["publicId"]
    systemID = token["systemId"]

    if re.match(u'html', rootElement, re.IGNORECASE):
        if self.emit_doctype == u'html':
            # XHTML public ID -> (XHTML system ID, HTML public ID,
            #                     HTML system ID)
            html_equivalents = {
                # XHTML 1.1
                u"-//W3C//DTD XHTML 1.1//EN": (
                    u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",
                    u"-//W3C//DTD HTML 4.01//EN",
                    u"http://www.w3.org/TR/html4/strict.dtd"),
                # XHTML 1.0 Strict
                u"-//W3C//DTD XHTML 1.0 Strict//EN": (
                    u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
                    u"-//W3C//DTD HTML 4.01//EN",
                    u"http://www.w3.org/TR/html4/strict.dtd"),
                # XHTML 1.0 Transitional
                u"-//W3C//DTD XHTML 1.0 Transitional//EN": (
                    u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
                    u"-//W3C//DTD HTML 4.01 Transitional//EN",
                    u"http://www.w3.org/TR/html4/loose.dtd"),
                # XHTML 1.0 Frameset
                u"-//W3C//DTD XHTML 1.0 Frameset//EN": (
                    u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
                    u"-//W3C//DTD HTML 4.01 Frameset//EN",
                    u"http://www.w3.org/TR/html4/frameset.dtd"),
            }
            if publicID in html_equivalents:
                known_system, new_public, new_system = \
                    html_equivalents[publicID]
                # Only convert when the system ID is absent or is the
                # canonical one; anything else is an unknown doctype.
                if not systemID or systemID == known_system:
                    publicID = new_public
                    if systemID:
                        systemID = new_system
        elif self.emit_doctype == u'xhtml':
            if not publicID and not systemID and \
               rootElement.lower() == u'html':
                # Bare <!DOCTYPE html>: dropped in XHTML output.
                return u''
            # (public ID pattern, system ID pattern, XHTML public ID,
            #  XHTML system ID).  A system ID pattern of None means the
            # entry only applies when the doctype has no system ID.
            xhtml_equivalents = (
                # HTML 4.01 Strict
                (u"-//W3C//DTD HTML 4.0(1)?//EN",
                 u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd",
                 u"-//W3C//DTD XHTML 1.0 Strict//EN",
                 u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
                # HTML4.01 Transitional
                (u"-//W3C//DTD HTML 4.0(1)? Transitional//EN",
                 u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd",
                 u"-//W3C//DTD XHTML 1.0 Transitional//EN",
                 u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"),
                # HTML 4.01 Frameset
                (u"-//W3C//DTD HTML 4.0(1)? Frameset//EN",
                 u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd",
                 u"-//W3C//DTD XHTML 1.0 Frameset//EN",
                 u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"),
                # HTML 3.2
                (u"-//W3C//DTD HTML 3.2( Final)?//EN",
                 None,
                 u"-//W3C//DTD XHTML 1.0 Transitional//EN",
                 None),
            )
            # A missing public ID cannot match any known HTML doctype, and
            # re.match() would raise TypeError on None.
            if publicID:
                for public_re, system_re, new_public, new_system \
                        in xhtml_equivalents:
                    if not re.match(public_re, publicID):
                        continue
                    if systemID and not (
                            system_re and re.match(system_re, systemID)):
                        continue
                    publicID = new_public
                    if systemID:
                        systemID = new_system
                    break

    parts = [u"<!DOCTYPE %s" % rootElement]
    if publicID:
        parts.append(u' PUBLIC "%s"' % publicID)
    elif systemID:
        parts.append(u" SYSTEM")
    if systemID:
        if u'"' in systemID:
            if u"'" in systemID:
                # No quote character can delimit this identifier safely.
                self.serializeError(_("System identifer contains both single and double quote characters"))
            quote_char = u"'"
        else:
            quote_char = u'"'
        parts.append(u" %s%s%s" % (quote_char, systemID, quote_char))
    parts.append(u">")
    return u"".join(parts)
235+
143236
def serialize(self, treewalker, encoding=None):
144237
in_cdata = False
145238
self.errors = []
@@ -157,26 +250,12 @@ def serialize(self, treewalker, encoding=None):
157250
if self.omit_optional_tags:
158251
from html5lib.filters.optionaltags import Filter
159252
treewalker = Filter(treewalker)
253+
posted_doctype = False
160254
for token in treewalker:
161255
type = token["type"]
162256
if type == "Doctype":
163-
doctype = u"<!DOCTYPE %s" % token["name"]
164-
165-
if token["publicId"]:
166-
doctype += u' PUBLIC "%s"' % token["publicId"]
167-
elif token["systemId"]:
168-
doctype += u" SYSTEM"
169-
if token["systemId"]:
170-
if token["systemId"].find(u'"') >= 0:
171-
if token["systemId"].find(u"'") >= 0:
172-
self.serializeError(_("System identifer contains both single and double quote characters"))
173-
quote_char = u"'"
174-
else:
175-
quote_char = u'"'
176-
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
177-
178-
doctype += u">"
179-
257+
posted_doctype = True
258+
doctype = self.calc_doctype(token)
180259
if encoding:
181260
yield doctype.encode(encoding)
182261
else:
@@ -196,6 +275,9 @@ def serialize(self, treewalker, encoding=None):
196275
yield escape(token["data"])
197276

198277
elif type in ("StartTag", "EmptyTag"):
278+
if not posted_doctype:
279+
posted_doctype = True
280+
yield self.calc_doctype()
199281
name = token["name"]
200282
if name in rcdataElements and not self.escape_rcdata:
201283
in_cdata = True

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy