. . , , , , , Beautiful Soup. ( , Beautiful Soup html5lib 1.0.) Amarghosh; . html5lib, , - , , toprettyxml(). :
from html5lib import HTMLParser, treebuilders
from cStringIO import StringIO
def tidy_html(text):
    """Return a pretty-printed version of an HTML fragment.

    The fragment is parsed with html5lib's "dom" tree builder via
    ``parseFragment()``, and every top-level node of the result is
    serialized with its own ``toprettyxml()`` call; the pieces are
    concatenated in document order.

    Args:
        text: HTML markup to tidy. It is handed to ``parseFragment()``
            unchanged.

    Returns:
        The concatenated pretty-printed markup of all top-level nodes,
        or an empty string when the parsed fragment has no children.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    fragment = parser.parseFragment(text)

    # Gather the serialized nodes in a list and join them once at the end.
    # The previous cStringIO buffer rejects unicode text that is not plain
    # ASCII, and it was left open if serialization raised part-way through.
    pieces = []
    node = fragment.firstChild
    while node is not None:
        # Serialize node by node: the fragment container itself is never
        # asked to pretty-print, only its children are.
        pieces.append(node.toprettyxml(indent=' '))
        node = node.nextSibling
    return "".join(pieces)
Example usage:
>>> text = """<b><i>bold, italic</b></i><div>a div</div>"""
>>> tidy_html(text)
<b>
<i>
bold, italic
</i>
</b>
<div>
a div
</div>
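Why not simply call toprettyxml() on dom_tree? Because the input is a fragment of HTML, not a complete HTML document, parsing it as a full document would wrap it in <head> and <body>. So it is parsed with parseFragment(), which returns a DocumentFragment (not a Document). A DocumentFragment does not implement writexml() (which toprettyxml() calls), so the function walks the fragment's children and pretty-prints each one in turn.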