Here's something that does the job based on the test data that you provided with minimal changes to your existing code (assuming it basically does what you want already). You will probably want to expand it to use self-closing tags more efficiently:
try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class LinksParser(HTMLParser):
    """Collect the inner markup of ``<span itemprop="description">`` elements.

    After ``feed()`` has been called, ``self.data`` holds the text and tags
    found inside the matching span(s). The ``<span>`` tags themselves
    (the matching one and any spans nested inside it) are not emitted, but
    the text they contain is kept.

    Attributes:
        recording: span nesting depth inside a matching span; 0 means
            nothing is being captured.
        data: the captured markup, accumulated as a string.
        self_closing_tags: tag names that are emitted as ``<tag/>`` and
            never get a separate closing tag.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.recording = 0
        self.data = ''
        self.self_closing_tags = ("br",)

    def handle_starttag(self, tag, attributes):
        """Track span depth and, while recording, re-emit the start tag.

        Args:
            tag: lower-cased tag name supplied by HTMLParser.
            attributes: list of ``(name, value)`` pairs; ``value`` is None
                for attributes written without a value.
        """
        if tag == 'span':
            if self.recording:
                # Nested span: only track depth so that the matching
                # </span> does not end the recording too early.
                self.recording += 1
            elif ('itemprop', 'description') in attributes:
                self.recording = 1
            return
        if not self.recording:
            # Markup outside the description span is not part of the output.
            return
        if tag in self.self_closing_tags:
            # Emitted here (not in handle_endtag) so that a bare <br> with
            # no end tag still shows up, and the depth counter is untouched.
            self.data += "<%s/>" % (tag,)
            return
        self.data += "<%s" % (tag,)
        if attributes:
            self.data += " " + " ".join(
                # A valueless attribute (e.g. <input disabled>) arrives with
                # value None; emit just its name rather than name="None".
                k if v is None else '%s="%s"' % (k, v)
                for k, v in attributes
            )
        self.data += ">"

    def handle_endtag(self, tag):
        """Close the current span level or, while recording, re-emit the end tag.

        Args:
            tag: lower-cased tag name supplied by HTMLParser.
        """
        if not self.recording:
            return
        if tag == 'span':
            self.recording -= 1
            return
        if tag in self.self_closing_tags:
            # Already written as <tag/> by handle_starttag.
            return
        self.data += "</%s>" % (tag,)

    def handle_data(self, data):
        """Append text content to ``self.data`` while recording."""
        if self.recording:
            self.data += data
Given this as input:
<span itemprop="description"> <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> </span>
output:
<h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p>
source share