Get html under tag using python htmlparser

Question

Get html under tag using python htmlparser

I want to get all the HTML under a tag using HTMLParser. I can currently get the text data between the tags; the following is my code:

class LinksParser(HTMLParser):
    """Collect the text found inside ``<span itemprop="description">``.

    Only character data is gathered; the markup of nested tags is not kept.
    After feeding a document, the collected text is available as ``data``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Depth of <span> nesting inside the target span; 0 means "outside".
        self.recording = 0
        self.data = ''

    def handle_starttag(self, tag, attributes):
        """Start recording at the target span and track nested spans."""
        if tag != 'span':
            return
        if self.recording:
            # A nested span: count it so its closing tag does not end recording.
            self.recording += 1
            return
        # ``attributes`` is a list of (name, value) pairs.
        if ('itemprop', 'description') in attributes:
            self.recording = 1

    def handle_endtag(self, tag):
        """Leave one level of span nesting; recording stops at depth 0."""
        if tag == 'span' and self.recording:
            self.recording -= 1

    def handle_data(self, data):
        """Append character data while inside the target span."""
        if self.recording:
            self.data += data

I also need the HTML tags inside the element. For example, input like

 <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> </span>

when provided as input, gives me only the text data, without the tags. Is there any way to get the whole HTML between the tags?

+4

python html-parsing

raju Nov 11 '12 at 18:22

source share

2 answers

Here's something that does the job for the test data you provided, with minimal changes to your existing code (assuming it basically does what you want already). You will probably want to extend it to handle self-closing tags more robustly:

 try:
     from HTMLParser import HTMLParser  # Python 2
 except ImportError:
     from html.parser import HTMLParser  # Python 3


 class LinksParser(HTMLParser):
     """Collect the inner HTML of ``<span itemprop="description">``.

     After feeding a document, ``data`` holds the markup found between the
     opening and closing tags of the matching span: text, entity/character
     references, nested start tags with their attributes, and end tags.
     Markup outside the matching span is ignored.
     """

     def __init__(self):
         HTMLParser.__init__(self)
         # Have references such as ``&amp;`` reported through
         # handle_entityref/handle_charref instead of being decoded into the
         # text, so they can be copied through unchanged.
         self.convert_charrefs = False
         # Depth of <span> nesting inside the target span; 0 means "outside".
         self.recording = 0
         self.data = ''
         # Tags written as ``<tag/>`` that never get a separate end tag.
         self.self_closing_tags = ("br",)

     def _format_starttag(self, tag, attributes):
         """Return the start tag rebuilt from its name and attribute pairs."""
         parts = [tag]
         for name, value in attributes:
             if value is None:
                 # Attribute given without a value, e.g. ``<input disabled>``.
                 parts.append(name)
             else:
                 # The parser hands over decoded values; re-escape them so the
                 # rebuilt attribute stays well formed.
                 escaped = value.replace('&', '&amp;').replace('"', '&quot;')
                 parts.append('%s="%s"' % (name, escaped))
         closer = '/>' if tag in self.self_closing_tags else '>'
         return '<' + ' '.join(parts) + closer

     def handle_starttag(self, tag, attributes):
         """Start recording at the target span; copy tags seen while recording."""
         if not self.recording:
             # ``attributes`` is a list of (name, value) pairs.
             if tag == 'span' and ('itemprop', 'description') in attributes:
                 self.recording = 1
             return
         if tag == 'span':
             # Nested span: count it so its end tag does not stop recording.
             self.recording += 1
         self.data += self._format_starttag(tag, attributes)

     def handle_endtag(self, tag):
         """Copy end tags while recording; stop at the target span's end tag."""
         if not self.recording:
             return
         if tag in self.self_closing_tags:
             # Already written in full by handle_starttag.
             return
         if tag == 'span':
             self.recording -= 1
             if not self.recording:
                 # This closes the target span itself, which is not part of
                 # its own inner HTML.
                 return
         self.data += "</%s>" % (tag,)

     def handle_data(self, data):
         """Copy character data while recording."""
         if self.recording:
             self.data += data

     def handle_entityref(self, name):
         """Copy a named reference such as ``&amp;`` through unchanged."""
         if self.recording:
             self.data += "&%s;" % (name,)

     def handle_charref(self, name):
         """Copy a numeric reference such as ``&#38;`` through unchanged."""
         if self.recording:
             self.data += "&#%s;" % (name,)

Given this as input:

 <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> </span>

output:

 <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p>

+3

Bentrofatter Nov 11 '12 at 18:44

source share

jfs · Accepted Answer · 2012-11-11T19:22:20+0000

You can use xml.etree.ElementTree.TreeBuilder to build an element tree from the parser events, and then use the etree API to find and manipulate the <span> element:

 import sys

 try:
     from HTMLParser import HTMLParser  # Python 2
 except ImportError:
     from html.parser import HTMLParser  # Python 3

 try:
     from xml.etree import cElementTree as etree  # Python 2 and older Python 3
 except ImportError:
     from xml.etree import ElementTree as etree

 # Elements that never have content or an end tag in HTML.
 VOID_ELEMENTS = frozenset([
     "area", "base", "br", "col", "embed", "hr", "img", "input",
     "link", "meta", "param", "source", "track", "wbr",
 ])


 class LinksParser(HTMLParser):
     """Feed HTML parser events into an ElementTree ``TreeBuilder``.

     ``close()`` returns the root element of the tree that was built.
     """

     def __init__(self):
         HTMLParser.__init__(self)
         self.tb = etree.TreeBuilder()

     def handle_starttag(self, tag, attributes):
         """Open an element; void elements are closed again immediately."""
         # A valueless attribute (``<input disabled>``) arrives as None, which
         # cannot be serialised later, so store an empty string instead.
         attrs = dict(
             (name, '' if value is None else value)
             for name, value in attributes
         )
         self.tb.start(tag, attrs)
         if tag in VOID_ELEMENTS:
             # ``<br>`` has no end tag; close it here to keep the tree balanced.
             self.tb.end(tag)

     def handle_endtag(self, tag):
         """Close the current element (void elements are already closed)."""
         if tag not in VOID_ELEMENTS:
             self.tb.end(tag)

     def handle_data(self, data):
         """Add character data to the element being built."""
         self.tb.data(data)

     def close(self):
         """Flush the parser and return the root element of the tree."""
         HTMLParser.close(self)
         return self.tb.close()


 def find_description_span(root):
     """Return the first ``<span itemprop="description">`` in the tree, or None.

     ``iter()`` starts at ``root`` itself, so the span is also found when it
     is the top-level element of the input.
     """
     for elem in root.iter('span'):
         if elem.get('itemprop') == 'description':
             return elem
     return None


 def main():
     """Read HTML from stdin and write the description span to stdout."""
     parser = LinksParser()
     parser.feed(sys.stdin.read())
     root = parser.close()
     span = find_description_span(root)
     if span is None:
         sys.exit('no <span itemprop="description"> element found')
     if sys.version_info[0] >= 3:
         # sys.stdout is a text stream on Python 3, so ask for str output.
         etree.ElementTree(span).write(sys.stdout, encoding="unicode")
     else:
         etree.ElementTree(span).write(sys.stdout)


 if __name__ == "__main__":
     main()

Output

 <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br /><br />paragraph.</p> </span>

To print without the parent (root) <span> :

 # ``span.text`` is None when the element starts with a child tag rather than
 # text, so fall back to an empty string instead of passing None to write().
 sys.stdout.write(span.text or '')
 for child in span:
     # tostring() also emits the child's tail text, so the text that follows
     # each child element is preserved.
     sys.stdout.write(etree.tostring(child))  # add encoding="unicode" on Python 3

Get html under tag using python htmlparser

Output

More articles: