Get html under tag using python htmlparser

I want to get all the HTML under a tag using HTMLParser. Currently I can only get the text data between the tags; the following is my code:

class LinksParser(HTMLParser):
    """Collect the text found inside ``<span itemprop="description">``.

    ``recording`` is the current nesting depth of ``<span>`` elements
    counted from the matching description span (0 means "outside").
    ``data`` accumulates the text seen while that depth is non-zero;
    markup is not kept, only character data.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = ''
        self.recording = 0

    def handle_starttag(self, tag, attributes):
        """Start recording on the description span; track nested spans."""
        if tag != 'span':
            return
        if self.recording:
            # Already inside the target span: remember one more level so the
            # matching </span> does not end the recording early.
            self.recording += 1
        elif any(name == 'itemprop' and value == 'description'
                 for name, value in attributes):
            self.recording = 1

    def handle_endtag(self, tag):
        """Leave one span level when a ``</span>`` closes while recording."""
        if self.recording and tag == 'span':
            self.recording -= 1

    def handle_data(self, data):
        """Append character data while inside the description span."""
        if not self.recording:
            return
        self.data += data

I also need the HTML tags that are inside the input, for example:

 <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> </span> 

When this is provided as input, my code gives me only the text data, without the tags. Is there any way I can get the whole HTML between the tags?

+4
source share
2 answers

You can feed the parser events into xml.etree.ElementTree.TreeBuilder and then use the etree API to find and manipulate the <span> element:

 import sys

 try:  # Python 3
     from html.parser import HTMLParser
 except ImportError:  # Python 2
     from HTMLParser import HTMLParser
 # cElementTree was removed in Python 3.9; ElementTree exists everywhere.
 from xml.etree import ElementTree as etree


 class LinksParser(HTMLParser):
     """Turn HTMLParser events into an ElementTree via ``TreeBuilder``.

     Every start tag, end tag and text chunk is forwarded to ``self.tb``.
     ``close()`` returns the root element of the tree that was built.
     """

     def __init__(self):
         HTMLParser.__init__(self)
         self.tb = etree.TreeBuilder()

     def handle_starttag(self, tag, attributes):
         self.tb.start(tag, dict(attributes))

     def handle_endtag(self, tag):
         self.tb.end(tag)

     def handle_data(self, data):
         self.tb.data(data)

     def close(self):
         """Finish parsing and return the root element of the built tree."""
         HTMLParser.close(self)
         return self.tb.close()


 def find_description_span(root):
     """Return the first ``<span itemprop="description">`` in *root*, or None.

     ``root.find(".//span[...]")`` only looks at descendants, so it misses
     the case where the span is the root itself. ``iter()`` walks the root
     and all descendants in document order.
     """
     for element in root.iter("span"):
         if element.get("itemprop") == "description":
             return element
     return None


 def to_text(element):
     """Serialize *element* to a value that can be written to sys.stdout."""
     if sys.version_info[0] >= 3:
         # Without encoding="unicode", tostring() returns bytes on Python 3.
         return etree.tostring(element, encoding="unicode")
     return etree.tostring(element)


 def main():
     """Read HTML from stdin and print the description span with its markup."""
     parser = LinksParser()
     parser.feed(sys.stdin.read())
     root = parser.close()
     span = find_description_span(root)
     if span is None:
         sys.exit('no <span itemprop="description"> element found')
     sys.stdout.write(to_text(span))


 if __name__ == "__main__":
     main()

Output

 <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br /><br />paragraph.</p> </span> 

To print without the parent (root) <span> :

 # span.text is None when the span starts directly with a child tag,
 # and sys.stdout.write(None) would raise TypeError.
 sys.stdout.write(span.text or "")
 for child in span:
     # tostring() also emits the child's tail text, so nothing is lost.
     sys.stdout.write(etree.tostring(child))  # add encoding="unicode" on Python 3
+5
source

Here's something that does the job for the test data you provided, with minimal changes to your existing code (assuming it already basically does what you want). You will probably want to extend it to handle self-closing tags more thoroughly:

 try:  # Python 3
     from html.parser import HTMLParser
 except ImportError:  # Python 2
     from HTMLParser import HTMLParser


 class LinksParser(HTMLParser):
     """Collect the inner HTML of ``<span itemprop="description">``.

     ``recording`` is the nesting depth of ``<span>`` elements counted from
     the matching description span (0 means "outside"). While it is
     non-zero, text, entities and tags are appended to ``data``; the
     description span's own start and end tags are not included.
     Tags listed in ``self_closing_tags`` are written as ``<tag/>`` and
     never get a separate end tag.
     """

     def __init__(self):
         HTMLParser.__init__(self)
         # Keep "&amp;" and friends as written instead of decoding them into
         # handle_data(); the attribute is simply unused on Python 2.
         self.convert_charrefs = False
         self.recording = 0
         self.data = ''
         self.self_closing_tags = ("br",)

     def _start_tag_markup(self, tag, attributes):
         """Return the markup for a start tag with its attributes."""
         parts = [tag]
         for name, value in attributes:
             if value is None:
                 # Valueless attribute such as <input disabled>.
                 parts.append(name)
             else:
                 # The parser hands over decoded values; re-escape them.
                 escaped = value.replace('&', '&amp;').replace('"', '&quot;')
                 parts.append('%s="%s"' % (name, escaped))
         closer = "/>" if tag in self.self_closing_tags else ">"
         return "<" + " ".join(parts) + closer

     def handle_starttag(self, tag, attributes):
         if self.recording:
             if tag == 'span':
                 # Nested span: track depth so its </span> does not stop us.
                 self.recording += 1
             self.data += self._start_tag_markup(tag, attributes)
             return
         if tag != 'span':
             return
         if any(name == 'itemprop' and value == 'description'
                for name, value in attributes):
             self.recording = 1

     def handle_endtag(self, tag):
         # Self-closing tags were fully written by handle_starttag().
         if not self.recording or tag in self.self_closing_tags:
             return
         if tag == 'span':
             self.recording -= 1
             if not self.recording:
                 # This closes the description span itself; do not emit it.
                 return
         self.data += "</%s>" % (tag,)

     def handle_data(self, data):
         if self.recording:
             self.data += data

     def handle_entityref(self, name):
         if self.recording:
             self.data += "&%s;" % (name,)

     def handle_charref(self, name):
         if self.recording:
             self.data += "&#%s;" % (name,)

Given this as input:

 <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> </span> 

output:

 <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> 
+3
source

Source: https://habr.com/ru/post/1445376/


All Articles