Python 3.2 HTML parser

I'm relatively new to Python, and I'm trying to use html.parser as follows:

from html.parser import HTMLParser
import urllib.request


class TestParser(HTMLParser):
    """Echo every parser event, using HTMLParser's default constructor."""

    def handle_starttag(self, tag, attrs):
        """Print an opening tag together with its attribute list."""
        print("Start Tag: ", tag, attrs)

    def handle_endtag(self, tag):
        """Print a closing tag."""
        print("End Tag: ", tag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Data: ", data)

    def handle_startendtag(self, tag, attrs):
        """Print a self-closing tag such as ``<br/>``."""
        print("StarEnd Tag: ", tag, attrs)


class DanParser(HTMLParser):
    """Echo every parser event and track whether we are inside ``<select>``."""

    def __init__(self):
        # NOTE: ``strict`` only exists on Python 3.2-3.4 (it was deprecated
        # in 3.3 and removed in 3.5); on later versions this call raises
        # TypeError.  It is kept because it is the setting being asked about.
        super(DanParser, self).__init__(strict=False)
        self.in_select = False  # True between <select> and </select>

    def handle_starttag(self, tag, attrs):
        """Print an opening tag; print it again when it opens a select."""
        print("Start Tag: ", tag, attrs)
        if tag == "select":
            self.in_select = True
            print("Start Tag: ", tag, attrs)

    def handle_endtag(self, tag):
        """Print a closing tag; print it again when it closes a select."""
        print("EndTag: ", tag)
        if tag == "select" and self.in_select:
            self.in_select = False
            print("EndTag: ", tag)

    def handle_data(self, data):
        """Print text data; print it again while inside a select."""
        print("Data: ", data)
        if self.in_select:
            print("Data: ", data)

    def handle_startendtag(self, tag, attrs):
        """Print a self-closing tag such as ``<br/>``."""
        print("StarEnd Tag: ", tag, attrs)

When I run the following in the interpreter:

 # Python has no ``new`` keyword: calling the class creates the instance.
 t = DanParser()
 # Feed a minimal document containing one <select> element.
 t.feed("<select>test</select>")

I'm getting:

 Data: <select> Data: test EndTag: select 

the handle_starttag method is not called, but when I do this using TestParser it behaves correctly. Can someone tell me what I am doing wrong? thanks

+4
source share
3 answers

This is due to strict = False. When strict = False, the start-tag handlers are not called. They are called when strict = True. This may be a bug in Python's HTML parser. moul's example works because it uses the default value of True for strict.

+2
source

The following code works for me:

 try:
     # Python 3 location of the parser (the question targets Python 3.2).
     from html.parser import HTMLParser
 except ImportError:
     # Python 2 fallback: the module was named ``HTMLParser`` there.
     from HTMLParser import HTMLParser


 class DanParser(HTMLParser):
     """Echo every parser event and track whether we are inside ``<select>``."""

     def __init__(self):
         # Use the parser's default settings (no ``strict`` argument).
         HTMLParser.__init__(self)
         self.in_select = False  # True between <select> and </select>

     def handle_starttag(self, tag, attrs):
         """Print an opening tag; print it again when it opens a select."""
         print("Start Tag: ", tag, attrs)
         if tag == "select":
             self.in_select = True
             print("Start Tag: ", tag, attrs)

     def handle_endtag(self, tag):
         """Print a closing tag; print it again when it closes a select."""
         print("EndTag: ", tag)
         if tag == "select" and self.in_select:
             self.in_select = False
             print("EndTag: ", tag)

     def handle_data(self, data):
         """Print text data; print it again while inside a select."""
         print("Data: ", data)
         if self.in_select:
             print("Data: ", data)

     def handle_startendtag(self, tag, attrs):
         """Print a self-closing tag such as ``<br/>``."""
         print("StarEnd Tag: ", tag, attrs)


 # Demonstration: the start-tag handler fires with the default constructor.
 t = DanParser()
 t.feed("<select>test</select>")
+1
source

Tip: make it easy on yourself and don't use html.parser. Try lxml.html, html5lib, or BeautifulSoup instead.

+1
source

Source: https://habr.com/ru/post/1401760/


All Articles