import re

from BeautifulSoup import BeautifulSoup

# Sample document: one non-link tag, one relative link, and two links whose
# href starts with "http" (one of which mentions "google").
html = """
<div>hello</div>
<a href="/index.html">Not this one</a>"
<a href="http://google.com">Link 1</a>
<a href="http:/amazon.com">Link 2</a>
"""


def processor(tag):
    """Return True when *tag* has a non-empty href that lacks "google".

    Tags with no href attribute (or an empty one) are rejected, as are
    tags whose href contains the substring "google".
    """
    href = tag.get('href')
    if not href:
        return False
    return "google" not in href


soup = BeautifulSoup(html)
# Keep only the tags accepted by processor() whose href also matches the
# "starts with http" pattern.
back_links = soup.findAll(processor, href=re.compile(r"^http"))
print(back_links)

# --output:--
# [<a href="http:/amazon.com">Link 2</a>]
However, it may be more efficient to simply get all the links starting with http, and then filter those links down to the ones that don't have "google" in their hrefs:
# First collect every <a> tag whose href starts with "http" ...
http_links = soup.findAll('a', href=re.compile(r"^http"))
# ... then drop the ones whose href contains "google".
results = [a for a in http_links if 'google' not in a['href']]
print(results)

# --output:--
# [<a href="http:/amazon.com">Link 2</a>]
7stud source share