I am trying to crawl only the links that contain "pid", but I end up in one of two situations.
In the first situation, it opens every URL (I mean even the unwanted URLs):
def get_links(self):
    """Collect the absolute, de-duplicated URLs linked from the parsed page.

    Reads ``self.url`` (address of the current page) and ``self.soup``
    (queried with ``find_all(href=True)``). Every kept link is passed
    through ``self.re_encode`` before it is stored.

    A link is skipped when, after stripping whitespace, it is empty or a
    single character, or when it contains ``javascript:``,
    ``forgotpassword``, ``images``, ``seller-account`` or ``review``
    (case-insensitive). Anything from the first ``#`` onwards is removed.

    Returns:
        list: Encoded absolute URLs in document order, without duplicates.
    """
    skip_markers = ('javascript:', 'forgotpassword', 'images',
                    'seller-account', 'review')
    parsed = urlparse(self.url)
    # NOTE: ``hostname`` is lower-cased and carries no port, so links built
    # from it lose any explicit port present in self.url.
    domain_link = parsed.scheme + '://' + parsed.hostname

    links = []
    seen = set()
    for a in self.soup.find_all(href=True):
        # Strip before the length check so whitespace-only values are
        # rejected here instead of raising IndexError on href[0] below.
        href = (a['href'] or '').strip()
        if len(href) <= 1:
            continue
        lowered = href.lower()
        if any(marker in lowered for marker in skip_markers):
            continue

        if href[:2] == '//':
            # Protocol-relative link: it names its own host, so only the
            # scheme of the current page is borrowed.
            href = parsed.scheme + ':' + href
        elif href[0] == '/':
            href = domain_link + href
        elif href[:4] != 'http':
            href = domain_link + '/' + href

        # Drop the fragment; it does not identify a different page.
        href = href.split('#', 1)[0].strip()

        # Compare encoded values: the result list holds encoded URLs, so
        # checking the raw string would let duplicates through.
        encoded = self.re_encode(href)
        if encoded in seen:
            continue
        seen.add(encoded)
        links.append(encoded)
    return links
In the second situation, it only opens the URLs that contain "pid", but it does not follow any other links, so it stays limited to the main page. It only works once some links with "pid" have been opened.
def get_links(self):
    """Collect the absolute, de-duplicated ``/pid/`` URLs linked from the page.

    Reads ``self.url`` (address of the current page) and ``self.soup``
    (queried with ``find_all(href=True)``). Only hrefs containing ``/pid/``
    are considered, and every kept link is passed through
    ``self.re_encode`` before it is stored.

    A link is skipped when, after stripping whitespace, it is empty or a
    single character, or when it contains ``javascript:``,
    ``forgotpassword``, ``images``, ``seller-account`` or ``review``
    (case-insensitive). Anything from the first ``#`` onwards is removed.

    NOTE(review): if the caller also uses this list to decide which pages
    to visit next, pages without ``/pid/`` in their URL will never be
    crawled -- confirm against the caller.

    Returns:
        list: Encoded absolute URLs in document order, without duplicates.
    """
    skip_markers = ('javascript:', 'forgotpassword', 'images',
                    'seller-account', 'review')
    pattern = re.compile(r'(/pid/)')
    parsed = urlparse(self.url)
    # NOTE: ``hostname`` is lower-cased and carries no port, so links built
    # from it lose any explicit port present in self.url.
    domain_link = parsed.scheme + '://' + parsed.hostname

    links = []
    seen = set()
    for a in self.soup.find_all(href=True):
        # Strip before the length check so whitespace-only values are
        # rejected here instead of raising IndexError on href[0] below.
        href = (a['href'] or '').strip()
        if len(href) <= 1:
            continue
        if pattern.search(href) is None:
            continue
        lowered = href.lower()
        if any(marker in lowered for marker in skip_markers):
            continue

        if href[:2] == '//':
            # Protocol-relative link: it names its own host, so only the
            # scheme of the current page is borrowed.
            href = parsed.scheme + ':' + href
        elif href[0] == '/':
            href = domain_link + href
        elif href[:4] != 'http':
            href = domain_link + '/' + href

        # Drop the fragment; it does not identify a different page.
        href = href.split('#', 1)[0].strip()

        # Compare encoded values: the result list holds encoded URLs, so
        # checking the raw string would let duplicates through.
        encoded = self.re_encode(href)
        if encoded in seen:
            continue
        seen.add(encoded)
        links.append(encoded)
    return links
Can someone help me crawl all the links, including the internal links found on each page, but in the end return only the links that contain "pid"?
source
share