Beautiful soup open the whole url with pid in it

I try to open all links via pid, but there are two situations:

  • In the first version it collects every URL on the page — including unwanted ones:

    def get_links(self):
        """Collect every usable link on the current page.

        Resolves relative hrefs against the page's scheme+host, skips
        javascript/utility links, strips URL fragments, and de-duplicates.

        Returns:
            list[str]: re-encoded absolute URLs, in document order.
        """
        links = []
        parsed = urlparse(self.url)
        domain_link = parsed.scheme + '://' + parsed.hostname

        # Substrings marking links we never want to crawl.
        skip_tokens = ('javascript:', 'forgotpassword', 'images',
                       'seller-account', 'review')

        for a in self.soup.find_all(href=True):
            # Strip FIRST: a whitespace-only href such as "  " previously
            # passed the length check, was stripped to "" in the else
            # branch, and then crashed on href[0].
            href = a['href'].strip()
            if len(href) <= 1:
                continue
            if any(token in href.lower() for token in skip_tokens):
                continue

            # Absolutize relative references against the page's origin.
            if href[0] == '/':
                href = domain_link + href
            elif href[:4] != 'http':
                href = domain_link + '/' + href

            # Drop the fragment; it never changes the fetched resource.
            if '#' in href:
                href = href[:href.index('#')]

            if href not in links:
                links.append(self.re_encode(href))

        return links
    
  • In the second version it only collects URLs that contain pid — but then the crawler never follows the other internal links and stays stuck on the main page. It only starts working after some pid links have been opened.

    def get_links(self):
        """Collect only the product links (hrefs containing '/pid/').

        NOTE(review): restricting collection to pid links means the crawler
        cannot traverse listing/category pages to *discover* pid links — if
        the start page has none, it goes nowhere. Consider crawling all
        internal links but saving only the pid ones.

        Returns:
            list[str]: re-encoded absolute pid URLs, in document order.
        """
        links = []
        parsed = urlparse(self.url)
        domain_link = parsed.scheme + '://' + parsed.hostname
        pattern = re.compile(r'/pid/')
        skip_tokens = ('javascript:', 'forgotpassword', 'images',
                       'seller-account', 'review')

        for a in self.soup.find_all(href=True):
            # Guard clause replaces the original's extra nesting level,
            # whose first line was mis-indented and raised IndentationError.
            if pattern.search(a['href']) is None:
                continue
            # Strip before the length check so a whitespace-only href
            # cannot crash later on href[0].
            href = a['href'].strip()
            if len(href) <= 1:
                continue
            if any(token in href.lower() for token in skip_tokens):
                continue

            # Absolutize relative references against the page's origin.
            if href[0] == '/':
                href = domain_link + href
            elif href[:4] != 'http':
                href = domain_link + '/' + href

            # Drop the fragment identifier.
            if '#' in href:
                href = href[:href.index('#')]

            if href not in links:
                links.append(self.re_encode(href))

        return links
    

Can someone help me collect all the links (including internal links) so the crawler can traverse the site, but in the end return only the links that contain pid?

+4
source share
2 answers

Why not simply check whether the href contains pid first, and only then do the rest of the processing? Something like this:

def get_links(self):
    """Return re-encoded absolute URLs for every href containing '/pid/'.

    Links are resolved against the scheme+host of self.url; utility links
    (javascript, password reset, images, seller account, reviews) are
    skipped, fragments are removed, and duplicates are dropped.
    """
    parsed = urlparse(self.url)
    base = parsed.scheme + '://' + parsed.hostname
    collected = []

    for anchor in self.soup.find_all(href=True):
        href = anchor['href']
        if not href or len(href) <= 1:
            continue

        lowered = href.lower()
        # Only product links are interesting; everything else is ignored.
        if '/pid/' not in lowered:
            continue
        if any(token in lowered
               for token in ('javascript:', 'forgotpassword', 'images',
                             'seller-account', 'review')):
            continue

        # Make the reference absolute against the page's origin.
        if href.startswith('/'):
            href = (base + href).strip()
        elif href.startswith('http'):
            href = href.strip()
        else:
            href = (base + '/' + href).strip()

        # Discard any fragment identifier.
        href = href.split('#', 1)[0].strip()

        if href not in collected:
            collected.append(self.re_encode(href))

    return collected

And if you want to explicitly skip every link that does not contain pid, you can add:

else:
        continue
0

Another approach: do the pid check inside each branch, for example:

    # NOTE(review): fragment from the second answer — assumes `self`,
    # `links`, and `domain_link` are already defined in the enclosing
    # method; verify against the surrounding code. Also note it filters
    # on "?pid=" (query form), not "/pid/" (path form) like the question.
    for a in self.soup.find_all(href=True):            
        href = a['href']
        if not href or len(href) <= 1:
            continue
        # Site-relative reference: absolutize, then filter.
        if href[0] == '/':
            href = (domain_link + href).strip()   
            if href.lower().find("?pid=") != -1:
                href = href.strip()
            elif 'javascript:' in href.lower():
                continue
            elif 'reviews' in href.lower():
                continue
        elif href[:4] == 'http':
            # NOTE(review): when an absolute URL lacks "?pid=" nothing
            # happens here, yet the link is still appended below — this
            # branch does not actually filter non-pid links.
            if href.lower().find("?pid=") != -1:
                href = href.strip()
        elif href[0] != '/' and href[:4] != 'http':
            href = ( domain_link + '/' + href ).strip()
            # NOTE(review): same issue — the "?pid=" test only re-strips;
            # non-pid links fall through and are appended anyway.
            if href.lower().find("?pid=") != -1:
                href = href.strip()
        # Drop the fragment identifier, if any.
        if '#' in href:
            indx = href.index('#')
            href = href[:indx].strip()
        # Skip duplicates already in the result list.
        if href in links:
            continue
        links.append(self.re_encode(href))
0

Source: https://habr.com/ru/post/1606276/


All Articles