Beautiful soup open the whole url with pid in it

I try to open all links via pid, but there are two situations:

  • In the first version it collects every URL on the page — including unwanted ones:

    def get_links(self):
        """Collect every usable link on the current page.

        Resolves relative hrefs against the page's scheme+host, skips
        javascript/utility links, strips URL fragments, and de-duplicates.

        Returns:
            list[str]: re-encoded absolute URLs, in document order.
        """
        links = []
        parsed = urlparse(self.url)
        domain_link = parsed.scheme + '://' + parsed.hostname

        # Substrings marking links we never want to crawl.
        skip_tokens = ('javascript:', 'forgotpassword', 'images',
                       'seller-account', 'review')

        for a in self.soup.find_all(href=True):
            # Strip FIRST: a whitespace-only href such as "  " previously
            # passed the length check, was stripped to "" in the else
            # branch, and then crashed on href[0].
            href = a['href'].strip()
            if len(href) <= 1:
                continue
            if any(token in href.lower() for token in skip_tokens):
                continue

            # Absolutize relative references against the page's origin.
            if href[0] == '/':
                href = domain_link + href
            elif href[:4] != 'http':
                href = domain_link + '/' + href

            # Drop the fragment; it never changes the fetched resource.
            if '#' in href:
                href = href[:href.index('#')]

            if href not in links:
                links.append(self.re_encode(href))

        return links
    
  • In the second version it only collects URLs that contain pid — but then the crawler never follows the other internal links and stays stuck on the main page. It only starts working after some pid links have been opened.

    def get_links(self):
        """Collect only the product links (hrefs containing '/pid/').

        NOTE(review): restricting collection to pid links means the crawler
        cannot traverse listing/category pages to *discover* pid links — if
        the start page has none, it goes nowhere. Consider crawling all
        internal links but saving only the pid ones.

        Returns:
            list[str]: re-encoded absolute pid URLs, in document order.
        """
        links = []
        parsed = urlparse(self.url)
        domain_link = parsed.scheme + '://' + parsed.hostname
        pattern = re.compile(r'/pid/')
        skip_tokens = ('javascript:', 'forgotpassword', 'images',
                       'seller-account', 'review')

        for a in self.soup.find_all(href=True):
            # Guard clause replaces the original's extra nesting level,
            # whose first line was mis-indented and raised IndentationError.
            if pattern.search(a['href']) is None:
                continue
            # Strip before the length check so a whitespace-only href
            # cannot crash later on href[0].
            href = a['href'].strip()
            if len(href) <= 1:
                continue
            if any(token in href.lower() for token in skip_tokens):
                continue

            # Absolutize relative references against the page's origin.
            if href[0] == '/':
                href = domain_link + href
            elif href[:4] != 'http':
                href = domain_link + '/' + href

            # Drop the fragment identifier.
            if '#' in href:
                href = href[:href.index('#')]

            if href not in links:
                links.append(self.re_encode(href))

        return links
    

Can someone help me collect all the links (including internal links) so the crawler can traverse the site, but in the end return only the links that contain pid?

+4
source share
2 answers

Why not simply check whether the href contains pid first, and only then do the rest of the processing? Something like this:

def get_links(self):
    """Return re-encoded absolute URLs for every href containing '/pid/'.

    Links are resolved against the scheme+host of self.url; utility links
    (javascript, password reset, images, seller account, reviews) are
    skipped, fragments are removed, and duplicates are dropped.
    """
    parsed = urlparse(self.url)
    base = parsed.scheme + '://' + parsed.hostname
    collected = []

    for anchor in self.soup.find_all(href=True):
        href = anchor['href']
        if not href or len(href) <= 1:
            continue

        lowered = href.lower()
        # Only product links are interesting; everything else is ignored.
        if '/pid/' not in lowered:
            continue
        if any(token in lowered
               for token in ('javascript:', 'forgotpassword', 'images',
                             'seller-account', 'review')):
            continue

        # Make the reference absolute against the page's origin.
        if href.startswith('/'):
            href = (base + href).strip()
        elif href.startswith('http'):
            href = href.strip()
        else:
            href = (base + '/' + href).strip()

        # Discard any fragment identifier.
        href = href.split('#', 1)[0].strip()

        if href not in collected:
            collected.append(self.re_encode(href))

    return collected

And if you want to explicitly skip every link that does not contain pid, you can add:

else:
        continue
0

Another approach: do the pid check inside each branch, for example:

    # NOTE(review): fragment from the second answer — assumes `self`,
    # `links`, and `domain_link` are already defined in the enclosing
    # method; verify against the surrounding code. Also note it filters
    # on "?pid=" (query form), not "/pid/" (path form) like the question.
    for a in self.soup.find_all(href=True):            
        href = a['href']
        if not href or len(href) <= 1:
            continue
        # Site-relative reference: absolutize, then filter.
        if href[0] == '/':
            href = (domain_link + href).strip()   
            if href.lower().find("?pid=") != -1:
                href = href.strip()
            elif 'javascript:' in href.lower():
                continue
            elif 'reviews' in href.lower():
                continue
        elif href[:4] == 'http':
            # NOTE(review): when an absolute URL lacks "?pid=" nothing
            # happens here, yet the link is still appended below — this
            # branch does not actually filter non-pid links.
            if href.lower().find("?pid=") != -1:
                href = href.strip()
        elif href[0] != '/' and href[:4] != 'http':
            href = ( domain_link + '/' + href ).strip()
            # NOTE(review): same issue — the "?pid=" test only re-strips;
            # non-pid links fall through and are appended anyway.
            if href.lower().find("?pid=") != -1:
                href = href.strip()
        # Drop the fragment identifier, if any.
        if '#' in href:
            indx = href.index('#')
            href = href[:indx].strip()
        # Skip duplicates already in the result list.
        if href in links:
            continue
        links.append(self.re_encode(href))
0

Source: https://habr.com/ru/post/1606276/


All Articles