Scrapy: deny / ignore link with a specific class attribute

Question

Scrapy: deny / ignore link with a specific class attribute

I am browsing a large site using scrapy, and I am looking for a way to reject all tags with the attribute class = "AdvSearchKeyword_clearall".

If it is not possible to reject the tag <a>with class = "AdvSearchKeyword_clearall", could you analyze "AdvSearchKeyword_clearall" to filter all links with a specific attribute later?

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from wallspider.items import Website


class ComSpider(CrawlSpider):
    name = "browsepages"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com",]

    rules = (
    Rule (SgmlLinkExtractor(allow=('/browse/', ),)
    , callback="parse_items", follow= True),
    Rule(SgmlLinkExtractor(allow=(),unique=True,deny=('/[1-9]$', '(bti=)[1-9]+(?:\.[1-9]*)?', '(sort_by=)[a-zA-Z]', '(sort_by=)[1-9]+(?:\.[1-9]*)?', '(ic=32_)[1-9]+(?:\.[1-9]*)?', '(ic=60_)[0-9]+(?:\.[0-9]*)?', '(search_sort=)[1-9]+(?:\.[1-9]*)?', 'browse-ng.do\?', '/page/', '/ip/', 'out\+value', 'fn=', 'customer_rating', 'special_offers', 'search_sort=&', 'facet=' ))),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []

        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)

        return items

+4

python web-crawler web-scraping scrapy

Jason youk Jan 27 '14 at 18:36

source share

1 answer

mktums · Answer 1 · 2014-04-09T07:32:24+0000

You can override with the SgmlLinkExtractorfollowing code:

class ExtendedSgmlLinkExtractor(SgmlLinkExtractor):
    def __init__(self, deny_xpaths=(), *args, **kwargs):
        self.deny_xpaths = deny_xpaths
        super(ExtendedSgmlLinkExtractor, self).__init__(*args, **kwargs)

    def extract_links(self, response):
        from scrapy.utils.response import get_base_url
        base_url = None

        if self.restrict_xpaths:
            sel = Selector(response)
            base_url = get_base_url(response)
            body = u''.join(f
                            for x in self.restrict_xpaths
                            for f in sel.xpath(x).extract()
                            ).encode(response.encoding)
        else:
            body = response.body

        if self.deny_xpaths:
            sel = Selector(response)
            base_url = get_base_url(response)
            body = u''.join(f
                            for x in self.deny_xpaths
                            for f in sel.xpath(x).extract()
                            ).encode(response.encoding)

        links = self._extract_links(body, response.url, response.encoding, base_url)
        links = self._process_links(links)
        return links

and then use it with xpaths that contains the statement not(), in your case - //a[not(contains(@class, 'AdvSearchKeyword_clearall'))]for example:

rules = (Rule(ExtendedSgmlLinkExtractor(…, deny_xpaths=('//a[not(contains(@class, 'AdvSearchKeyword_clearall'))]',),)

, , .

Scrapy: deny / ignore link with a specific class attribute

More articles: