Scrapy: yield both items and callback requests from the same spider

Disclaimer: I'm pretty new to both Python and Scrapy.

I'm trying to get my spider to collect URLs from the start URL, follow the collected URLs, and both:

  • Scraping the next page for specific items (and eventually returning them)
  • collect more specific URLs from the next page and follow these URLs.

I want to be able to continue this process of receiving both elements and callback requests, but I'm not quite sure how to do this. Currently, my code only returns urls and there are no elements. I am obviously doing something wrong. Any feedback would be greatly appreciated.

class VSSpider(scrapy.Spider):
    """Question's spider: gather legislator URLs from the listing page,
    follow each one, scrape office fields, then follow the bio link.
    As posted it only ever emits requests, never items -- see the
    review notes inline."""
    name = "vs5"
    allowed_domains = ["votesmart.org"]
    start_urls = [
                  "https://votesmart.org/officials/WA/L/washington-state-legislative#.V8M4p5MrKRv",
                  ]

    def parse(self, response):
        """Yield a request for each individual legislator page."""
        sel = Selector(response)  # NOTE(review): unused; response.xpath() works directly
        #this gathers links to the individual legislator pages, it works
        for href in response.xpath('//h5/a/@href'): 
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse1)

    def parse1(self, response):
        """Scrape office fields from the legislator page, then follow the bio link."""
        sel = Selector(response)  # NOTE(review): unused, and shadowed by the loop variable below
        items = []
        #these xpaths are on the next page that the spider should follow, when it first visits an individual legislator page
        for sel in response.xpath('//*[@id="main"]/section/div/div/div'):
            item = LegislatorsItems()
            # NOTE(review): '//tr[...]' is document-absolute, not relative to `sel`,
            # so every iteration extracts the same data -- presumably './/tr[...]' was meant.
            item['current_office'] = sel.xpath('//tr[1]/td/text()').extract()
            item['running_for'] = sel.xpath('//tr[2]/td/text()').extract()
            items.append(item)
        #this is the xpath to the biography of the legislator, which it should follow and scrape next
        # NOTE(review): `items` is only ever passed along via meta; if this loop
        # matches nothing (or the href is just a "#" placeholder), the collected
        # items are silently dropped -- likely why no items come out.
        for href in response.xpath('//*[@id="folder-bio"]/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse2, meta={'items': items})

    def parse2(self, response):
        """Scrape the biography page and return the accumulated items."""
        sel = Selector(response)  # NOTE(review): unused, shadowed by the loop variable
        items = response.meta['items']
        #this is an xpath on the biography page
        # NOTE(review): the trailing '/' makes this xpath expression invalid,
        # so the loop body never executes and nothing is returned.
        for sel in response.xpath('//*[@id="main"]/section/div[2]/div/div[3]/div/'):
            item = LegislatorsItems()
            item['tester'] = sel.xpath('//div[2]/div[2]/ul/li[3]').extract()
            items.append(item)
            return items  # NOTE(review): return inside the loop -- exits after the first match

Thanks!

+4
1

There are two problems with your code.

1. The bio URL is inserted by JS. You can see this on a page such as https://votesmart.org/candidate/126288/derek-stanford:

the href attribute does not contain a real URL — the actual link is hidden inside an HTML comment.

<a href="#" class="folder" id="folder-bio">
<!--<a href='/candidate/biography/126288/derek-stanford' itemprop="url" class='more'>
           See Full Biographical and Contact Information</a>-->

You could extract that commented-out markup with an xpath ending in "/comment()" and then pull the url out of the comment text.

But there is an easier way to build the bio URL: it is the candidate's own URL with "/candidate/" replaced by "/candidate/biography/".

NB: plain Scrapy does not execute JS, so a page may look different in your browser than to the spider; disabling JS in the browser is a quick way to see what Scrapy actually receives.


2. Code structure. I renamed the callbacks from "parse1"/"parse2" to the descriptive "parse_person" and "parse_bio".

Beyond that, a couple of smaller remarks:

  • The intermediate "items" list is unnecessary — Scrapy processes each yielded item on its own, so create one item and pass it through meta.
  • "sel = Selector(response)" is redundant — response.xpath()/response.css() can be called directly.

Checked with Scrapy 1.0 and Python 3.5 — it works.

from scrapy import Spider, Request

class VSSpider(Spider):
    """Spider for votesmart.org: walks from the state listing page to each
    legislator's page, then on to the biography page, carrying a single
    item per person through the chain via request meta."""

    name = "vs5"
    allowed_domains = ["votesmart.org"]
    start_urls = ["https://votesmart.org/officials/WA/L/washington-state-legislative"]

    def parse(self, response):
        """Queue a request for every legislator linked from the listing."""
        links = response.css('h5 a::attr(href)').extract()
        for link in links:
            yield Request(response.urljoin(link), callback=self.parse_person)

    def parse_person(self, response):  # former "parse1"
        """Fill the office fields, then hop to the biography page."""
        # A single item is shared between this callback and parse_bio.
        item = LegislatorsItems()

        # Text cells from the left-hand summary table.
        desc_rows = response.css('.span-abbreviated td::text').extract()
        if desc_rows:
            item['current_office'] = desc_rows[0]
            if len(desc_rows) > 1:
                item['running_for'] = desc_rows[1]
            else:
                item['running_for'] = None

        # The bio page lives at .../candidate/biography/<id>/<slug>.
        bio_url = response.url.replace('votesmart.org/candidate/',
                                       'votesmart.org/candidate/biography/')
        return Request(bio_url, callback=self.parse_bio, meta={'item': item})

    def parse_bio(self, response):  # former "parse2"
        """Add the 'tester' field from the bio page and emit the finished item."""
        item = response.meta['item']
        item['tester'] = response.css('.item.first').xpath('//li[3]').extract()
        print(item)   # for python 2: print item
        return item
+1

Source: https://habr.com/ru/post/1653233/


All Articles