2 .
1. The Bio url is produced by JS. Example page that relies on JS:
https://votesmart.org/candidate/126288/derek-stanford
The href attribute does not contain the URL; it holds only a placeholder:
<a href="#" class="folder" id="folder-bio">
url, xpath "/comment()", url .
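Fortunately, the URLs are predictable, so the bio url can be derived from the person url: replace "/candidate/" with "/candidate/biography/".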
NB! Keep in mind that Scrapy does not execute JS on its own.
2. . " = ", "parse_person" "parse_bio".
. , . :
- ( ) "items" . Scrapy .
- "sel = Selector (response)" , .
Tested with Scrapy 1.0 and Python 3.5.
from scrapy import Spider, Request
class VSSpider(Spider):
    """Crawl the votesmart.org Washington state legislative officials listing.

    For every link found under an ``h5`` heading on the start page the spider
    requests the linked person page, reads the summary cells from it, then
    requests the matching biography page and returns one item per person.
    """

    name = "vs5"
    allowed_domains = ["votesmart.org"]
    start_urls = ["https://votesmart.org/officials/WA/L/washington-state-legislative"]

    def parse(self, response):
        """Yield a request to ``parse_person`` for each ``h5 a`` link."""
        for href in response.css('h5 a::attr(href)').extract():
            # hrefs may be relative; resolve them against the page URL.
            person_url = response.urljoin(href)
            yield Request(person_url, callback=self.parse_person)

    def parse_person(self, response):
        """Start an item from the person page and request the biography page.

        The text of the ``.span-abbreviated td`` cells is read in document
        order: the first cell is stored as ``current_office`` and the second
        (or ``None`` when absent) as ``running_for``.  When no cells match,
        neither field is set.

        Returns:
            A request for the biography page carrying the partially filled
            item in ``meta['item']``.
        """
        # NOTE(review): LegislatorsItems is not imported in the visible part
        # of the file; presumably a scrapy Item declared in the project's
        # items module -- confirm and import it.
        item = LegislatorsItems()

        desc_rows = response.css('.span-abbreviated td::text').extract()
        if desc_rows:
            item['current_office'] = desc_rows[0]
            item['running_for'] = desc_rows[1] if len(desc_rows) > 1 else None

        # The biography link on the page has href="#", so the biography URL
        # is built from the person URL by inserting the "biography/" segment.
        bio_url = response.url.replace(
            'votesmart.org/candidate/',
            'votesmart.org/candidate/biography/',
        )
        return Request(bio_url, callback=self.parse_bio, meta={'item': item})

    def parse_bio(self, response):
        """Complete the item passed through ``meta`` with biography data.

        Stores under ``tester`` the serialized ``<li>`` elements that are the
        third ``<li>`` child of their parent inside the ``.item.first`` nodes.

        Returns:
            The item received in ``response.meta['item']`` with ``tester`` set.
        """
        item = response.meta['item']
        # The XPath must start with "." to stay relative to the CSS match;
        # a leading "//" would search the whole document and ignore it.
        item['tester'] = response.css('.item.first').xpath('.//li[3]').extract()
        self.logger.debug("Scraped bio item: %r", item)
        return item