Yes, you can use Splash with a Scrapy CrawlSpider. The trick is to work on the link URLs themselves: instead of going through the scrapy-splash middleware, use the rule's "process_links" hook to turn every extracted URL into a Splash HTTP API request, and then decode the original URL back in the callback. This worked for me; here is how:
First, import the URL helpers needed to encode the links on the way out and decode them on the way back:
from urllib.parse import urlencode, parse_qs
Since every link will now point at the local Splash endpoint, Scrapy would otherwise filter the requests out as offsite, so make "localhost" an allowed domain:
allowed_domains = ['localhost']
start_urls = ['https://www.example.com/']
Second, because allowed_domains no longer limits the crawl to the target site, the LinkExtractor has to do it: restrict it to the domain you actually want, and point the rule's process_links argument at the conversion method defined below; otherwise the spider would follow links to any domain:
LinkExtractor(allow=r'(http(s)?://)?(.*\.)?{}.*'.format(r'example.com')),
process_links='process_links',
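For context, these two arguments sit inside a crawling rule. A minimal sketch of the rules definition, assuming a callback named parse_item and follow=True (both are illustrative choices, not part of the original snippet):

from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

rules = (
    Rule(
        LinkExtractor(allow=r'(http(s)?://)?(.*\.)?{}.*'.format(r'example.com')),
        process_links='process_links',
        callback='parse_item',  # hypothetical callback name
        follow=True,            # keep following links found on rendered pages
    ),
)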
Third, define process_links itself. It uses urlencode to wrap each extracted URL into a request to Splash's render.html endpoint (with a small wait so the page gets rendered):
def process_links(self, links):
    # Rewrite every extracted link into a Splash render.html request,
    # unless it has already been rewritten.
    for link in links:
        if "http://localhost:8050/render.html?&" not in link.url:
            link.url = "http://localhost:8050/render.html?&" + urlencode({'url': link.url,
                                                                          'wait': 2.0})
    return links
Finally, since response.url now points at Splash rather than at the page itself, recover the original URL in your callback with parse_qs:
parse_qs(response.url)['url'][0]
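For example, a callback could look roughly like this (a minimal sketch; the parse_item name and the title field are just illustrations):

def parse_item(self, response):
    # response.url is the Splash endpoint URL; pull the original page URL
    # back out of the 'url' query parameter added by process_links.
    original_url = parse_qs(response.url)['url'][0]
    yield {
        'url': original_url,
        'title': response.css('title::text').get(),
    }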
One last note: mind the '&' right after the '?' in the Splash URL (...render.html?&). We pass the full response URL, not just its query string, to parse_qs, and that extra '&' separates the endpoint itself from the parameters built by urlencode, so the 'url' key can be read back cleanly.
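A quick interactive check shows why (the page URL here is just an example):

>>> from urllib.parse import urlencode, parse_qs
>>> splash = "http://localhost:8050/render.html?&" + urlencode({'url': 'https://www.example.com/page', 'wait': 2.0})
>>> parse_qs(splash)['url'][0]
'https://www.example.com/page'
>>> # Without the extra '&', the endpoint fuses with the first parameter name:
>>> parse_qs("http://localhost:8050/render.html?" + urlencode({'url': 'https://www.example.com/page'}))
{'http://localhost:8050/render.html?url': ['https://www.example.com/page']}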