I cannot understand why Scrapy crawls the first page but does not follow the links to crawl the subsequent pages. I suspect it has something to do with the rules. I would be very grateful for any help. Thanks!
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from craigslist_sample.items import CraigslistItem


class MySpider(CrawlSpider):
    """Crawl a Craigslist listing section and collect paragraph links.

    Starts from the URL in ``start_urls`` and follows pagination links
    found inside ``<p id="nextpage">``; every followed page is handed to
    :meth:`parse_items`.
    """

    name = "craig"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/acc/"]

    rules = (
        Rule(
            SgmlLinkExtractor(
                # The previous pattern, "index100\.html", named one literal
                # page, so only that single link could ever be followed and
                # the crawl stopped after the first hop.  Accept any numbered
                # index page instead (presumably index200.html, index300.html,
                # ... -- verify against the site's actual pagination URLs).
                allow=(r"index\d+\.html",),
                restrict_xpaths=('//p[@id="nextpage"]',),
            ),
            callback="parse_items",
            # Keep extracting links from each followed page so the crawl
            # continues past the second page.
            follow=True,
        ),
    )

    # NOTE(review): per the Scrapy CrawlSpider docs, rule callbacks run only
    # for responses reached through extracted links; the start URL's own
    # response is not passed to parse_items unless parse_start_url is
    # overridden -- confirm whether the first page should be scraped too.

    def parse_items(self, response):
        """Build one ``CraigslistItem`` per ``<p>`` element of *response*.

        For every ``<p>`` in the page, the text and ``href`` of its direct
        ``<a>`` children are extracted (each as a list of strings) and
        stored under the ``"title"`` and ``"link"`` fields respectively.

        Returns:
            A list of populated ``CraigslistItem`` objects, one per ``<p>``.
        """
        hxs = HtmlXPathSelector(response)
        paragraphs = hxs.select("//p")
        items = []
        # Use a distinct loop variable; the original rebound the collection
        # name itself on every iteration.
        for paragraph in paragraphs:
            item = CraigslistItem()
            item["title"] = paragraph.select("a/text()").extract()
            item["link"] = paragraph.select("a/@href").extract()
            items.append(item)
        return items


# Module-level instance kept as in the original file.
spider = MySpider()
source share