The idea is to use the start_requests method to decide which URLs to crawl next. In addition, we track whether an email has already been parsed for a given hostname using the parsed_hostnames class attribute.
I also changed the way the hostname is extracted from the URL, using urlparse.
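As a quick illustration (assuming Python 2, which matches the imports in the spider below), urlparse exposes the hostname directly; the URL here is just a made-up example:

from urlparse import urlparse

# .hostname gives the bare host, without scheme, port or path
hostname = urlparse('http://www.example.com/contact?ref=1').hostname
print(hostname)  # -> 'www.example.com'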
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)  # skip the header row
            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        # no email parsed for this host yet: allow the domain and schedule the request
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)
                        yield self.make_requests_from_url(url)
                    else:
                        # already parsed an email for this host: stop following its links
                        if hostname in self.allowed_domains:
                            self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.discard(hostname)
                        self.rules[1].link_extractor.allow_domains.discard(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        # remember that this hostname has been handled
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items
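In case it helps to see what the email regex in parse_item matches, here is a small standalone sketch using plain re on a made-up string (not part of the spider itself):

import re

text = 'Contact us at info@example.com or sales@example.org'
print(re.findall(r'[\w.-]+@[\w.-]+', text))
# -> ['info@example.com', 'sales@example.org']

You would then run the spider as usual with scrapy crawl mail and export the MailItem results however you normally do.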
This should work, at least in theory. Hope this helps.