How to tell Scrapy to move on to the next start URL

I wrote a Scrapy spider that has many start_urls and extracts email addresses from those URLs. The script takes a long time to execute, so I want to tell Scrapy to stop crawling a particular site as soon as it finds an email address and move on to the next site.

EDIT: code added

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from scrapy.item import Item
    import csv
    from urlparse import urlparse

    from entreprise.items import MailItem

    class MailSpider(CrawlSpider):
        name = "mail"
        start_urls = []
        allowed_domains = []

        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in reader:
                url = row[5].strip()
                if url != "":
                    start_urls.append(url)
                    fragments = urlparse(url).hostname.split(".")
                    hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                    allowed_domains.append(hostname)

        rules = [
            Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
            Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
        ]

        def parse_item(self, response):
            hxs = HtmlXPathSelector(response)
            items = []
            for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
                item = MailItem()
                item['url'] = response.url
                item['mail'] = mail
                items.append(item)
            return items
2 answers

The idea is to use the start_requests method to decide which URLs to crawl next, and to track whether an email has already been parsed for a given hostname in the class-level set parsed_hostnames.

I also changed the way the hostname is extracted from the URL, using urlparse.
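As a quick illustration of that (a minimal sketch with a made-up URL):

    from urlparse import urlparse  # Python 2; use urllib.parse in Python 3

    print(urlparse('http://www.example.com/contact').hostname)
    # -> www.example.com

The full spider then becomes: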

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from scrapy.item import Item, Field
    import csv
    from urlparse import urlparse

    class MailItem(Item):
        url = Field()
        mail = Field()

    class MailSpider(CrawlSpider):
        name = "mail"

        parsed_hostnames = set()
        allowed_domains = []

        rules = [
            Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
            Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
        ]

        def start_requests(self):
            with open('scraped_data.csv', 'rb') as csvfile:
                reader = csv.reader(csvfile, delimiter=',', quotechar='"')
                next(reader)
                for row in reader:
                    url = row[5].strip()
                    if url:
                        hostname = urlparse(url).hostname
                        if hostname not in self.parsed_hostnames:
                            if hostname not in self.allowed_domains:
                                self.allowed_domains.append(hostname)
                                self.rules[0].link_extractor.allow_domains.add(hostname)
                                self.rules[1].link_extractor.allow_domains.add(hostname)
                            yield self.make_requests_from_url(url)
                        else:
                            self.allowed_domains.remove(hostname)
                            self.rules[0].link_extractor.allow_domains.remove(hostname)
                            self.rules[1].link_extractor.allow_domains.remove(hostname)

        def parse_item(self, response):
            hxs = HtmlXPathSelector(response)
            items = []
            for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
                item = MailItem()
                item['url'] = response.url
                item['mail'] = mail
                items.append(item)
            hostname = urlparse(response.url).hostname
            self.parsed_hostnames.add(hostname)
            return items

In theory, this should work. Hope this helps.
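A side note for anyone reading this with a recent Scrapy: the scrapy.contrib and SgmlLinkExtractor paths used above were deprecated and later removed. Roughly, and only as an untested sketch of the modern equivalents:

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor  # replaces SgmlLinkExtractor
    from scrapy import Item, Field
    from urllib.parse import urlparse               # Python 3 home of urlparse

    # HtmlXPathSelector is gone as well; select directly on the response:
    # response.xpath('//body//text()').re(r'[\w.-]+@[\w.-]+')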


I ended up using process_links:

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from scrapy.item import Item, Field
    import csv
    from urlparse import urlparse

    class MailItem(Item):
        url = Field()
        mail = Field()

    class MailSpider(CrawlSpider):
        name = "mail"

        parsed_hostnames = set()

        rules = [
            Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item', process_links='process_links'),
            Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item', process_links='process_links')
        ]

        start_urls = []
        allowed_domains = []

        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in reader:
                url = row[5].strip()
                if url:
                    start_urls.append(url)
                    hostname = urlparse(url).hostname
                    allowed_domains.append(hostname)

        def parse_item(self, response):
            hxs = HtmlXPathSelector(response)
            items = []
            mails = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if mails:
                for mail in mails:
                    item = MailItem()
                    item['url'] = response.url
                    item['mail'] = mail
                    items.append(item)
                hostname = urlparse(response.url).hostname
                self.parsed_hostnames.add(hostname)
            return items

        def process_links(self, links):
            return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
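One caveat worth knowing (this follows from how CrawlSpider rules work rather than from anything stated above): process_links only filters links at extraction time, so requests for a hostname that were already scheduled before its first email was found will still be downloaded. The crawl of that site tapers off instead of halting instantly, which may be acceptable depending on how much time it saves overall.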
