Next-page links keep growing and my Scrapy crawler never stops

I am trying to keep track of the pages of this website where the generation of the next page number is rather strange. Instead of the usual indexing, the following pages look like this:

new/v2.php?cat=69&pnum=2&pnum=3
new/v2.php?cat=69&pnum=2&pnum=3&pnum=4
new/v2.php?cat=69&pnum=2&pnum=3&pnum=4&pnum=5

and as a result, my scraper gets into a loop and never stops, repeatedly scraping the elements of such pages:

DEBUG: Scraped from <200 http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=1&pnum=1&pnum=2&pnum=3>`

etc. Although the scraped elements are correct and serve the purpose, the crawler never stops, requesting the same pages again and again.

my crawler looks like this:

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


from mymobile.items import MymobileItem


class MmobySpider(CrawlSpider):
    """Crawl mymobile.ge category 69 listing pages and scrape phone items.

    The site generates "next page" links by *appending* another pnum
    parameter each time (…&pnum=2&pnum=3&pnum=4…), so every crawled page
    exposes URLs the dupefilter has never seen.  With ``follow=True`` the
    spider therefore never terminates.  ``follow=False`` breaks the loop:
    only links found on the start page are followed, which is enough
    because the first page links to every other page.
    """

    name = "mmoby2"
    allowed_domains = ["mymobile.ge"]
    start_urls = [
        "http://mymobile.ge/new/v2.php?cat=69&pnum=1"
    ]

    # Escape the "." in "v2.php" and stop following links found on the
    # extracted pages -- following them is what caused the endless
    # pnum chains.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r"new/v2\.php\?cat=69&pnum=\d*",)),
             callback="parse_items", follow=False),
    )

    def parse_items(self, response):
        """Extract one item (brand, model, price, url) per listing row.

        :param response: the downloaded listing page.
        :returns: list of populated ``MymobileItem`` instances.
        """
        sel = Selector(response)
        titles = sel.xpath('//table[@width="1000"]//td/table[@class="probg"]')
        items = []
        for t in titles:
            url = t.xpath('tr//a/@href').extract()
            item = MymobileItem()
            item["brand"] = t.xpath('tr[2]/td/text()').re('^([\w\-]+)')
            item["model"] = t.xpath('tr[2]/td/text()').re('\s+(.*)$')
            item["price"] = t.xpath('tr[3]/td//text()').re('^([0-9\.]+)')
            # Hrefs on the page are relative; resolve against the section root.
            item["url"] = urljoin("http://mymobile.ge/new/", url[0])
            items.append(item)
        return items

Any suggestion on how I can tame it?

+4
source share
2 answers

Because every extracted page URL (e.g. http://mymobile.ge/new/v2.php?cat=69&pnum=1 with extra pnum parameters appended) still matches the rule's allow pattern, each newly crawled page yields yet another never-before-seen URL. Setting follow=False on the Rule stops the spider from following links found on those extracted pages, which breaks the loop while still reaching every page linked from the first one.

Here is the modified spider:

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class MmobySpider(CrawlSpider):
    """Minimal spider demonstrating that follow=False stops the crawl loop."""

    name = "mmoby2"
    allowed_domains = ["mymobile.ge"]
    start_urls = [
        "http://mymobile.ge/new/v2.php?cat=69&pnum=1"
    ]

    # With follow=False only links found on the start page are followed,
    # so the ever-growing &pnum chains are never requested.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("new/v2\.php\?cat=69&pnum=\d*",)),
            callback="parse_items",
            follow=False,
        ),
    )

    def parse_items(self, response):
        """Log which page was reached; no item extraction in this demo."""
        sel = Selector(response)
        print(response.url)

Running it with:

scrapy crawl mmoby2

produces the following output — each page is requested exactly once and the spider finishes:

...
2014-05-18 12:20:35+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1> (referer: None)
2014-05-18 12:20:36+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1
2014-05-18 12:20:37+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=4> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=4
2014-05-18 12:20:38+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=2> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=2
2014-05-18 12:20:38+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=5> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=5
2014-05-18 12:20:39+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=3> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=3
2014-05-18 12:20:39+0200 [mmoby2] INFO: Closing spider (finished)
2014-05-18 12:20:39+0200 [mmoby2] INFO: Dumping Scrapy stats:
        {'downloader/request_bytes': 1962,
         'downloader/request_count': 6,
         'downloader/request_method_count/GET': 6,
         ...
+4

Since SgmlLinkExtractor cannot cope with this kind of pagination, you can extract the next-page link yourself with Scrapy selectors/XPath and yield a Request for it manually.

Something like this:

from scrapy.spider import Spider
from scrapy.http import Request

class MmobySpider(Spider):
    """Spider that paginates manually by yielding a Request for the next page."""

    name = "mmoby2"
    allowed_domains = ["mymobile.ge"]
    start_urls = [
        "http://mymobile.ge/new/v2.php?cat=69&pnum=1"
    ]

    def parse(self, response):
        """Yield one item per listing row, then a Request for the next page."""
        sel = Selector(response)

        # One <table class="probg"> per phone listing.
        for block in sel.xpath('//table[@width="1000"]//td/table[@class="probg"]'):
            hrefs = block.xpath('tr//a/@href').extract()
            item = MymobileItem()
            item["brand"] = block.xpath('tr[2]/td/text()').re('^([\w\-]+)')
            item["model"] = block.xpath('tr[2]/td/text()').re('\s+(.*)$')
            item["price"] = block.xpath('tr[3]/td//text()').re('^([0-9\.]+)')
            item["url"] = urljoin("http://mymobile.ge/new/", hrefs[0])
            yield item

        # The pagination cell right after the current-page marker holds the
        # next-page link; follow it if present, otherwise the crawl ends.
        next_links = sel.xpath(
            "//td[span]/following-sibling::td[1]/a[contains(@href, 'num')]/@href"
        ).extract()
        if next_links:
            yield Request(urljoin(response.url, next_links[0]),
                          callback=self.parse)

The XPath may need tweaking for your exact page layout, but the idea should be clear.

+4

Source: https://habr.com/ru/post/1540993/


All Articles