How to go to the next page for scraping in PhantomJS

I am trying to scrape multiple items from a website that spans multiple pages. I am currently using PhantomJS for this, and my code almost works, but the problem is that it scrapes the first page twice, even though (according to the log) it seems that I have already moved on to the second page.

Here is the code:

// Scraper script: opens the start page and then alternates between logging the
// link hrefs found in the page (fetch_names) and clicking the "next" link
// (goto_next_page). The two functions call each other, so the cycle never ends
// by itself.
var page = require('webpage').create();
page.viewportSize = { width: 1061, height: 1000 }; //To specify the window size
page.open("website", function () {

    // Logs the href of every element matching 'div.pepitesteasermain h2 a',
    // renders the page to '1.png', and schedules goto_next_page() 5000 ms later.
    function fetch_names(){
        // The callback passed to page.evaluate() is the part that touches
        // `document`; it returns an array of href attribute values.
        var name = page.evaluate(function () {
            return [].map.call(document.querySelectorAll('div.pepitesteasermain h2 a'), function(name){
                return name.getAttribute('href');
            });
        });
        console.log(name.join('\n'));
        // The file name is constant, so every call renders to the same path.
        page.render('1.png');
        // The 5000 ms delay sits here, i.e. between the scrape and the click.
        window.setTimeout(function (){
            goto_next_page();
        }, 5000);
    }

    // Dispatches a synthetic mouse click on the element matching
    // '#block-system-main .next a' and then calls fetch_names() again.
    function goto_next_page(){
        page.evaluate(function () {
            var a = document.querySelector('#block-system-main .next a');
            var e = document.createEvent('MouseEvents');
            e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
            // NOTE(review): `a` is not checked for null before use; if the
            // selector matches nothing this line throws.
            a.dispatchEvent(e);
            // Assignment to an undeclared variable; nothing in this script reads it.
            waitforload = true;

        });
        // NOTE(review): fetch_names() runs right after the click is dispatched,
        // with no delay in between, so the scrape does not wait for whatever the
        // click triggers. This ordering is the behaviour the question asks about.
        fetch_names();
    }

    // Start the cycle with the page that was just opened.
    fetch_names();
});

You can try it yourself to understand how it all works.

+4
source share
1 answer

You need to wait for the page to load after clicking, not before clicking. To do that, move setTimeout() from fetch_names to goto_next_page:

/**
 * Scrapes the current page and immediately moves on to the next one.
 *
 * Collects the `href` attribute of every element matching
 * 'div.pepitesteasermain h2 a', prints the values one per line, renders the
 * page to '1.png', and then calls goto_next_page().
 */
function fetch_names(){
    // Only the callback handed to page.evaluate() touches `document`;
    // the array it returns is what gets logged below.
    var hrefs = page.evaluate(function () {
        var anchors = document.querySelectorAll('div.pepitesteasermain h2 a');
        return Array.prototype.map.call(anchors, function (anchor) {
            return anchor.getAttribute('href');
        });
    });

    var listing = hrefs.join('\n');
    console.log(listing);

    // Constant file name: each call renders to the same path.
    page.render('1.png');

    // No delay here; the waiting is done after the click in goto_next_page().
    goto_next_page();
}

/**
 * Clicks the "next" pagination link and schedules the next scrape.
 *
 * Dispatches a synthetic mouse click on the element matching
 * '#block-system-main .next a'. If the click was dispatched, fetch_names() is
 * called 5000 ms later so the scrape happens after the click rather than
 * before it. If no such element exists (for example on the last page), or the
 * in-page code did not report a click, the script stops by calling
 * phantom.exit() instead of rescheduling itself forever.
 */
function goto_next_page(){
    // The callback runs against the page's `document`; its return value tells
    // the outer script whether there was a link to click.
    var clicked = page.evaluate(function () {
        var a = document.querySelector('#block-system-main .next a');
        if (!a) {
            // Nothing to click: calling dispatchEvent on null would throw.
            return false;
        }
        var e = document.createEvent('MouseEvents');
        e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        a.dispatchEvent(e);
        return true;
    });

    if (!clicked) {
        // No further page to visit, so end the PhantomJS process.
        phantom.exit();
        return;
    }

    // Fixed delay after the click before scraping again.
    window.setTimeout(function (){
        fetch_names();
    }, 5000);
}

, - , -.

+3

Source: https://habr.com/ru/post/1610502/


All Articles