Href is missing when I get a page using jsoup or htmlunit

I am trying to parse a Google image search result page.

I am trying to get the href attribute of an element. I noticed that the href field is missing when I get the page programmatically (this happens with both jsoup and htmlunit).
Comparing the element retrieved programmatically via Java with the same element loaded by a real browser, the only difference is that the href attribute is missing (everything else is identical).

The href attribute (IMAGE_LINK) is as follows: /imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=638&biw=1275&ved=0ahUKEwih9O2e88_OAhWMExoKHRLGAGQQMwg2KAMwAw&iact=mrc&uact=8

Could this be a problem with the JavaScript engine, or some kind of anti-scraping measure used by the site?

Java code snippet:

// Fetches URL with HtmlUnit (emulating Chrome), lets the page's background
// JavaScript run, and prints the resulting DOM as XML.
WebClient webClient = new WebClient(BrowserVersion.CHROME);
HtmlPage page1 = null;

try {
    // Get the first page
    page1 = webClient.getPage(URL);

    // Must be called AFTER the page has been loaded: it blocks until the
    // background JavaScript jobs that exist at call time have finished (or
    // the timeout in milliseconds expires). Calling it before getPage(),
    // as the original snippet did, waits for nothing.
    webClient.waitForBackgroundJavaScript(50000);

    System.out.println(page1.asXml());
} catch (FailingHttpStatusCodeException | IOException e) {
    // MalformedURLException is a subclass of IOException, so it is handled here too.
    e.printStackTrace();
}

HTML snippet (real browser):

<a jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l" style="width: 134px; height: 201px; left: 0px; background: rgb(128, 128, 128);" href="IMAGE_LINK"> CONTENT... </a>

HTML snippet (page retrieved programmatically):

<a jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" class="rg_l" style="width: 134px; height: 201px; left: 0px; background: rgb(128, 128, 128);"> CONTENT... </a>

Thanks.

+1
source share
1 answer

For each search result there is a <div class="rg_meta"> containing a JSON object that also holds the image URL. Using a JSON parser such as json-simple to parse that object, the following code prints the image URLs:

// Fetches a Google image search page with jsoup and prints the original
// image URL ("ou") stored in the JSON payload of every div.rg_meta element.
String searchTerm = "naruto shippuden";

try {
    // URL-encode the query instead of only replacing spaces, so that terms
    // containing '&', '#', '+' or non-ASCII characters do not break the URL.
    // Spaces are still encoded as '+'. The UnsupportedEncodingException that
    // encode() declares is an IOException and is handled by the catch below.
    String searchUrl = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&biw=1920&bih=955&q="
            + java.net.URLEncoder.encode(searchTerm, "UTF-8") + "&gws_rd=cr";

    Document doc = Jsoup.connect(searchUrl)
            .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36")
            .referrer("https://www.google.com/").get();

    // One parser is enough; it is reused for every result.
    JSONParser parser = new JSONParser();

    for (Element result : doc.select("div.rg_meta")) {

        // div.rg_meta contains a JSON object, which also holds the image url
        JSONObject obj = (JSONObject) parser.parse(result.text());

        String imageUrl = (String) obj.get("ou");

        // just printing out the url to demonstrate the approach
        System.out.println("imageUrl: " + imageUrl);
    }

} catch (IOException | ParseException e) {
    e.printStackTrace();
}

Output:

imageUrl: http://ib3.huluim.com/show_key_art/1603?size=1600x600&region=US
imageUrl: http://cdn.zonarutoppuden.com/ns/peliculas-naruto-shippuden.jpg
imageUrl: http://www.saiyanisland.com/news/wp-content/uploads2/2014/12/Naruto-Sasuke.jpg
...

Update

Since jsaction does not seem to work well with HtmlUnit, I would suggest using PhantomJS. Just download the binary for your OS and create a script file.

page.js:

// page.js - PhantomJS script (run with the phantomjs binary, not Node; kept
// in ES5 syntax for that reason).
//
// Opens a Google image search, scrolls down one viewport height every 500 ms
// so that further results are loaded, clicks the "#smb" element whenever
// scrolling yields nothing new (presumably the "show more results" button),
// and once the last result has stayed the same for several polls writes the
// rendered DOM to output.html and exits.
var page = require('webpage').create();
var fs = require('fs');

page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

// NOTE(review): presumably zoomed out so that each viewport holds more results - confirm.
page.zoomFactor = 0.1;

page.viewportSize = {
  width: 1920,
  height: 1080
};

var divCount = "-1";       // data-ri of the last result seen on the previous poll
var topPosition = 0;       // current vertical scroll offset in pixels
var unchangedCounter = 0;  // consecutive polls in which the last result did not change

page.open('https://www.google.com/search?site=imghp&tbm=isch&source=hp&q=naruto+shippuden&gws_rd=cr', function(status) {
    console.log("Status: " + status);
    if (status !== "success") {
        // Without an explicit exit PhantomJS keeps running forever, which
        // would also block any parent process waiting for it.
        phantom.exit(1);
        return;
    }

    window.setInterval(function() {

        // data-ri attribute of the last result currently in the DOM, or null
        // when there is no result at all.
        var newDivCount = page.evaluate(function() {
            var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
            if (divs.length === 0) {
                // divs[-1] is undefined; calling getAttribute on it would throw.
                return null;
            }
            return divs[divs.length - 1].getAttribute("data-ri");
        });

        topPosition = topPosition + 1080;

        page.scrollPosition = {
            top: topPosition,
            left: 0
        };

        if (newDivCount === divCount) {
            page.evaluate(function() {
                var button = document.querySelector("#smb");
                // querySelector returns null (never undefined) when nothing
                // matches, so the former typeof check always passed and
                // click() was invoked on null.
                if (button !== null) {
                    button.click();
                    return true;
                }
                return false;
            });

            if (unchangedCounter === 5) {
                console.log(newDivCount);
                var path = 'output.html';
                fs.write(path, page.content, 'w');
                phantom.exit();
            } else {
                unchangedCounter = unchangedCounter + 1;
            }
        } else {
            unchangedCounter = 0;
        }
        divCount = newDivCount;

    }, 500);
});

Running the PhantomJS script from Java and parsing the result with jsoup:

// Runs page.js with PhantomJS, waits for it to finish, then parses the
// output.html it wrote and prints the href of every result link.
try {
    // change path to phantomjs binary and your script file
    Process process = new ProcessBuilder("bin\\phantomjs", "page.js")
            // Forward the child's stdout/stderr to this process. With
            // Runtime.exec() the output stayed in an unread pipe, and
            // waitFor() can block forever once that pipe's buffer is full.
            .inheritIO()
            .start();

    int exitCode = process.waitFor();
    if (exitCode != 0) {
        throw new IOException("phantomjs exited with code " + exitCode);
    }

    Document doc = Jsoup.parse(new File("output.html"), "UTF-8"); // output.html is created by phantom.js, same path as page.js

    // Count while iterating instead of running the same selector twice.
    int resultCount = 0;
    for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
        System.out.println(element.attr("href"));
        resultCount++;
    }
    System.out.println("Number of results: " + resultCount);
} catch (IOException e) {
    e.printStackTrace();
} catch (InterruptedException e) {
    // Restore the interrupt flag so callers can still observe the interruption.
    Thread.currentThread().interrupt();
    e.printStackTrace();
}

Output:

/imgres?imgurl=http%3A%2F%2Fib3.huluim.com%2Fshow_key_art%2F1603%3Fsize%3D1600x600%26region%3DUS&imgrefurl=http%3A%2F%2Fwww.hulu.com%2Fnaruto-shippuden&docid=OgW4j66rp7CKkM&tbnid=SElXvYDJj9cR6M%3A&w=1600&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwgzKAAwAA&iact=mrc&uact=8
/imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwg0KAEwAQ&iact=mrc&uact=8
...
Number of results: 463

Update: passing the URL to the script as an argument

Script page.js

// page.js - PhantomJS script (run with the phantomjs binary, not Node; kept
// in ES5 syntax for that reason).
//
// Usage: phantomjs page.js <url> <searchParameter>
//
// Opens <url>, scrolls down one viewport height every 500 ms so that further
// results are loaded, clicks the "#smb" element whenever scrolling yields
// nothing new (presumably the "show more results" button), and once the last
// result has stayed the same for several polls writes the rendered DOM to
// "<searchParameter>.html" and exits.
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');

var url = "";
var searchParameter = "";

// system.args[0] is the script name, followed by the two expected arguments.
if (system.args.length === 3) {
    url = system.args[1];
    searchParameter = system.args[2];
}

page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

// NOTE(review): presumably zoomed out so that each viewport holds more results - confirm.
page.zoomFactor = 0.1;

page.viewportSize = {
  width: 1920,
  height: 1080
};

var divCount = "-1";       // data-ri of the last result seen on the previous poll
var topPosition = 0;       // current vertical scroll offset in pixels
var unchangedCounter = 0;  // consecutive polls in which the last result did not change

// One polling step: scroll further down, detect whether new results showed
// up, and finish once nothing has changed for several polls in a row.
function pollResults() {

    // data-ri attribute of the last result currently in the DOM, or null
    // when there is no result at all.
    var newDivCount = page.evaluate(function() {
        var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
        if (divs.length === 0) {
            // divs[-1] is undefined; calling getAttribute on it would throw.
            return null;
        }
        return divs[divs.length - 1].getAttribute("data-ri");
    });

    topPosition = topPosition + 1080;

    page.scrollPosition = {
        top: topPosition,
        left: 0
    };

    if (newDivCount === divCount) {
        page.evaluate(function() {
            var button = document.querySelector("#smb");
            // querySelector returns null (never undefined) when nothing
            // matches, so the former typeof check always passed and
            // click() was invoked on null.
            if (button !== null) {
                button.click();
                return true;
            }
            return false;
        });

        if (unchangedCounter === 5) {
            var path = searchParameter + '.html';
            fs.write(path, page.content, 'w');
            phantom.exit();
        } else {
            unchangedCounter = unchangedCounter + 1;
        }
    } else {
        unchangedCounter = 0;
    }
    divCount = newDivCount;
}

if (url === "" || searchParameter === "") {
    // Only open the page in the else branch: phantom.exit() does not
    // necessarily stop the current script right away, so code placed after
    // it would still run with an empty URL.
    console.log("Usage: phantomjs page.js <url> <searchParameter>");
    phantom.exit(1);
} else {
    page.open(url, function(status) {
        console.log("Status: " + status);
        if (status === "success") {
            window.setInterval(pollResults, 500);
        } else {
            phantom.exit(1);
        }
    });
}

Java code:

// Runs page.js with PhantomJS, passing the search URL and the search term as
// arguments, waits for it to finish, then parses "<searchTerm>.html" written
// by the script and prints the href of every result link.
try {
    //change path to phantomjs binary and your script file
    String phantomJSPath = "phantomjs" + File.separator + "bin" + File.separator + "phantomjs";
    String scriptFile = "page.js";

    String searchTerm = "naruto+shippuden";
    String urlParameter = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&gws_rd=cr&q=" + searchTerm;

    // Pass each argument separately. Runtime.exec(String) splits the command
    // line on whitespace, so a path, URL or search term containing a space
    // would be broken into several arguments.
    Process process = new ProcessBuilder(phantomJSPath, scriptFile, urlParameter, searchTerm)
            // Forward the child's stdout/stderr to this process. An unread
            // pipe can make waitFor() block forever once its buffer is full.
            .inheritIO()
            .start();

    int exitCode = process.waitFor();
    if (exitCode != 0) {
        throw new IOException("phantomjs exited with code " + exitCode);
    }

    // <searchTerm>.html is created by page.js in the working directory
    Document doc = Jsoup.parse(new File(searchTerm + ".html"), "UTF-8");

    // Count while iterating instead of running the same selector twice.
    int resultCount = 0;
    for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
        System.out.println(element.attr("href"));
        resultCount++;
    }
    System.out.println("Number of results: " + resultCount);
} catch (IOException e) {
    e.printStackTrace();
} catch (InterruptedException e) {
    // Restore the interrupt flag so callers can still observe the interruption.
    Thread.currentThread().interrupt();
    e.printStackTrace();
}
+2

Source: https://habr.com/ru/post/1666080/


All Articles