Identifying whether two HTML pages are similar

I am trying to measure how much a given case differs from a base case. I am looking for a library that can tell me the similarity as a percentage, or something like that.

Example:

I have 10 different HTML pages. All of them are 404 responses that contain only two lines of random content (for example, the time or a quote of the day).

Now, when I submit a new 404 page, I need to get back a result such as "80% similar". However, if I submit a completely different page, or a page from the same site with completely different content, I should get something like "20% similar".

Basically, when I receive a new response, I want to determine whether it looks like the 10 pages that I supplied before.

I am trying to solve this in .NET, a library or recommendation algorithm would be great.

+3
source share
7 answers

Instead of using a diff tool, you can use a copy/paste detector (CPD). You can then adjust the threshold for how similar you want the files to be.

As an aside, I have used these tools in the past to track down cheaters at school.

Sam

+1
source

If you want a string-based solution, you can use k-grams: compute every substring of k consecutive characters for both files, then compute the Jaccard distance between the two resulting sets. This is the standard way to perform approximate queries in the database world.

, html (, ), xhtml ( java http://htmlcleaner.sourceforge.net/, .net, , env ), , , pq- (http://www.inf.unibz.it/~augsten/publ/tods10/ java-) (pq- k-).

, , , , k- pq-gram .

+1

diff .

LCS, .

0

diff .

, , .

diff , diff 5 , .

, 2kb . , , (, , 9 , ).

, diff , .

0

:

, . , , , , % . , , . , , . . , , . , (, ).

0

jqgram, PQ-Gram, , Node.js, #. , ... . .

https://github.com/hoonto/jqgram

DOM vs cheerio, , , . , . , , , . DOM DOM Cheerio vs Cheerio - HTML, Cheerio, , (Cheerio - jQuery- DOM- ).

, , Node.js javascript , , , #/. NET.

// This could probably be optimized significantly, but is a real-world
// example of how to use tree edit distance in the browser.
//
// It compares the <body> subtree of a cheerio-parsed HTML string against
// the <body> subtree of the live browser DOM and logs the distance that
// jqgram reports.

// For cheerio, you'll have to browserify,
// which requires some fiddling around
// due to cheerio's dynamically generated
// require (good grief) that browserify
// does not see due to the static nature
// of its code analysis (dynamic off-line
// analysis is hard, but doable).
//
// Ultimately, the goal is to end up with
// something like this in the browser:

var cheerio = require('./lib/cheerio');

// The easy part, jqgram:
var jq = require("../jqgram").jqgram;

// Make a cheerio DOM:
var html = '<body><div id="a"><div class="c d"><span>Irrelevent text</span></div></div></body>';

var cheeriodom = cheerio.load(html, {
    ignoreWhitespace: false,
    lowerCaseTags: true
});

// cheerio.load() hands back a selector function, not a node, so it has no
// name/attribs/children of its own. Select the <body> element node and use
// that as the tree root, mirroring the DOM side below.
var cheerioroot = cheeriodom('body')[0];

// For ease, let's assume you have jQuery loaded.
// $('body') is a jQuery wrapper object; index 0 is the underlying DOM
// element, which is what actually carries nodeName/attributes/children.
var realdom = $('body')[0];

/**
 * Split a class attribute value into individual class names.
 * Splits on any run of whitespace and drops empty entries, so repeated
 * or leading/trailing spaces do not produce empty-string labels.
 *
 * @param {string} value - raw value of a class attribute
 * @returns {string[]} the non-empty class names, in order
 */
function splitClassNames(value) {
    return value.split(/\s+/).filter(function(name){
        return name !== '';
    });
}

// The lfn and cfn functions allow you to specify
// how labels and children should be defined.
//
// Both cfn implementations below add class names and the id to a node's
// children as plain strings, so both lfn and cfn must also accept a string:
// a string is its own label and has no children.
// NOTE(review): this assumes jqgram calls lfn/cfn on every child that cfn
// returns - confirm against the jqgram implementation.
jq.distance({
    root: cheerioroot,
    lfn: function(node){
        if (typeof node === 'string') {
            return node;
        }
        // We don't have to lowercase this because we already
        // asked cheerio to do that for us above (lowerCaseTags).
        return node.name;
    },
    cfn: function(node){
        if (typeof node === 'string') {
            return [];
        }
        // Cheerio keeps attributes in the attribs object.
        // We're going to put id and classes in as children
        // of nodes in our cheerio tree.
        var retarr = [];
        var attribs = node.attribs || {};
        if (attribs.class) {
            retarr = retarr.concat(splitClassNames(attribs.class));
        }
        if (attribs.id) {
            retarr.push(attribs.id);
        }
        // Keep only children that have a tag name, so this tree lines up
        // with the DOM tree below, which walks the element-only
        // `children` collection. Presumably text/comment nodes in cheerio
        // carry no `name` - TODO confirm for the cheerio version in use.
        var kids = node.children || [];
        for (var i = 0; i < kids.length; ++i) {
            if (typeof kids[i].name === 'string') {
                retarr.push(kids[i]);
            }
        }
        return retarr;
    }
},{
    root: realdom,
    lfn: function(node){
        if (typeof node === 'string') {
            return node;
        }
        return node.nodeName.toLowerCase();
    },
    cfn: function(node){
        if (typeof node === 'string') {
            return [];
        }
        var retarr = [];
        if (!!node.attributes && !!node.attributes.class && !!node.attributes.class.nodeValue) {
            retarr = retarr.concat(splitClassNames(node.attributes.class.nodeValue));
        }
        if (!!node.attributes && !!node.attributes.id && !!node.attributes.id.nodeValue) {
            retarr.push(node.attributes.id.nodeValue);
        }
        // `children` is a live collection of child elements only.
        for (var i = 0; i < node.children.length; ++i) {
            retarr.push(node.children[i]);
        }
        return retarr;
    }
},{ p:2, q:3, depth:10 },
function(result) {
    console.log(result.distance);
});
0
source

Source: https://habr.com/ru/post/1697253/


All Articles