Largest word intersection algorithm in a multitude of words

History

I create a voice application using x-webkit-speech , which is surprisingly good (a function, not my application), but sometimes the user (I) mumbles a little. It would be nice to accept a command if some reasonable part of the word corresponds to some reasonable part of some reasonable command. Therefore, I am looking for the holy grail called the Greatest Crossing Algorithm in the Code of Words. Can some fresh bright mind drive me out of a cave of despair?

Example

 "rotation" in ["notable","tattoo","onclick","statistically"] 

must match the tattoo, as it has the longest intersection with rotation ( tat_o ). statistically is the second best ( tati ), because most of the word needs to be ignored (but this is a bonus condition, it would be acceptable without it).

Notes

  • I use Czech, where the pronunciation is very close to its writing.
  • javascript is the preferred language, but any pseudo-code is acceptable.
  • minimum intersection length should be an algorithm parameter

What have i tried?

Well, that is pretty confusing ....

 for(var i=10; i>=4; --i) // reasonable substring for(var word in words) // for all words in the set for(var j=0; j<word.length-i; ++j) // search for any i substring // aaargh... three levels of abstraction is too much for me 
+4
source share
3 answers

This is an algorithm that seems to work. I have no idea how well it works compared to other already installed algorithms (I suspect it is worse), but perhaps this gives you an idea of ​​how you could do this:

Fiddle

 var minInt = 3; var arr = ["notable","tattoo","onclick","statistically"]; var word = "rotation"; var res = []; if (word.length >= minInt) { for (var i = 0; i < arr.length; i++) { var comp = arr[i]; var m = 0; if (comp.length >= minInt) { for (var l = 0; l < comp.length - minInt + word.length - minInt + 1; l++) { var subcomp = l > word.length - minInt ? comp.substring(l - word.length + minInt) : comp; var subword = l < word.length - minInt ? word.substring(word.length - minInt - l) : word; var minL = Math.min(subcomp.length, subword.length); var matches = 0; for (var k = 0; k < minL; k++) { if (subcomp[k] === subword[k]) { matches++; } } if (matches > m) { m = matches; } } } res[i] = m >= minInt ? m : null; } } console.log(res); 

What happens is that it compares two lines, "moving" against each other and calculates matching letters in each position. Here you see the compared words "under" for rotation vs. notable rotation vs. notable :

 ion / notable --> one match on index 1 tion / notable --> no match ation / notable --> no match tation / notable --> one match on index 2 otation / notable --> no match rotation / notable --> three matches on index 1,2,3 rotation / otable --> no match rotation / table --> no match rotation / able --> no match rotation / ble --> no match 

As you can see, the maximum number of matches is 3, and this is what it will return.

+2
source

This is where the implementation of the Levenshtein distance calculator in Javascript is implemented.

It returns an object containing the appropriate command and distance.

 var commandArr = ["cat", "dog", "fish", "copy", "delete"] var testCommand = "bopy"; function closestMatch(str, arr) { //console.log("match called"); var matchDist = []; var min, pos; for(var i=0; i<arr.length; i++) { matchDist[i]=calcLevDist(str, arr[i]); console.log("Testing "+ str + " against " + arr[i]); } //http://stackoverflow.com/questions/5442109/how-to-get-the-min-elements-inside-an-array-in-javascript min = Math.min.apply(null,matchDist); pos = matchDist.indexOf(min); var output = { match : arr[pos], distance : matchDist[pos] }; return output; } function calcLevDist (str1, str2) { //console.log("calc running"); var cost = 0 , len1, len2; var x = 1; while(x > 0) { len1 = str1.length; console.log("Length of String 1 = " + len1); len2 = str2.length; console.log("Length of String 2 = " + len2); if(len1 == 0) { cost+= len2; return cost; } if(len2 == 0) { cost+= len1; return cost; } x = Math.min(len1,len2); if(str1.charAt(len1 -1) != str2.charAt(len2 -1)) { cost++; } else console.log(str1.charAt(len1-1) + " matches " + str2.charAt(len2-1)); str1 = str1.substring(0, len1 -1 ); str2 = str2.substring(0, len2 -1 ); console.log("Current Cost = " + cost); } } var matchObj = closestMatch(testCommand, commandArr); var match = matchObj["match"]; var dist = matchObj["distance"]; $("#result").html("Closest match to " + testCommand + " = " + match + " with a Lev Distance of " + dist + "." ) 

You can do with the violin here .

+1
source

Thanks to Basilicum and Jason Nichols, as well as Mike and Andrew for comments, it really helped me finish the algorithm. I come up with my solution for brute force O(n^3) if someone comes across this issue with the same problem.

Anyone is invited to play the violin to improve it.

Algorithm

 /** * Fuzzy match for word in array of strings with given accurancy * @param string needle word to search * @param int accurancy minimum matching characters * @param array haystack array of strings to examine * @return string matching word or undefined if none is found */ function fuzzyMatch(needle,accurancy,haystack) { function strcmpshift(a,b,shift) { var match=0, len=Math.min(a.length,b.length); for(var i in a) if(a[i]==b[+i+shift]) ++match; return match; } function strcmp(a,b) { for(var i=0,max=0,now; i<b.length; ++i) { now = strcmpshift(a,b,i); if(now>max) max = now; } return max; } var word,best=accurancy-1,step,item; for(var i in haystack) { item = haystack[i]; step = Math.max(strcmp(item,needle),strcmp(needle,item)); if(step<=best) continue; best=step, word=item; }; return word; } 

Example

 var word = "rotation"; var commands = ["notable","tattoo","onclick","statistically"]; // find the closest command with at least 3 matching characters var command = fuzzyMatch(word,3,commands); alert(command); // tattoo 
0
source

Source: https://habr.com/ru/post/1483643/


All Articles