How do I return the shortest snippet from a keyword search?

I have the following task:

Given a content page with alphanumeric words and a search phrase of N words, write an algorithm that returns the shortest piece of content containing all N words in any order.

This is what I still have. I got the text on the page into an array without punctuation marks:

var allText = $('#all-text').text();
var punctuationless = allText.replace(/[^A-Za-z0-9_]/g," ").toLowerCase();
var lessSpaces = punctuationless.replace(/\s{2,}/g," ");
var allTextArray = lessSpaces.split(" ");
var keywords = [];

I think I can probably use the .filter method, but I'm not sure how to compare two arrays.

allTextArray.filter(keywords)
//find the indexes of the matches to the keywords
//compare how far apart the indexes are to each other and return the shortest length
+4
source share
2 answers

So, to list some examples, as I understand OP, if the text of the page:

One for the money,
two for the show.
  • Search terms "One", "money"should produce "One for the money".
  • "One","two","for" "One for the money, two"
  • "for","show" "for the show"

    • ... for the money, two for the show

, , , .

jsfiddle, , ( , , , ).

, , , , :

  • .
  • , , .
  • () , - , , .

, , , , .

, , , , OP .

( ) ( , ) , indexOf .

getSnippet = function(keywords, fullText) {
  var keywordCount = keywords.length,
      keywordIndexes = [];

  // Find each occurrence of every word
  for(var i=0; i < keywordCount; i++) {
    var searchPos = 0;
    var word = keywords[i];
    var index = -1;
    do {
      index = fullText.indexOf(keywords[i],searchPos);
      if (index >= 0) {
        keywordIndexes.push({i:index, word:word});
      }
      searchPos = index + 1;
    } while (index >= 0);
  }

  keywordIndexes.sort(function(a, b) { return a.i == b.i ? 0 : a.i < b.i ? -1 : 1; });

  // Find the shortest run by starting at each array index and scanning to the
  // right until we have encountered each word in the list.
  for (i=0, n=keywordIndexes.length-keywordCount; i<=n; i++) {
    // NOTE: We actually can actually stop once there are fewer keyword
    // indexes than keywords, since we know we won't find all the keywords (hence the subtraction of keywordCount)
    var foundWords = {},
        foundCount = 0;
    snippetStart = keywordIndexes[i].i;

    for (j=i; j < keywordIndexes.length; j++) {
      var word = keywordIndexes[j].word;
      if (!foundWords[word]) {
        foundWords[word] = true;
        foundCount++;
      }
      if (foundCount == keywordCount) {
        // We've found all the words
        snippetEnd = keywordIndexes[j].i + word.length;
        if (minSnippet.end - minSnippet.start > snippetEnd - snippetStart) {
          minSnippet.end = snippetEnd;
          minSnippet.start = snippetStart;
        }
        break;
      }
    }
  }
  return fullText.substring(minSnippet.start, minSnippet.end);
}

. jsfiddle.

+4

, q ,

[
    {
        "word": "bridge",
        "pos": 46
    },
    {
        "word": "city",
        "pos": 65
    },
    {
        "word": "bridge",
        "pos": 155
    },
    ...
]

, qq. . .

[
    {
        "start": 155,
        "end": 181
    },
    {
        "start": 177,
        "end": 220
    },
    ...
]

, toLowerCase.

function getWords() {
    var text = document.getElementById('text').textContent,
        lowerText = text.toLowerCase(),
        wordsO = {},
        words = document.getElementById('words').value.split(' ').map(function (w) { wordsO[w.toLowerCase()] = true; return w.toLowerCase(); }),
        pattern = /\w+/g,
        execResult,
        result = [],
        q = [], qq = [], i, max;

    while ((execResult = pattern.exec(lowerText)) !== null) {
        wordsO[execResult[0]] && q.push({ word: execResult[0], pos: execResult.index });
    }

    for (i = 0; i < q.length - words.length + 1; i++) {
        max = words.reduce(function (r, w) {
            var j = i;
            while (j < q.length && q[j].word !== w) {
                j++;
            }
            return !~r || j === q.length ? -1 : j > r ? j : r;
        }, i);
        ~max && qq.push({ start: q[i].pos, end: q[max].pos + q[max].word.length });
    }

    qq.sort(function (a, b) { return a.end - a.start - b.end + b.start; });

    qq.every(function (a, _, aa) {
        return aa[0].end - aa[0].start === a.end - a.start && result.push(text.substring(a.start, a.end) + ' [' + a.start + ']');
    });

    document.getElementById('result').innerHTML = result.length && '<ul>' + result.map(function (a) { return '<li>' + a + '</li>'; }).join('') + '</ul>' || '### no match found ###';
}
<form onsubmit="return false;">
    <textarea rows="5" cols="65" id="text">001 002 003 004 001 002 The George Washington Bridge in New York City is one of the oldest bridges ever constructed. It is now being remodeled because the bridge is a landmark. City officials say that the landmark bridge effort will create a lot of new jobs in the city.</textarea><br />
    <input type="text" id="words" value="Landmark City Bridge" /> <button onclick="getWords()">search</button>
</form>
Found search: <br />
<div id="result"></div>
Hide result
+2

Source: https://habr.com/ru/post/1629619/


All Articles