Find duplicate content in line?

How do you solve the following problem:

I have a semi-large file with text (about 10 pages), and I want to find duplicate content in this text. To be more specific, given the line, find the two longest lines that are identical.

I looked at the longest overall subsequence:

http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_subsequence

But these implementations take two lines as input.

Maybe the service is already doing this?

+3
source share
4 answers

( ) : Loop , 1. . , . . #:

    public static string FindDuplicateSubstring(string s)
    {
        for (int len = s.Length-1; len > 0; len--) {
            var dict = new Dictionary<string, int>();
            for (int i = 0; i <= s.Length - len; i++) {
                string sub = s.Substring(i, len);
                if (dict.ContainsKey(sub)) return sub;
                else dict[sub] = i;
            }
        }
        return null;
    }

, , - "". , , .. "bbbb" "bbb". , . . .

+2

" " . , "" " " "". , ?

, , . .

- , , LCS, . , . , . n, O (n), O (n) , O (n ^ 2), . ( , O (n ^ 3) O (n ^ 2).) (.. , "bbbb" "bb", "bbb",), , . #:

    public static string FindDuplicateSubstring(string s, 
                                                bool allowOverlap = false)
    {
        int matchPos = 0, maxLength = 0;
        for (int shift = 1; shift < s.Length; shift++) {
            int matchCount = 0;
            for (int i = 0; i < s.Length - shift; i++) {
                if (s[i] == s[i+shift]) {
                    matchCount++;
                    if (matchCount > maxLength) {
                        maxLength = matchCount;
                        matchPos = i-matchCount+1;
                    }
                    if (!allowOverlap && (matchCount == shift)) {
                        // we have found the largest allowable match 
                        // for this shift.
                        break;
                    }
                } else matchCount = 0;
            }
        }
        if (maxLength > 0) return s.Substring(matchPos, maxLength);
        else return null;
    }

, . 3000, 15 , 60 ( ).

+1

try it

public static string FindLargestDuplicateString(
        string text)
    {
        var largest = string.Empty;
        for (var i = '!'; i <= '}'; i++)
        {
            var l = FindLargestDuplicateStringImpl(
                text, i.ToString());
            if (l.Length > largest.Length)
                largest = l;
        }
        return largest;
    }

    private static string FindLargestDuplicateStringImpl(
        string text, string currentLargest)
    {
        bool found = false;
        for (var i = '!'; i <= '}'; i++)
        {
            var comp = currentLargest + i;
            var last = text.LastIndexOf(comp);
            var first = text.IndexOf(comp, 0);
            if (first == -1 || last == -1 || first == last) 
                continue;
            currentLargest = comp;
            found = true;
        }
        return !found ? currentLargest : 
            FindLargestDuplicateStringImpl(text, currentLargest);
    }
0
source

You can do something like this

public static ArrayList<String> split(String line){
    line+=" ";
    Pattern pattern = Pattern.compile("\\w*\\s");
    Matcher matcher = pattern.matcher(line);
    ArrayList<String> list = new ArrayList<String>();
    while (matcher.find()){
        list.add(matcher.group());
    }
    return list;
}

be sure to remove the punctuation marks

public static void duplicatedWords(String s, int n){
    ArrayList<String> splitted = split(s); 
    System.out.println(splitted);
    HashMap<String, Integer> map = new HashMap<String, Integer>();
    PriorityQueue<String> pq = new PriorityQueue<String>(splitted.size(), new myComp());
    for(int i = 0; i<splitted.size(); i++){
        if(map.get(splitted.get(i)) == null){
            map.put(splitted.get(i), 1);
        }
        else if(map.get(splitted.get(i)) == 1) {
            map.put(splitted.get(i), map.get(splitted.get(i))+1);
            pq.add(splitted.get(i));
        }
    }
    int size = pq.size();
    for(int i = 0; i<size; i++){
        if(i <n)
            System.out.println(pq.remove());
        else 
            break;
    }
}

Using this comparator:

public static class myComp implements Comparator{
    @Override
    public int compare(Object arg0, Object arg1) {
        String s1 = (String)arg0;
        String s2 = (String)arg1;
        return s2.length()-s1.length();
    }

}
0
source

Source: https://habr.com/ru/post/1775812/


All Articles