Search for a template in a set

Question

Search for a template in a set

What algorithms can I use to determine common characters in a set of strings?

To keep the example simple, I only care about runs of 2+ consecutive characters that show up in two or more of the samples. For instance:

0000abcde0000
0000abcd00000
000abc0000000
00abc000de000

I'd like to know:

00 was used in 1,2,3,4
000 was used in 1,2,3,4
0000 was used in 1,2,3
00000 was used in 2,3
ab was used in 1,2,3,4
abc was used in 1,2,3,4
abcd was used in 1,2
bc was used in 1,2,3,4
bcd was used in 1,2
cd was used in 1,2
de was used in 1,4

+3

algorithm design-patterns data-mining

Jason Nov 06 '08 at 10:18

source share

7

, , NP-hard. . , Smith-Waterman (= ) . .

+2

Konrad Rudolph 06 . '08 22:26

, . node "", ( ). N , N - , (, char, N )

, ( , ).

: , , , , , , , .

abc
abd
abde
acc
bde

a : 4
  b : 3
    c : 1
    d : 2
      e : 1
  c : 1
    c : 1
b : 4
  d : 3
    e : 2
  c : 1
c : 3
  c : 1
d : 3
  e : 2

+2

BCS 06 . '08 22:47

"", ? , ?

- - , ( ), , .

+1

LarryF 06 . '08 22:26

" " . " , " . , , wikipedia , 205 : " , k ".

+1

florin 07 . '08 0:43

. ( ) .

0

nlucaroni 06 . '08 22:24

You may find a suffix array simpler and more efficient than a suffix tree, depending on how frequently common substrings occur in your data - if they are frequent enough, you will need a more sophisticated algorithm for constructing the suffix array. (The naive method is to simply use the library sort function.)

0

Darius bacon Nov 07 '08 at 5:15

source share

joel.neely · Accepted Answer · 2008-11-07T00:25:01+0000

, . ( , !) -

I believe this runs in O(m**2 * n) time, where m is the length of the longest string and n is the number of strings.

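The Occurrence class holds a substring together with the set of indices of the strings in which it occurs. The commonOccurrences method walks the array of strings, calling captureOccurrences for each non-null string. The captureOccurrences method looks up (or creates) an Occurrence for every substring of at least the minimum length and records the index of the current string in it. Finally, commonOccurrences builds the result set by keeping only those Occurrences that have at least two indices.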

, , . , "00ab" . (, , ..) - , , .; -)

JAVA:

package com.stackoverflow.answers;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Finds every substring of at least {@link #MINIMUM_SUBSTRING_LENGTH}
 * characters that occurs in two or more strings of a given array, and reports
 * the (zero-based) indices of the strings that contain it.
 */
public class CommonSubstringFinder {

    /** Shortest substring length that is considered. */
    public static final int MINIMUM_SUBSTRING_LENGTH = 2;

    /**
     * A substring together with the sorted set of indices of the input strings
     * in which it was seen. Occurrences are ordered by their substring value.
     */
    public static class Occurrence implements Comparable<Occurrence> {
        private final String value;
        private final Set<Integer> indices;

        /**
         * @param value the substring; {@code null} is stored as the empty string
         */
        public Occurrence(String value) {
            this.value = value == null ? "" : value;
            indices = new TreeSet<Integer>();
        }

        /** @return the substring this occurrence describes (never {@code null}) */
        public String getValue() {
            return value;
        }

        /** @return a read-only, ascending view of the recorded string indices */
        public Set<Integer> getIndices() {
            return Collections.unmodifiableSet(indices);
        }

        /**
         * Records that the substring was seen in the string at {@code index}.
         * Recording the same index more than once has no additional effect.
         */
        public void occur(int index) {
            indices.add(index);
        }

        /** Formats as {@code "value": i,j,k} with indices in ascending order. */
        @Override
        public String toString() {
            StringBuilder result = new StringBuilder();
            result.append('"').append(value).append('"');
            // The first index is preceded by ": ", every later one by ",".
            String separator = ": ";
            for (Integer i : indices) {
                result.append(separator).append(i);
                separator = ",";
            }
            return result.toString();
        }

        /** Orders occurrences lexicographically by substring value only. */
        @Override
        public int compareTo(Occurrence that) {
            return this.value.compareTo(that.value);
        }
    }

    /**
     * Collects all substrings (of at least {@link #MINIMUM_SUBSTRING_LENGTH}
     * characters) that appear in more than one of the given strings.
     *
     * @param strings the strings to examine; may be {@code null}, and may
     *        contain {@code null} elements, which are skipped but still
     *        consume their index position
     * @return the common substrings sorted by value, each carrying the indices
     *         of the strings that contain it; empty if there are none
     */
    public static Set<Occurrence> commonOccurrences(String[] strings) {
        Map<String,Occurrence> work = new HashMap<String,Occurrence>();
        if (strings != null) {
            int index = 0;
            for (String string : strings) {
                if (string != null) {
                    captureOccurrences(index, work, string);
                }
                ++index;
            }
        }
        Set<Occurrence> result = new TreeSet<Occurrence>();
        for (Occurrence occurrence : work.values()) {
            // Keep only substrings seen in at least two distinct strings.
            if (occurrence.getIndices().size() > 1) {
                result.add(occurrence);
            }
        }
        return result;
    }

    /**
     * Records, for every substring of {@code string} that is at least
     * {@link #MINIMUM_SUBSTRING_LENGTH} characters long, that it occurs in the
     * string at position {@code index}.
     *
     * @param index  position of {@code string} in the caller's array
     * @param work   accumulator mapping each substring to its occurrence
     * @param string the string to decompose; must not be {@code null}
     */
    private static void captureOccurrences(int index, Map<String,Occurrence> work, String string) {
        final int maxLength = string.length();
        for (int i = 0; i < maxLength; ++i) {
            // substring's end index is exclusive, so j must be allowed to reach
            // maxLength; otherwise substrings ending at the last character
            // would never be captured.
            for (int j = i + MINIMUM_SUBSTRING_LENGTH; j <= maxLength; ++j) {
                String partial = string.substring(i, j);
                Occurrence current = work.get(partial);
                if (current == null) {
                    current = new Occurrence(partial);
                    work.put(partial, current);
                }
                current.occur(index);
            }
        }
    }

    private static final String[] TEST_DATA = {
        "0000abcde0000",
        "0000abcd00000",
        "000abc0000000",
        "00abc000de000",
    };

    /** Prints the common substrings of the built-in sample data, one per line. */
    public static void main(String[] args) {
        Set<Occurrence> found = commonOccurrences(TEST_DATA);
        for (Occurrence occurrence : found) {
            System.out.println(occurrence);
        }
    }

}

SAMPLE OUTPUT: ( , , )

"00": 0,1,2,3 "000": 0,1,2,3
"0000": 0,1,2 "0000a": 0,1
"0000ab": 0,1 "0000abc": 0,1
"0000abcd": 0,1 "000a": 0,1,2
"000ab": 0,1,2 "000abc": 0,1,2
"000abcd": 0,1 "00a": 0,1,2,3
"00ab": 0,1,2,3 "00abc": 0,1,2,3
"00abc0": 2,3 "00abc00": 2,3
"00abc000": 2,3 "00abcd": 0,1
"0a": 0,1,2,3 "0ab": 0,1,2,3
"0abc": 0,1,2,3 "0abc0": 2,3
"0abc00": 2,3 "0abc000": 2,3
"0abcd": 0,1 "ab": 0,1,2,3 "abc": 0,1,2,3 "abc0": 2,3 "abc00": 2,3
"abc000": 2,3 "abcd": 0,1 "bc": 0,1,2,3 "bc0": 2,3 "bc00": 2,3
"bc000": 2,3 "bcd": 0,1 "c0": 2,3 "c00": 2,3 "c000": 2,3 "cd": 0,1
"de": 0,3 "de0": 0,3 "de00": 0,3
"e0": 0,3 "e00": 0,3

Search for a template in a set

More articles: