Improved sorting algorithm for pre-sorted pieces

I have a very large list of elements (several million) that are locally sorted in chunks of variable size.

abcb i hajklefgab l ...

I know the size, start and end position of each fragment in advance.

[abc] [b i hm] [ajkl] [efg] [a] [bl] ...

Is there a faster way to sort the list using an algorithm that can use boundary information?

+4
source share
6 answers

, , k-way/multiway:
k n.

, , -:

() , . log k ( ), n.
, rakwaht schnaader.

k-way

() , . , . , O(log k) n , , .

, , , .

, , 16- , , "" .

() , , ( , ), , .

+3

merge sort, . :

  • 2, ,

:

merge(chunk1[], chunk2[], chunk1_length, chunk2_length) {
  chunk1_pointer = chunk2_pointer = 0
  repeat the following:
    compare chunk1[chunk1_pointer] and chunk2[chunk2_pointer]
      same value: add both to the output, increase both pointers
      chunk1 value larger: add chunk2 value to the output, increase chunk2_pointer
      chunk2 value larger: add chunk1 value to the output, increase chunk1_pointer
    is one of the pointers at the end of the chunk?
      add the remaining elements of the other chunk to the output and exit
}
+1

, MergeSort , :

` :

, , . , , MergeSort, chunkcs : MergeSort Example

0

, .
, , :
1
2

0

Timsort (https://en.wikipedia.org/wiki/Timsort) . , . Python, Java , , . , , , .

, , ( ).

mergesort, , . . , , . O (n log k) (n - , k - ).

mergesort, , , . , - . , (- , , ).

0

C++.

template<typename T>
struct Triplet
{
    T elm;
    size_t listIdx;
    size_t elmIdx;

    Triplet(T e, size_t ldx, size_t eIdx) : elm(e), listIdx(ldx), elmIdx(eIdx) {}

    bool operator<(const Triplet &trp) const {
        return elm < trp.elm;
    }
    bool operator>(const Triplet &trp) const {
        return elm > trp.elm;
    }

};

template<typename T>
vector<T> mergeSortedLists(const vector<vector<T>> &vec)
{
    vector<T> ret;
    priority_queue<Triplet<T>, vector<Triplet<T>>, std::greater<Triplet<T>>> pq;
    for (size_t i = 0; i < vec.size(); i++) {
        if (vec[i].size() > 0) {
            pq.push({ vec[i][0], i, 0 });
        }
    }
    while (!pq.empty()) {
        Triplet<T> trp = pq.top();
        pq.pop();
        ret.push_back(trp.elm);
        if (trp.elmIdx < vec[trp.listIdx].size() - 1) {
            pq.push({ vec[trp.listIdx][trp.elmIdx + 1], trp.listIdx, trp.elmIdx + 1 });
        }
    }
    return ret;
}

: ( ). , : , , ind, . , , .

, . :

int main()
{
    vector<vector<int>> vint = { {1,3,5}, {2,4,6}, {3,5,7} };
    vector<int> resInt = mergeSortedLists(vint);
    vector<int> combinedInt = { 1,3,5,2,4,6,3,5,7 };
    std::sort(combinedInt.begin(), combinedInt.end());
    assert(resInt == combinedInt);

    vector<vector<string>> vstr = { {"abc"}, {"adef"}, {"ccfgt"}, {"aaabb"} };
    vector<string> resStr = mergeSortedLists(vstr);
    vector<string> combinedStr = { "abc", "adef", "ccfgt", "aaabb" };
    std::sort(combinedStr.begin(), combinedStr.end());
    assert(resStr == combinedStr);

    vector<vector<char>> vchar = { {'a', 'b', 'c'}, {'d', 'e' }, {'b', 'd', 'f'}, {'e','f','g','h'} };
    vector<char> resChar = mergeSortedLists(vchar);
    vector<char> combinedChar = { 'a', 'b', 'c', 'd', 'e' , 'b', 'd', 'f','e','f', 'g', 'h' };
    std::sort(combinedChar.begin(), combinedChar.end());
    assert(resChar == combinedChar);
}

, "", . , m , n. , " "

O(m * n) + O(m * n * log(m * n)) = O(m * n * log(m * n)) 

O (m * n * log(m))

<= m. O (m * n * log (m)) .

:

template<typename T>
vector<T> mergeSortedLists(const vector<vector<T>> &vec)
{
    vector<T> ret;
    priority_queue<Triplet<T>, vector<Triplet<T>>, std::greater<Triplet<T>>> pq;
    for (size_t i = 0; i < vec.size(); i++) {
        if (vec[i].size() > 0) {
            pq.push({ vec[i][0], i, 0 });
        }
    }
    while (!pq.empty()) {
        Triplet<T> trp = pq.top();
        pq.pop();
        ret.push_back(trp.elm);
        while ((trp.elmIdx < vec[trp.listIdx].size() - 1) && (vec[trp.listIdx][trp.elmIdx + 1] == ret.back())) {
            ret.push_back(vec[trp.listIdx][trp.elmIdx + 1]);
            trp.elmIdx++;
        }
        if (trp.elmIdx < vec[trp.listIdx].size() - 1) {
            pq.push({ vec[trp.listIdx][trp.elmIdx + 1], trp.listIdx, trp.elmIdx + 1 });
        }
    }
    return ret;
}

Windows ( MS Visual Studio) :

10 x 100000 (10 presorted vectors each of size 100000 ):
Brute Force algorithm: 0.0407124 s
My algorithm: 0.030523 s

50 x 100000 :
Brute Force algorithm: 0.237032 s
My algorithm: 0.217482 s

100 x 100000:
Brute Force algorithm: 0.422206 s
My algorithm: 0.356308 s

0 100000 .

0

Source: https://habr.com/ru/post/1683509/


All Articles