Is there a generalization of std :: bitset for two-bit values?

Suppose I am a genome scientist trying to store extremely long lines of characters, each of which represents two bits of information (that is, each element is either G, A, T, or C). Since the strings are incredibly long, I need to be able to store a string of length N exactly 2N bits (more precisely, N / 4 bytes).

Given this motivation, I am looking for a generalization std::bitset(or boost::dynamic_bitset<>) that works on two-bit values ​​instead of single-bit values. I want to save Nsuch two-bit values, each of which can be 0, 1, 2 or 3. I need data that is stored as close as possible in the memory, so vector<char>it won’t work (since it leaves the memory coefficient 4).

What is the best way to achieve my goal? One option is to wrap existing bit patterns with custom operator[], iterators, etc., but I would rather use an existing library, if at all possible.

+4
source share
3 answers

You have two options.

Given:

enum class nucleobase { a, c, g, t };

You have two options. You can:

  • use one std::bitsetand play with indexing
  • use std::bitsetin combination with another container

First, you can simply define a couple of functions that target the correct number of bits in the / get set:

template<std::size_t N>
void set(std::bitset<N>& bits, std::size_t i, nucleobase x) {
    switch (x) {
        case nucleobase::a: bits.set(i * 2, 0); bits.set(i * 2 + 1, 0); break;
        case nucleobase::c: bits.set(i * 2, 0); bits.set(i * 2 + 1, 1); break;
        case nucleobase::g: bits.set(i * 2, 1); bits.set(i * 2 + 1, 0); break;
        case nucleobase::t: bits.set(i * 2, 1); bits.set(i * 2 + 1, 1); break;
    }
}

template<std::size_t N>
nucleobase get(const std::bitset<N>& bits, std::size_t i) {
    if (!bits[i * 2])
        if (!bits[i * 2 + 1]) return nucleobase::a;
        else                  return nucleobase::c;
    else
        if (!bits[i * 2 + 1]) return nucleobase::g;
        else                  return nucleobase::t;
}

Live demo

This is just an example and terrible (it's almost 4AM here, and I really need to sleep).

For the second, you just need to match the alleles and bits:

bit_pair bits_for(nucleobase x) {
    switch (x) {
        case nucleobase::a: return bit_pair("00"); break;
        case nucleobase::c: return bit_pair("10"); break;
        case nucleobase::g: return bit_pair("01"); break;
        case nucleobase::t: return bit_pair("11"); break;
    }
}

nucleobase nucleobase_for(bit_pair x) {
    switch (x.to_ulong()) {
        case 0: return nucleobase::a; break;
        case 1: return nucleobase::c; break;
        case 2: return nucleobase::g; break;
        case 3: return nucleobase::t; break;
        default: return nucleobase::a; break; // just for the warning
    }
}

Live demo

, , boost::dynamic_bitset std::vector.

+1

std::bitset<> - , , , .

, std::vector<bool>.

, std::vector<bool> , , . - .

, API ; .

@Jefffrey , bitset<>.

[ boost::dynamic_bitset<> vector.]

, , , char.

class Genome
{
public:
    enum class Letter {A,C,G,T};
    Genome(const std::string& source)
    {
        code_.resize(source.size() * 2);
        for (unsigned index = 0; index != source.size(); ++index)
        {
            char text = source[index];
            Letter letter = textToLetter(text);
            set(index, letter);
        }
    }  
    static Letter textToLetter(char text)
    {
        // Or search through the array `letterText`.
        // Or come up with a neat but unintelligible one liner ...
        Letter letter = Letter::A;
        switch (text)
        {
        case 'A':
            letter = Letter::A;
            break;
        case 'C':
            letter = Letter::C;
            break;
        case 'G':
            letter = Letter::G;
            break;
        case 'T':
            letter = Letter::T;
            break;
        default:
            // Invalid - handle error.
            break;
        }
        return letter;
    }
    static char letterToText(Letter l) 
    {
        return letterText[(unsigned)l];
    }
    // Add bounds checking
    Letter get(unsigned index) const
    {
        unsigned distance = index * 2;
        char numeric = code_[distance] + code_[distance + 1] * 2;
        return Letter(numeric);
    }
    // Add bounds checking
    void set(unsigned index, Letter value)
    {
        unsigned distance = index * 2;
        bool low = (unsigned)value & 1;
        bool high = (bool)((unsigned)value & 2);
        code_[distance] = low;
        code_[distance + 1]  = high;
    }
    unsigned size()
    {
        return code_.size() / 2;
    }
    // Extend by numLetters, initially set to 'A'
    void extend(unsigned numLetters)
    {
        code_.resize(code_.size() + numLetters * 2);
    }
private:

    static char letterText[4];
    std::vector<bool> code_;
};

char Genome::letterText [4] = { 'A', 'C', 'G', 'T' };

int main()
{
    Genome g("GATT");
    g.extend(3);
    g.set(5, Genome::Letter::C);
    for (unsigned i = 0; i != g.size(); ++i)
        std::cout << Genome::letterToText(g.get(i));
    std::cout << std::endl;
    return 0;
}
+1

k-mers .

#include <cstdint>
#include <cstdlib>
#include <ostream>

enum class nucleotide { A, C, G, T };

inline std::ostream&
operator<<(std::ostream& pOut, nucleotide pNt)
{
    switch (pNt) {
        case nucleotide::A: pOut << 'A'; break;
        case nucleotide::C: pOut << 'C'; break;
        case nucleotide::G: pOut << 'G'; break;
        case nucleotide::T: pOut << 'T'; break;
    }
    return pOut;
}

class kmer_base;

class nucleotide_proxy {
public:
    operator nucleotide() const {
        return nucleotide((*mWord >> (mPosition * 2)) & 3);
    };

    nucleotide_proxy& operator=(nucleotide pNt) {
        uint64_t word = *mWord;
        word &= ~(uint64_t(3) << (mPosition*2));
        word |= uint64_t(pNt) << (mPosition*2);
        *mWord = word;

        return *this;
    };

private:
    friend class kmer_base;

    nucleotide_proxy(uint64_t* pWord, uint8_t pPosition)
        : mWord(pWord), mPosition(pPosition)
    {
    }

    uint64_t* mWord;
    uint8_t mPosition;
};


class kmer_base {
protected:
    nucleotide_proxy access(uint64_t* pWord, size_t pPosition)
    {
        return nucleotide_proxy(pWord + (pPosition / 32), (pPosition & 31));
    }

    const nucleotide_proxy access(uint64_t* pWord, size_t pPosition) const
    {
        return nucleotide_proxy(pWord + (pPosition / 32), (pPosition & 31));
    }
};


template<int K>
class kmer : public kmer_base
{
    enum { Words = (K + 31) / 32 };
public:
    nucleotide_proxy operator[](size_t pOutdex) {
        return access(mWords, pOutdex);
    }

    const nucleotide_proxy operator[](size_t pOutdex) const {
        return access(mWords, pOutdex);
    }

private:
    uint64_t mWords[Words];
};

Extending this parameter to k-mest with dynamic length remains in the form of an exercise; It is quite easy if you have nucleotide_proxyat your disposal. Implementing the backfill operator effectively also remains as an exercise.

0
source

Source: https://habr.com/ru/post/1545304/


All Articles