Look for more information on "Group Encoding / Decoding Variables" presented in Jeff's slides

I noticed that in Jeff the slides "Problems in building large-scale search engines", which can also be downloaded here: http://research.google.com/people/jeff/WSDM09-keynote.pdf , mentioned a method for compressing integers called "varint coding". This was said much faster than 7 bits per byte of integer encoding (2X more). I am very interested in this and looking for an implementation of this or more detailed information that could help me implement this myself.

I am not a professional and new to this, and any help is appreciated!

+3
source share
4 answers

This refers to the "integer encoding variable", where the number of bits used to store the integer during serialization is not fixed at 4 bytes. There is a good description of varint in the protocol protocol documentation .

It is used in the encoding of the Google protocol buffers , and you can view the source code of the protocol buffer .

CodedOutputStreamcontains the exact encoding function WriteVarint32FallbackToArrayInline :

inline uint8* CodedOutputStream::WriteVarint32FallbackToArrayInline(
    uint32 value, uint8* target) {
  target[0] = static_cast<uint8>(value | 0x80);
  if (value >= (1 << 7)) {
    target[1] = static_cast<uint8>((value >>  7) | 0x80);
    if (value >= (1 << 14)) {
      target[2] = static_cast<uint8>((value >> 14) | 0x80);
      if (value >= (1 << 21)) {
        target[3] = static_cast<uint8>((value >> 21) | 0x80);
        if (value >= (1 << 28)) {
          target[4] = static_cast<uint8>(value >> 28);
          return target + 5;
        } else {
          target[3] &= 0x7F;
          return target + 4;
        }
      } else {
        target[2] &= 0x7F;
        return target + 3;
      }
    } else {
      target[1] &= 0x7F;
      return target + 2;
    }
  } else {
    target[0] &= 0x7F;
    return target + 1;
  }
}

if target, value . 0x80 , , value . , , 0x7f " ". ( OR'ing 0x80, 1, ( AND'ing 0x7f). , varints , .

, " Group VarInt". , VarInt ( , 7-). . , , 64- . , , -.

varint "Group varint" , :)

, VarInt, . , , .

void DecodeGroupVarInt(const byte* compressed, int size, uint32_t* uncompressed) {
  const uint32_t MASK[4] = { 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF };
  const byte* limit = compressed + size;
  uint32_t current_value = 0;
  while (compressed != limit) {
    const uint32_t selector = *compressed++;
    const uint32_t selector1 = (selector & 3);
    current_value += *((uint32_t*)(compressed)) & MASK[selector1];
    *uncompressed++ = current_value;
    compressed += selector1 + 1;
    const uint32_t selector2 = ((selector >> 2) & 3);
    current_value += *((uint32_t*)(compressed)) & MASK[selector2];
    *uncompressed++ = current_value;
    compressed += selector2 + 1;
    const uint32_t selector3 = ((selector >> 4) & 3);
    current_value += *((uint32_t*)(compressed)) & MASK[selector3];
    *uncompressed++ = current_value;
    compressed += selector3 + 1;
    const uint32_t selector4 = (selector >> 6);
    current_value += *((uint32_t*)(compressed)) & MASK[selector4];
    *uncompressed++ = current_value;
    compressed += selector4 + 1;
  }
}
+4
+1
0

Source: https://habr.com/ru/post/1748699/


All Articles