Small file compression algorithm (345 bytes)

I am looking for software that can compress a small amount of noisy data without loss; Example:

ZmNiYWNma3F5gYqSmqCkpqenpKGdmJONiIN/e3d1c3FxcXFxcnN0dXZ3eXp7fHx9fn5/gIGDhISFhYWFhIOCgYGAgICBgoOEhYeIiYmJiYiGhIF+fHl3dHNyc3N1d3p9gIOGiYuMjY2NjIuKiIeFg4GAfnx7eXl4eHl5enx9fn5/gICAgYGBgYGBgYGBgoKDhISEhISEhIOCgoGAgH9/fn5+fn5/gICAgICAgICAgICAgICAf39/gICBgoOFhoeIiIiIiIeGhYOCgH99fHt6enp6e3x9foCAgoKDg4ODgoKBgH59fHt7e3t8fX5/gIKDhIWFhoaGhoaFhISDgoKBgQ==

Initial data: 345B (100%), gzip: 280B (81%), bzip2: 289B (84%), lzop: 415B (120%),

Are there any other methods that I should try?

+4
source share
2 answers

Since the data is Base64encoded (every 3 bytes become 4 bytes), the first step would be to decode it (the compressed data will be binary anyway):

344 bytes -> 256 bytes

Then a simple test with standard winzip shows compression in 170 bytes( COMP_DEFLATEblock). You should get the same with gzip / zlib.

, .

243 ( .zip , zip - 359 , ).

, zlib , ± 170 .


, . , .

Hex ( ):

66 63 62 61 63 66 6B 71 79 81 8A 92 9A A0 A4 A6
A7 A7 A4 A1 9D 98 93 8D 88 83 7F 7B 77 75 73 71
71 71 71 71 72 73 74 75 76 77 79 7A 7B 7C 7C 7D
7E 7E 7F 80 81 83 84 84 85 85 85 85 84 83 82 81
81 80 80 80 81 82 83 84 85 87 88 89 89 89 89 88
86 84 81 7E 7C 79 77 74 73 72 73 73 75 77 7A 7D
80 83 86 89 8B 8C 8D 8D 8D 8C 8B 8A 88 87 85 83
81 80 7E 7C 7B 79 79 78 78 79 79 7A 7C 7D 7E 7E
7F 80 80 80 81 81 81 81 81 81 81 81 81 82 82 83
84 84 84 84 84 84 84 83 82 82 81 80 80 7F 7F 7E
7E 7E 7E 7E 7F 80 80 80 80 80 80 80 80 80 80 80
80 80 80 80 7F 7F 7F 80 80 81 82 83 85 86 87 88
88 88 88 88 87 86 85 83 82 80 7F 7D 7C 7B 7A 7A
7A 7A 7B 7C 7D 7E 80 80 82 82 83 83 83 83 82 82
81 80 7E 7D 7C 7B 7B 7B 7B 7C 7D 7E 7F 80 82 83
84 85 85 86 86 86 86 86 85 84 84 83 82 82 81 81

: 2 3 , 64 96 .


:

. , . , 1, 2, 3 4 (4 ). ( ) (zlib, ..).

2 -:

102        
99    -3    
98    -1    2
97    -1    0
99     2    3
102    3    1
107    5    2
113    6    1
121    8    2
129    8    0
138    9    1
146    8   -1
154    8    0
160    6   -2
164    4   -2
166    2   -2
167    1   -1
167    0   -1
164   -3   -3
161   -3    0
157   -4   -1
152   -5   -1
147   -5    0
141   -6   -1
136   -5    1
131   -5    0
127   -4    1
123   -4    0
119   -4    0
117   -2    2
115   -2    0
113   -2    0
113    0    2
113    0    0
113    0    0
113    0    0
114    1    1
115    1    0
116    1    0
117    1    0
118    1    0
119    1    0
121    2    1
122    1   -1
123    1    0
124    1    0
124    0   -1
125    1    1
126    1    0
126    0   -1
127    1    1
128    1    0
129    1    0
131    2    1
132    1   -1
132    0   -1
133    1    1
133    0   -1
133    0    0
133    0    0
132   -1   -1
131   -1    0
130   -1    0
129   -1    0
129    0    1
128   -1   -1
128    0    1
128    0    0
129    1    1
130    1    0
131    1    0
132    1    0
133    1    0
135    2    1
136    1   -1
137    1    0
137    0   -1
137    0    0
137    0    0
136   -1   -1
134   -2   -1
132   -2    0
129   -3   -1
126   -3    0
124   -2    1
121   -3   -1
119   -2    1
116   -3   -1
115   -1    2
114   -1    0
115    1    2
115    0   -1
117    2    2
119    2    0
122    3    1
125    3    0
128    3    0
131    3    0
134    3    0
137    3    0
139    2   -1
140    1   -1
141    1    0
141    0   -1
141    0    0
140   -1   -1
139   -1    0
138   -1    0
136   -2   -1
135   -1    1
133   -2   -1
131   -2    0
129   -2    0
128   -1    1
126   -2   -1
124   -2    0
123   -1    1
121   -2   -1
121    0    2
120   -1   -1
120    0    1
121    1    1
121    0   -1
122    1    1
124    2    1
125    1   -1
126    1    0
126    0   -1
127    1    1
128    1    0
128    0   -1
128    0    0
129    1    1
129    0   -1
129    0    0
129    0    0
129    0    0
129    0    0
129    0    0
129    0    0
129    0    0
130    1    1
130    0   -1
131    1    1
132    1    0
132    0   -1
132    0    0
132    0    0
132    0    0
132    0    0
132    0    0
131   -1   -1
130   -1    0
130    0    1
129   -1   -1
128   -1    0
128    0    1
127   -1   -1
127    0    1
126   -1   -1
126    0    1
126    0    0
126    0    0
126    0    0
127    1    1
128    1    0
128    0   -1
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
128    0    0
127   -1   -1
127    0    1
127    0    0
128    1    1
128    0   -1
129    1    1
130    1    0
131    1    0
133    2    1
134    1   -1
135    1    0
136    1    0
136    0   -1
136    0    0
136    0    0
136    0    0
135   -1   -1
134   -1    0
133   -1    0
131   -2   -1
130   -1    1
128   -2   -1
127   -1    1
125   -2   -1
124   -1    1
123   -1    0
122   -1    0
122    0    1
122    0    0
122    0    0
123    1    1
124    1    0
125    1    0
126    1    0
128    2    1
128    0   -2
130    2    2
130    0   -2
131    1    1
131    0   -1
131    0    0
131    0    0
130   -1   -1
130    0    1
129   -1   -1
128   -1    0
126   -2   -1
125   -1    1
124   -1    0
123   -1    0
123    0    1
123    0    0
123    0    0
124    1    1
125    1    0
126    1    0
127    1    0
128    1    0
130    2    1
131    1   -1
132    1    0
133    1    0
133    0   -1
134    1    1
134    0   -1
134    0    0
134    0    0
134    0    0
133   -1   -1
132   -1    0
132    0    1
131   -1   -1
130   -1    0
130    0    1
129   -1   -1
129    0    1
+6

, :

  • First Base64 (345 → 256 )
  • delta-encode (, , )
  • - ( 3 2)

76 , .

Mercurial .

! , , , . , , , .

:

static void Main(string[] args)
{
    string inputString = "ZmNiYWNma3F5gYqSmqCkpqenpKGdmJONiIN/e3d1c3FxcXFxcnN0dXZ3eXp7fHx9fn5/gIGDhISFhYWFhIOCgYGAgICBgoOEhYeIiYmJiYiGhIF+fHl3dHNyc3N1d3p9gIOGiYuMjY2NjIuKiIeFg4GAfnx7eXl4eHl5enx9fn5/gICAgYGBgYGBgYGBgoKDhISEhISEhIOCgoGAgH9/fn5+fn5/gICAgICAgICAgICAgICAf39/gICBgoOFhoeIiIiIiIeGhYOCgH99fHt6enp6e3x9foCAgoKDg4ODgoKBgH59fHt7e3t8fX5/gIKDhIWFhoaGhoaFhISDgoKBgQ==";
    byte[] original = Convert.FromBase64String(inputString);

    Console.WriteLine($"Original String: {inputString.Length}");
    Console.WriteLine($"Original bytes: {original.Length}");

    byte[] deltaEncoded = DeltaEncoderDecoder.Encode(original, 2);
    byte[] compressed = HuffmanCompression.Compress(deltaEncoded);
    Console.WriteLine($"Compressed bytes: {compressed.Length}");

    byte[] deltaDecoded = HuffmanCompression.Decompress(compressed);
    byte[] decompressed = DeltaEncoderDecoder.Decode(deltaDecoded, 2);
    Console.WriteLine($"Decompressed bytes: {decompressed.Length}");

    Console.WriteLine($"Decompressed == original: {decompressed.Length == original.Length && Enumerable.Range(0, original.Length).All(index => original[index] == decompressed[index])}");
}

:

Original String: 344
Original bytes: 256
Compressed bytes: 76
Decompressed bytes: 256
Decompressed == original: True

:

public static class HuffmanCompression
{
    internal const byte CompressedSignature = 255;
    internal const byte UncompressedSignature = 0;

    [NotNull]
    public static byte[] Compress([NotNull] byte[] input)
    {
        if (input == null)
            throw new ArgumentNullException(nameof(input));
        if (input.Length == 0)
            return input;

        var rootNode = GetNodesFromRawInput(input);
        var bitStrings = GetBitStringsFromTree(rootNode);

        var output = new MemoryStream(input.Length);
        var writer = new BitStreamWriter(output);

        writer.Write(CompressedSignature);
        writer.Write(input.Length);
        WriteNodes(writer, rootNode);
        WriteStrings(writer, bitStrings, input);
        writer.Flush();

        if (output.Length < input.Length + 1)
            return output.ToArray();

        return EncodeAsUncompressed(input);
    }

    [NotNull]
    private static byte[] EncodeAsUncompressed([NotNull] byte[] input)
    {
        var output = new MemoryStream();
        output.WriteByte(UncompressedSignature);
        output.Write(input, 0, input.Length);
        return output.ToArray();
    }

    private static void WriteStrings([NotNull] BitStreamWriter writer, [NotNull] string[] bitStrings, [NotNull] byte[] input)
    {
        foreach (byte value in input)
        {
            Assume(bitStrings[value] != null);
            foreach (char bitChar in bitStrings[value])
                writer.Write(bitChar == '1');
        }
    }

    private static void WriteNodes([NotNull] BitStreamWriter writer, [NotNull] Node node)
    {
        if (node.Left == null)
        {
            writer.Write(false);
            writer.Write(node.Value);
        }
        else
        {
            Assume(node.Right != null);

            writer.Write(true);
            WriteNodes(writer, node.Left);
            WriteNodes(writer, node.Right);
        }
    }

    [NotNull, ItemNotNull]
    private static string[] GetBitStringsFromTree([NotNull] Node node)
    {
        var result = new string[256];
        TraverseToGetBitStringsFromTree(node, string.Empty, result);
        return result;
    }

    private static void TraverseToGetBitStringsFromTree([NotNull] Node node, [NotNull] string prefix, [NotNull, ItemNotNull] string[] dictionary)
    {
        if (node.Left != null)
        {
            Assume(node.Right != null);

            TraverseToGetBitStringsFromTree(node.Left, prefix + "0", dictionary);
            TraverseToGetBitStringsFromTree(node.Right, prefix + "1", dictionary);
        }
        else
            dictionary[node.Value] = prefix;
    }

    [NotNull]
    private static Node GetNodesFromRawInput([NotNull] byte[] input)
    {
        var occurances = new int[256];
        foreach (byte value in input)
            occurances[value]++;

        var nodes = new List<Node>(256);
        for (int index = 0; index < 256; index++)
            if (occurances[index] > 0)
                nodes.Add(new Node
                {
                    Occurances = occurances[index],
                    Value = (byte)index
                });

        while (nodes.Count > 1)
        {
            nodes.Sort((n1, n2) =>
            {
                Assume(n1 != null && n2 != null);

                return n1.Occurances.CompareTo(n2.Occurances);
            });

            Assume(nodes[0] != null && nodes[1] != null);

            nodes[0] = new Node
            {
                Left = nodes[0],
                Right = nodes[1],
                Occurances = nodes[0].Occurances + nodes[1].Occurances
            };
            nodes.RemoveAt(1);
        }

        Assume(nodes[0] != null);

        return nodes[0];
    }

    [NotNull]
    public static byte[] Decompress([NotNull] byte[] input)
    {
        if (input == null)
            throw new ArgumentNullException(nameof(input));
        if (input.Length == 0)
            return input;

        if (input[0] != CompressedSignature)
            return DecodeUncompressed(input);

        var reader = new BitStreamReader(new MemoryStream(input));
        reader.ReadByte(); // skip signature

        int length = reader.ReadInt32();
        var rootNode = ReadNodes(reader);
        var output = new byte[length];
        for (int index = 0; index < length; index++)
            output[index] = DecompressOneByte(reader, rootNode);

        return output;
    }

    private static byte DecompressOneByte([NotNull] BitStreamReader reader, [NotNull] Node node)
    {
        while (node.Left != null)
        {
            if (reader.ReadBit())
                node = node.Right;
            else
                node = node.Left;

            Assume(node != null);
        }

        return node.Value;
    }

    [NotNull]
    private static Node ReadNodes([NotNull] BitStreamReader reader)
    {
        if (reader.ReadBit())
            return new Node
            {
                Left = ReadNodes(reader),
                Right = ReadNodes(reader)
            };

        return new Node
        {
            Value = reader.ReadByte()
        };
    }

    [NotNull]
    private static byte[] DecodeUncompressed([NotNull] byte[] input)
    {
        return input.Skip(1).ToArray();
    }
}

public class BitStreamReader
{
    [NotNull]
    private readonly MemoryStream _Source;

    private byte _Buffer;
    private int _InBuffer;

    public BitStreamReader([NotNull] MemoryStream source)
    {
        if (source == null)
            throw new ArgumentNullException(nameof(source));

        _Source = source;
    }

    public bool ReadBit()
    {
        if (_InBuffer == 0)
            FillBuffer();

        return (_Buffer & BitStreamConstants.BitMasks[8 - _InBuffer--]) != 0;
    }

    public byte ReadByte()
    {
        if (_InBuffer == 8)
        {
            _InBuffer = 0;
            return _Buffer;
        }

        return (byte)((ReadBit() ? 128 : 0) | (ReadBit() ? 64 : 0) | (ReadBit() ? 32 : 0) | (ReadBit() ? 16 : 0) | (ReadBit() ? 8 : 0) | (ReadBit() ? 4 : 0) | (ReadBit() ? 2 : 0) | (ReadBit() ? 1 : 0));
    }

    public int ReadInt32()
    {
        int result = 0;
        if (ReadBit())
            result |= ReadByte();
        if (ReadBit())
            result |= ReadByte() << 8;
        if (ReadBit())
            result |= ReadByte() << 16;
        if (ReadBit())
            result |= ReadByte() << 24;
        return result;
    }

    private void FillBuffer()
    {
        int value = _Source.ReadByte();
        if (value < 0)
            throw new InvalidOperationException("Read past end of source stream");

        _Buffer = (byte)value;
        _InBuffer = 8;
    }
}

public class BitStreamWriter
{
    [NotNull]
    private readonly MemoryStream _Target;

    private byte _Buffer;
    private int _InBuffer;

    public BitStreamWriter([NotNull] MemoryStream target)
    {
        if (target == null)
            throw new ArgumentNullException(nameof(target));

        _Target = target;
    }

    public void Flush()
    {
        if (_InBuffer == 0)
            return;

        _Target.WriteByte(_Buffer);
        _Buffer = 0;
        _InBuffer = 0;
    }

    public void Write(bool bit)
    {
        unchecked
        {
            if (bit)
                _Buffer = (byte)(_Buffer | 1 << (7 - _InBuffer));
            if (++_InBuffer == 8)
                Flush();
        }
    }

    public void Write(byte value)
    {
        for (int index = 0; index < 8; index++)
            Write((value & BitStreamConstants.BitMasks[index]) != 0);
    }

    public void Write(int value)
    {
        byte b0 = (byte)(value & 0xff);
        byte b1 = (byte)((value >> 8) & 0xff);
        byte b2 = (byte)((value >> 16) & 0xff);
        byte b3 = (byte)((value >> 24) & 0xff);

        Write(b0 != 0);
        if (b0 != 0)
            Write(b0);

        Write(b1 != 0);
        if (b1 != 0)
            Write(b1);

        Write(b2 != 0);
        if (b2 != 0)
            Write(b2);

        Write(b3 != 0);
        if (b3 != 0)
            Write(b3);
    }
}

internal static class BitStreamConstants
{
    [NotNull]
    public static readonly byte[] BitMasks = { 128, 64, 32, 16, 8, 4, 2, 1 };

    public const byte CompressedSignature = 255;
}

public static class DeltaEncoderDecoder
{
    [NotNull]
    public static byte[] Encode([NotNull] byte[] input, int iterations)
    {
        if (input == null)
            throw new ArgumentNullException(nameof(input));

        var output = new byte[input.Length];
        Buffer.BlockCopy(input, 0, output, 0, input.Length);

        while (iterations-- > 0)
        {
            byte previous = 0;
            for (int index = 0; index < output.Length; index++)
            {
                byte current = output[index];
                output[index] = (byte)(current - previous);
                previous = current;
            }
        }

        return output;
    }

    [NotNull]
    public static byte[] Decode([NotNull] byte[] input, int iterations)
    {
        if (input == null)
            throw new ArgumentNullException(nameof(input));

        var output = new byte[input.Length];
        Buffer.BlockCopy(input, 0, output, 0, input.Length);

        while (iterations-- > 0)
        {
            byte previous = 0;
            for (int index = 0; index < output.Length; index++)
            {
                output[index] = (byte)(previous + output[index]);
                previous = output[index];
            }
        }

        return output;
    }
}
+3

Source: https://habr.com/ru/post/1624321/


All Articles