libcryfs/src/blockstore/implementations/compressing/compressors/RunLengthEncoding.cpp

#include "RunLengthEncoding.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <sstream>
#include <messmer/cpp-utils/assert/assert.h>

using cpputils::Data;
using std::string;
using std::ostringstream;
using std::istringstream;

namespace blockstore {
    namespace compressing {

        // Alternately store a run of arbitrary bytes and a run of identical bytes.
        // Each run is preceded by its length. Length fields are uint16_t.
        // Example: 2 - 5 - 8 - 10 - 3 - 0 - 2 - 0
        // A run of 2 arbitrary bytes (values: 5, 8), then 10 bytes that each store "3",
        // then a run of 0 arbitrary bytes, and finally 2x "0".

        // Compresses the given data by emitting an arbitrary-bytes run followed by an
        // identical-bytes run, over and over, until the whole input has been consumed.
        // The output may end after an arbitrary-bytes run if that run reached the end of the input.
        Data RunLengthEncoding::Compress(const Data &data) {
            uint8_t *const begin = (uint8_t*)data.data();
            uint8_t *const end = begin + data.size();

            ostringstream compressed;
            for (uint8_t *cursor = begin; cursor < end; ) {
                // Each helper advances the cursor past the bytes it has encoded.
                _encodeArbitraryWords(&cursor, end, &compressed);
                ASSERT(cursor <= end, "Overflow");
                if (cursor != end) {
                    _encodeIdenticalWords(&cursor, end, &compressed);
                    ASSERT(cursor <= end, "Overflow");
                }
            }
            return _extractData(&compressed);
        }

        // Writes one arbitrary-bytes run to output: a uint16_t length field (raw in-memory
        // representation) followed by that many bytes copied verbatim from the input.
        // On return, *current points to the first byte after the run.
        void RunLengthEncoding::_encodeArbitraryWords(uint8_t **current, uint8_t* end, ostringstream *output) {
            uint8_t *const runStart = *current;
            const uint16_t runLength = _arbitraryRunLength(runStart, end);

            output->write(reinterpret_cast<const char*>(&runLength), sizeof(runLength));
            output->write(reinterpret_cast<const char*>(runStart), runLength);

            *current = runStart + runLength;
        }

        // Returns how many bytes, beginning at start, should be stored as one arbitrary-bytes run.
        // The run stops right before the first sequence of 6 identical consecutive bytes, at the
        // end of the input, or after 65535 bytes (the maximum a uint16_t length field can hold),
        // whichever comes first.
        //   start: first byte to look at
        //   end:   one past the last byte of the input
        // Returns 0 if the input range is empty or if it directly starts with 6 identical bytes.
        uint16_t RunLengthEncoding::_arbitraryRunLength(uint8_t *start, uint8_t* end) {
            // Each stopping of an arbitrary bytes run costs us 5 byte, because we have to store the length
            // for the identical bytes run (2 byte), the identical byte itself (1 byte) and the length for the next arbitrary bytes run (2 byte).
            // So to get an advantage from stopping an arbitrary bytes run, at least 6 bytes have to be identical.
            const std::ptrdiff_t minWorthwhileRun = 6;

            // Nothing to scan. Also keeps us from reading start[0] on an empty range.
            if (start >= end) {
                return 0;
            }

            // Clamp the number of bytes we look at so the result fits into the 16bit length field.
            // We clamp the length, not the pointer: computing start + 65535 when fewer bytes remain
            // would form a pointer outside the buffer, which is undefined behavior.
            const std::ptrdiff_t limit = std::min<std::ptrdiff_t>(end - start, std::numeric_limits<uint16_t>::max());

            // Track the length of the sequence of identical bytes ending at index i and stop as soon
            // as it reaches 6 bytes.
            std::ptrdiff_t numIdenticalBytes = 1;
            for (std::ptrdiff_t i = 1; i < limit; ++i) {
                if (start[i] == start[i - 1]) {
                    ++numIdenticalBytes;
                    if (numIdenticalBytes == minWorthwhileRun) {
                        // i is the index of the 6th identical byte; the arbitrary run has to end at
                        // the first identical byte, which is 5 positions earlier.
                        return static_cast<uint16_t>(i - (minWorthwhileRun - 1));
                    }
                } else {
                    numIdenticalBytes = 1;
                }
            }

            // It wasn't worth stopping the arbitrary bytes run anywhere. The whole region should be an arbitrary run.
            return static_cast<uint16_t>(limit);
        }

        // Writes one identical-bytes run to output: a uint16_t repeat count (raw in-memory
        // representation) followed by the single byte value that is repeated.
        // On return, *current points to the first byte after the run.
        void RunLengthEncoding::_encodeIdenticalWords(uint8_t **current, uint8_t* end, ostringstream *output) {
            uint8_t *const runStart = *current;
            const uint16_t repeatCount = _countIdenticalBytes(runStart, end);

            output->write(reinterpret_cast<const char*>(&repeatCount), sizeof(repeatCount));
            // Only the first byte of the run is stored; all others are equal to it.
            output->write(reinterpret_cast<const char*>(runStart), 1);

            *current = runStart + repeatCount;
        }

        // Returns the number of consecutive bytes, beginning at start, that have the same value
        // as *start. The count is capped at 65535 so that it fits into a uint16_t length field.
        //   start: first byte of the run
        //   end:   one past the last byte of the input
        // Returns 0 if the input range is empty.
        uint16_t RunLengthEncoding::_countIdenticalBytes(uint8_t *start, uint8_t *end) {
            // An empty range contains no run. Without this check we would read *start out of bounds.
            if (start >= end) {
                return 0;
            }

            // Clamp the length (not the pointer) to prevent overflow of the 16bit counter.
            // Computing start + 65535 when fewer bytes remain would form a pointer outside the
            // buffer, which is undefined behavior.
            const std::ptrdiff_t limit = std::min<std::ptrdiff_t>(end - start, std::numeric_limits<uint16_t>::max());

            // start[0] trivially matches itself, so begin counting at 1.
            std::ptrdiff_t count = 1;
            while (count < limit && start[count] == *start) {
                ++count;
            }
            return static_cast<uint16_t>(count);
        }

        // Copies everything that was written to the stream into a newly created Data object
        // of exactly that size and returns it.
        Data RunLengthEncoding::_extractData(ostringstream *stream) {
            const string buffer = stream->str();
            const string::size_type numBytes = buffer.size();

            Data result(numBytes);
            std::memcpy(result.data(), buffer.c_str(), numBytes);
            return result;
        }

        // Decompresses size bytes starting at data. The input is read as a sequence of runs
        // that alternate between arbitrary-bytes runs and identical-bytes runs, starting with
        // an arbitrary-bytes run, until the input is used up.
        Data RunLengthEncoding::Decompress(const void *data, size_t size) {
            istringstream compressed;
            _parseData(static_cast<const uint8_t*>(data), size, &compressed);

            ostringstream output;
            bool nextRunIsArbitrary = true;
            while (_hasData(&compressed)) {
                if (nextRunIsArbitrary) {
                    _decodeArbitraryWords(&compressed, &output);
                } else {
                    _decodeIdenticalWords(&compressed, &output);
                }
                nextRunIsArbitrary = !nextRunIsArbitrary;
            }
            return _extractData(&output);
        }

        // Reports whether the stream has not reached its end yet.
        // The peek() makes the stream notice the end of its data (eof flag) without
        // consuming a character.
        bool RunLengthEncoding::_hasData(istringstream *str) {
            str->peek();
            const bool reachedEnd = str->eof();
            return !reachedEnd;
        }

        void RunLengthEncoding::_parseData(const uint8_t *data, size_t size, istringstream *result) {
            result->str(string((const char*)data, size));
        }

        // Reads one arbitrary-bytes run from stream (uint16_t length field followed by that
        // many bytes) and appends the bytes unchanged to decompressed.
        // Asserts that the stream is still good after each read.
        void RunLengthEncoding::_decodeArbitraryWords(istringstream *stream, ostringstream *decompressed) {
            uint16_t runLength;
            stream->read(reinterpret_cast<char*>(&runLength), sizeof(runLength));
            ASSERT(stream->good(), "Premature end of stream");

            Data literals(runLength);
            char *const buffer = (char*)literals.data();
            stream->read(buffer, runLength);
            ASSERT(stream->good(), "Premature end of stream");

            decompressed->write(buffer, literals.size());
        }

        // Reads one identical-bytes run from stream (uint16_t repeat count followed by the
        // byte value) and appends the byte value that many times to decompressed.
        // Asserts that the stream is still good after each read.
        void RunLengthEncoding::_decodeIdenticalWords(istringstream *stream, ostringstream *decompressed) {
            uint16_t repeatCount;
            stream->read(reinterpret_cast<char*>(&repeatCount), sizeof(repeatCount));
            ASSERT(stream->good(), "Premature end of stream");

            uint8_t repeatedByte;
            stream->read(reinterpret_cast<char*>(&repeatedByte), 1);
            ASSERT(stream->good(), "Premature end of stream");

            // Materialize the run in a temporary buffer and append it in one go.
            Data expanded(repeatCount);
            std::memset(expanded.data(), repeatedByte, expanded.size());
            decompressed->write((const char*)expanded.data(), expanded.size());
        }

    }
}
Add CompressingBlockStore 2015-12-14 17:17:16 +01:00			`#include "RunLengthEncoding.h"`
			`#include <sstream>`
			`#include <messmer/cpp-utils/assert/assert.h>`

			`using cpputils::Data;`
			`using std::string;`
			`using std::ostringstream;`
			`using std::istringstream;`

			`namespace blockstore {`
			`namespace compressing {`

			`// Alternatively store a run of arbitrary bytes and a run of identical bytes.`
			`// Each run is preceded by its length. Length fields are uint16_t.`
			`// Example: 2 - 5 - 8 - 10 - 3 - 0 - 2 - 0`
			`// Length 2 arbitrary bytes (values: 5, 8), the next 10 bytes store "3" each,`
			`// then 0 arbitrary bytes and 2x "0".`

			`Data RunLengthEncoding::Compress(const Data &data) {`
			`ostringstream compressed;`
			`uint8_t current = (uint8_t)data.data();`
			`uint8_t end = (uint8_t)data.data()+data.size();`
			`while (current < end) {`
			`_encodeArbitraryWords(&current, end, &compressed);`
			`ASSERT(current <= end, "Overflow");`
			`if (current == end) {`
			`break;`
			`}`
			`_encodeIdenticalWords(&current, end, &compressed);`
			`ASSERT(current <= end, "Overflow");`
			`}`
			`return _extractData(&compressed);`
			`}`

			`void RunLengthEncoding::_encodeArbitraryWords(uint8_t *current, uint8_t end, ostringstream *output) {`
			`uint16_t size = _arbitraryRunLength(*current, end);`
			`output->write((const char*)&size, sizeof(uint16_t));`
			`output->write((const char)current, size);`
			`*current += size;`
			`}`

			`uint16_t RunLengthEncoding::_arbitraryRunLength(uint8_t start, uint8_t end) {`
			`// Each stopping of an arbitrary bytes run costs us 5 byte, because we have to store the length`
			`// for the identical bytes run (2 byte), the identical byte itself (1 byte) and the length for the next arbitrary bytes run (2 byte).`
			`// So to get an advantage from stopping an arbitrary bytes run, at least 6 bytes have to be identical.`

			`// realEnd avoids an overflow of the 16bit counter`
			`uint8_t *realEnd = std::min(end, start + std::numeric_limits<uint16_t>::max());`

			`// Count the number of identical bytes and return if it finds a run of more than 6 identical bytes.`
			`uint8_t lastByte = *start + 1; // Something different from the first byte`
			`uint8_t numIdenticalBytes = 1;`
			`for(uint8_t *current = start; current != realEnd; ++current) {`
			`if (*current == lastByte) {`
			`++numIdenticalBytes;`
			`if (numIdenticalBytes == 6) {`
			`return current - start - 5; //-5, because the end pointer for the arbitrary byte run should point to the first identical byte, not the one before.`
			`}`
			`} else {`
			`numIdenticalBytes = 1;`
			`}`
			`lastByte = *current;`
			`}`
			`//It wasn't worth stopping the arbitrary bytes run anywhere. The whole region should be an arbitrary run.`
			`return realEnd-start;`
			`}`

			`void RunLengthEncoding::_encodeIdenticalWords(uint8_t *current, uint8_t end, ostringstream *output) {`
			`uint16_t size = _countIdenticalBytes(*current, end);`
			`output->write((const char*)&size, sizeof(uint16_t));`
			`output->write((const char)current, 1);`
			`*current += size;`
			`}`

			`uint16_t RunLengthEncoding::_countIdenticalBytes(uint8_t start, uint8_t end) {`
			`uint8_t *realEnd = std::min(end, start + std::numeric_limits<uint16_t>::max()); // This prevents overflow of the 16bit counter`
			`for (uint8_t *current = start+1; current != realEnd; ++current) {`
			`if (current != start) {`
			`return current-start;`
			`}`
			`}`
			`// All bytes have been identical`
			`return realEnd - start;`
			`}`

			`Data RunLengthEncoding::_extractData(ostringstream *stream) {`
			`string str = stream->str();`
			`Data data(str.size());`
			`std::memcpy(data.data(), str.c_str(), str.size());`
			`return data;`
			`}`

			`Data RunLengthEncoding::Decompress(const void *data, size_t size) {`
Fix RunLengthEncoding for older compilers 2015-12-14 18:14:03 +01:00			`istringstream stream;`
			`_parseData((uint8_t*)data, size, &stream);`
Add CompressingBlockStore 2015-12-14 17:17:16 +01:00			`ostringstream decompressed;`
			`while(_hasData(&stream)) {`
			`_decodeArbitraryWords(&stream, &decompressed);`
			`if (!_hasData(&stream)) {`
			`break;`
			`}`
			`_decodeIdenticalWords(&stream, &decompressed);`
			`}`
			`return _extractData(&decompressed);`
			`}`

			`bool RunLengthEncoding::_hasData(istringstream *str) {`
			`str->peek();`
			`return !str->eof();`
			`}`

Fix RunLengthEncoding for older compilers 2015-12-14 18:14:03 +01:00			`void RunLengthEncoding::_parseData(const uint8_t data, size_t size, istringstream result) {`
			`result->str(string((const char*)data, size));`
Add CompressingBlockStore 2015-12-14 17:17:16 +01:00			`}`

			`void RunLengthEncoding::_decodeArbitraryWords(istringstream stream, ostringstream decompressed) {`
			`uint16_t size;`
			`stream->read((char*)&size, sizeof(uint16_t));`
			`ASSERT(stream->good(), "Premature end of stream");`
			`Data run(size);`
			`stream->read((char*)run.data(), size);`
			`ASSERT(stream->good(), "Premature end of stream");`
			`decompressed->write((const char*)run.data(), run.size());`
			`}`

			`void RunLengthEncoding::_decodeIdenticalWords(istringstream stream, ostringstream decompressed) {`
			`uint16_t size;`
			`stream->read((char*)&size, sizeof(uint16_t));`
			`ASSERT(stream->good(), "Premature end of stream");`
			`uint8_t value;`
			`stream->read((char*)&value, 1);`
			`ASSERT(stream->good(), "Premature end of stream");`
			`Data run(size);`
			`std::memset(run.data(), value, run.size());`
			`decompressed->write((const char*)run.data(), run.size());`
			`}`

			`}`
			`}`