140 lines
6.0 KiB
C++
140 lines
6.0 KiB
C++
#include "RunLengthEncoding.h"
|
|
#include <sstream>
|
|
#include <messmer/cpp-utils/assert/assert.h>
|
|
|
|
using cpputils::Data;
|
|
using std::string;
|
|
using std::ostringstream;
|
|
using std::istringstream;
|
|
|
|
namespace blockstore {
|
|
namespace compressing {
|
|
|
|
// Alternately store a run of arbitrary bytes and a run of identical bytes.
// Each run is preceded by its length. Length fields are uint16_t.
// A run of identical bytes is stored as its length followed by one copy of the byte.
// Example: 2 - 5 - 8 - 10 - 3 - 0 - 2 - 0
//   2 arbitrary bytes (values: 5, 8), then the next 10 bytes store "3" each,
//   then 0 arbitrary bytes and 2x "0".
|
|
|
|
// Compresses `data` with run-length encoding and returns the encoded bytes.
// The output alternates between a run of arbitrary bytes and a run of
// identical bytes, always beginning with an arbitrary run (which may have
// length 0). Encoding stops as soon as the whole input has been consumed.
Data RunLengthEncoding::Compress(const Data &data) {
    ostringstream compressed;
    uint8_t *current = (uint8_t*)data.data();
    uint8_t *end = (uint8_t*)data.data()+data.size();
    while (current < end) {
        // The helpers take the address of the cursor so they can advance it
        // past the bytes they have encoded.
        _encodeArbitraryWords(&current, end, &compressed);
        ASSERT(current <= end, "Overflow");
        if (current == end) {
            // Input ended with an arbitrary run; no identical run follows.
            break;
        }
        _encodeIdenticalWords(&current, end, &compressed);
        ASSERT(current <= end, "Overflow");
    }
    return _extractData(&compressed);
}
|
|
|
|
void RunLengthEncoding::_encodeArbitraryWords(uint8_t **current, uint8_t* end, ostringstream *output) {
|
|
uint16_t size = _arbitraryRunLength(*current, end);
|
|
output->write((const char*)&size, sizeof(uint16_t));
|
|
output->write((const char*)*current, size);
|
|
*current += size;
|
|
}
|
|
|
|
// Returns the length of the arbitrary-bytes run that begins at `start`.
// The run ends right before the first sequence of 6 identical bytes, at `end`,
// or after 65535 bytes (the largest value a uint16_t length field can hold),
// whichever comes first. Returns 0 for an empty range.
uint16_t RunLengthEncoding::_arbitraryRunLength(uint8_t *start, uint8_t* end) {
    // Each stopping of an arbitrary bytes run costs us 5 bytes, because we have to store the length
    // for the identical bytes run (2 bytes), the identical byte itself (1 byte) and the length for
    // the next arbitrary bytes run (2 bytes).
    // So to get an advantage from stopping an arbitrary bytes run, at least 6 bytes have to be identical.
    const uint8_t minIdenticalRun = 6;

    if (start >= end) {
        // Nothing to scan; also avoids dereferencing `start` below.
        return 0;
    }

    // Clamp by length instead of by pointer: forming `start + 65535` when fewer bytes
    // remain would create a pointer beyond the buffer, which is undefined behaviour.
    const size_t available = static_cast<size_t>(end - start);
    const size_t maxLength = std::min<size_t>(available, std::numeric_limits<uint16_t>::max());
    uint8_t *realEnd = start + maxLength;

    // Count consecutive identical bytes and stop as soon as a run of 6 identical bytes is found.
    uint8_t lastByte = static_cast<uint8_t>(*start + 1); // Something different from the first byte
    uint8_t numIdenticalBytes = 1;
    for (uint8_t *current = start; current != realEnd; ++current) {
        if (*current == lastByte) {
            ++numIdenticalBytes;
            if (numIdenticalBytes == minIdenticalRun) {
                // `current` is the 6th identical byte; the arbitrary run must end at the
                // first identical byte, which lies 5 positions earlier.
                return static_cast<uint16_t>(current - start - (minIdenticalRun - 1));
            }
        } else {
            numIdenticalBytes = 1;
        }
        lastByte = *current;
    }

    // It wasn't worth stopping the arbitrary bytes run anywhere. The whole region is one arbitrary run.
    return static_cast<uint16_t>(maxLength);
}
|
|
|
|
void RunLengthEncoding::_encodeIdenticalWords(uint8_t **current, uint8_t* end, ostringstream *output) {
|
|
uint16_t size = _countIdenticalBytes(*current, end);
|
|
output->write((const char*)&size, sizeof(uint16_t));
|
|
output->write((const char*)*current, 1);
|
|
*current += size;
|
|
}
|
|
|
|
// Returns how many bytes, beginning at `start`, are equal to `*start`.
// The count is capped at 65535 (the largest value a uint16_t length field can
// hold) and never reaches past `end`. Returns 0 for an empty range.
uint16_t RunLengthEncoding::_countIdenticalBytes(uint8_t *start, uint8_t *end) {
    if (start >= end) {
        // Without this guard the scan would begin at start+1, i.e. already past the end.
        return 0;
    }

    // Clamp by length instead of by pointer: forming `start + 65535` when fewer bytes
    // remain would create a pointer beyond the buffer, which is undefined behaviour.
    // The cap also prevents overflow of the 16bit counter.
    const size_t available = static_cast<size_t>(end - start);
    const size_t maxLength = std::min<size_t>(available, std::numeric_limits<uint16_t>::max());

    const uint8_t *const runBegin = start;
    const uint8_t *const runEnd = runBegin + maxLength;
    const uint8_t value = *runBegin;

    // First byte that differs from the run's value, or runEnd if all bytes are identical.
    const uint8_t *const firstDifferent =
        std::find_if(runBegin + 1, runEnd, [value](uint8_t byte) { return byte != value; });

    return static_cast<uint16_t>(firstDifferent - runBegin);
}
|
|
|
|
// Copies everything written to `stream` so far into a new Data object of the
// same size and returns it.
Data RunLengthEncoding::_extractData(ostringstream *stream) {
    const string bytes = stream->str();
    const size_t numBytes = bytes.size();

    Data result(numBytes);
    std::memcpy(result.data(), bytes.data(), numBytes);
    return result;
}
|
|
|
|
// Decodes `size` bytes at `data` that were produced by the run-length encoder
// and returns the decoded bytes. Runs are read alternately as "arbitrary" and
// "identical", starting with an arbitrary run, until the input is exhausted.
Data RunLengthEncoding::Decompress(const void *data, size_t size) {
    istringstream input;
    _parseData(static_cast<const uint8_t*>(data), size, &input);

    ostringstream output;
    bool nextIsArbitraryRun = true;
    while (_hasData(&input)) {
        if (nextIsArbitraryRun) {
            _decodeArbitraryWords(&input, &output);
        } else {
            _decodeIdenticalWords(&input, &output);
        }
        // The two run kinds strictly alternate.
        nextIsArbitraryRun = !nextIsArbitraryRun;
    }

    return _extractData(&output);
}
|
|
|
|
// Returns true if the end-of-file flag of `str` is not set after peeking at
// the next character.
bool RunLengthEncoding::_hasData(istringstream *str) {
    // peek() does not consume anything; it sets eofbit if no character is left.
    str->peek();
    const bool reachedEnd = str->eof();
    return !reachedEnd;
}
|
|
|
|
void RunLengthEncoding::_parseData(const uint8_t *data, size_t size, istringstream *result) {
|
|
result->str(string((const char*)data, size));
|
|
}
|
|
|
|
// Decodes one run of arbitrary bytes: reads a uint16_t length field from
// `stream`, then that many raw bytes, and appends those bytes to
// `decompressed`. Asserts that the stream is still good after each read.
void RunLengthEncoding::_decodeArbitraryWords(istringstream *stream, ostringstream *decompressed) {
    uint16_t runLength;
    stream->read(reinterpret_cast<char*>(&runLength), sizeof(runLength));
    ASSERT(stream->good(), "Premature end of stream");

    // Temporary buffer holding the raw bytes of this run.
    Data buffer(runLength);
    stream->read((char*)buffer.data(), runLength);
    ASSERT(stream->good(), "Premature end of stream");

    decompressed->write((const char*)buffer.data(), buffer.size());
}
|
|
|
|
// Decodes one run of identical bytes: reads a uint16_t length field and a
// single byte value from `stream`, then appends that value `length` times to
// `decompressed`. Asserts that the stream is still good after each read.
void RunLengthEncoding::_decodeIdenticalWords(istringstream *stream, ostringstream *decompressed) {
    uint16_t runLength;
    stream->read(reinterpret_cast<char*>(&runLength), sizeof(runLength));
    ASSERT(stream->good(), "Premature end of stream");

    uint8_t repeatedByte;
    stream->read(reinterpret_cast<char*>(&repeatedByte), 1);
    ASSERT(stream->good(), "Premature end of stream");

    // Expand the run into a temporary buffer filled with the repeated value.
    Data expanded(runLength);
    std::memset(expanded.data(), repeatedByte, expanded.size());
    decompressed->write((const char*)expanded.data(), expanded.size());
}
|
|
|
|
}
|
|
}
|