#include "RunLengthEncoding.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <sstream>
#include <string>

// NOTE(review): the ASSERT macro is assumed to come from this cpp-utils header;
// the original include targets were lost, so confirm the path against the build.
#include <cpp-utils/assert/assert.h>

using cpputils::Data;
using std::string;
using std::ostringstream;
using std::istringstream;

namespace blockstore {
namespace compressing {

// Format: alternately store a run of arbitrary bytes and a run of identical bytes.
// Each run is preceded by its length. Length fields are uint16_t, written as raw
// memory (i.e. in host byte order).
//   - arbitrary run: [uint16_t length][length bytes]
//   - identical run: [uint16_t length][1 byte value]
// Example: 2 - 5 - 8 - 10 - 3 - 0 - 2 - 0
// means 2 arbitrary bytes (values 5, 8), then 10 bytes storing "3" each,
// then 0 arbitrary bytes, then 2 bytes storing "0" each.

namespace {

// Number of identical consecutive bytes that makes it worthwhile to end an arbitrary run.
constexpr unsigned int MIN_IDENTICAL_RUN = 6;

// Returns the end of the longest region starting at `start` that does not go past `end`
// and whose length still fits into a uint16_t length field.
// The length is clamped as an integer first, so no pointer beyond `end` is ever formed
// (computing `start + 65535` on a shorter buffer would be undefined behavior).
uint8_t *clampedRunEnd(uint8_t *start, uint8_t *end) {
    const std::ptrdiff_t available = end - start;
    const std::ptrdiff_t maxRunLength = std::numeric_limits<uint16_t>::max();
    return start + std::min(available, maxRunLength);
}

}  // namespace

// Compresses `data` into the run-length format described above.
// Empty input yields empty output.
Data RunLengthEncoding::Compress(const Data &data) {
    ostringstream compressed;
    // The encoding helpers take non-const pointers, so constness is cast away here;
    // the input bytes are only read, never written.
    uint8_t *current = (uint8_t*)data.data();
    uint8_t *end = current + data.size();
    while (current < end) {
        _encodeArbitraryWords(&current, end, &compressed);
        ASSERT(current <= end, "Overflow");
        if (current == end) {
            // The input ended with an arbitrary run; no identical run follows.
            break;
        }
        _encodeIdenticalWords(&current, end, &compressed);
        ASSERT(current <= end, "Overflow");
    }
    return _extractData(&compressed);
}

// Writes one arbitrary run (length field followed by the raw bytes) starting at *current
// to `output` and advances *current past the bytes that were consumed.
void RunLengthEncoding::_encodeArbitraryWords(uint8_t **current, uint8_t* end, ostringstream *output) {
    const uint16_t size = _arbitraryRunLength(*current, end);
    output->write(reinterpret_cast<const char*>(&size), sizeof(uint16_t));
    output->write(reinterpret_cast<const char*>(*current), size);
    *current += size;
}

// Returns how many bytes, starting at `start`, should be stored as an arbitrary run.
// The run stops right before the first sequence of MIN_IDENTICAL_RUN identical bytes,
// at the uint16_t length limit, or at `end`, whichever comes first.
uint16_t RunLengthEncoding::_arbitraryRunLength(uint8_t *start, uint8_t* end) {
    // Each stopping of an arbitrary bytes run costs us 5 bytes, because we have to store
    // the length of the identical bytes run (2 bytes), the identical byte itself (1 byte)
    // and the length of the next arbitrary bytes run (2 bytes).
    // So to gain anything from stopping an arbitrary bytes run, at least 6 bytes have to be identical.

    // realEnd keeps the run length within the 16 bit counter.
    uint8_t *realEnd = clampedRunEnd(start, end);
    if (start == realEnd) {
        // Empty region: nothing to read (and *start must not be dereferenced).
        return 0;
    }

    // Count identical consecutive bytes and return as soon as a run of MIN_IDENTICAL_RUN is found.
    uint8_t lastByte = static_cast<uint8_t>(*start + 1); // Something different from the first byte
    unsigned int numIdenticalBytes = 1;
    for (uint8_t *current = start; current != realEnd; ++current) {
        if (*current == lastByte) {
            ++numIdenticalBytes;
            if (numIdenticalBytes == MIN_IDENTICAL_RUN) {
                // `current` points at the last byte of the identical run found so far.
                // The arbitrary run has to end at the first identical byte, which is
                // MIN_IDENTICAL_RUN - 1 bytes earlier.
                return static_cast<uint16_t>(current - start - (MIN_IDENTICAL_RUN - 1));
            }
        } else {
            numIdenticalBytes = 1;
        }
        lastByte = *current;
    }
    // It wasn't worth stopping the arbitrary bytes run anywhere.
    // The whole (clamped) region is one arbitrary run.
    return static_cast<uint16_t>(realEnd - start);
}

// Writes one identical run (length field followed by the repeated byte value) starting at
// *current to `output` and advances *current past the run.
// Precondition: *current < end, because the byte at *current is read as the run's value.
void RunLengthEncoding::_encodeIdenticalWords(uint8_t **current, uint8_t* end, ostringstream *output) {
    const uint16_t size = _countIdenticalBytes(*current, end);
    output->write(reinterpret_cast<const char*>(&size), sizeof(uint16_t));
    output->write(reinterpret_cast<const char*>(*current), 1);
    *current += size;
}

// Returns the number of leading bytes in [start, end) that are equal to *start,
// capped at the uint16_t length limit. Returns 0 for an empty region.
uint16_t RunLengthEncoding::_countIdenticalBytes(uint8_t *start, uint8_t *end) {
    uint8_t *realEnd = clampedRunEnd(start, end); // This keeps the result within the 16 bit counter
    // The predicate is only evaluated for bytes inside [start, realEnd),
    // so *start is never read when the region is empty.
    uint8_t *firstDifferent = std::find_if(start, realEnd, [start] (uint8_t byte) {
        return byte != *start;
    });
    return static_cast<uint16_t>(firstDifferent - start);
}

// Copies everything written to `stream` so far into a newly allocated Data object.
Data RunLengthEncoding::_extractData(ostringstream *stream) {
    const string str = stream->str();
    Data data(str.size());
    if (!str.empty()) {
        // Skipped for empty output so memcpy is never handed a zero-size buffer pointer.
        std::memcpy(data.data(), str.data(), str.size());
    }
    return data;
}

// Decompresses `size` bytes at `data` that were produced by Compress().
// Truncated input trips the "Premature end of stream" assertions in the decode helpers.
Data RunLengthEncoding::Decompress(const void *data, size_t size) {
    istringstream stream;
    _parseData(static_cast<const uint8_t*>(data), size, &stream);
    ostringstream decompressed;
    while (_hasData(&stream)) {
        _decodeArbitraryWords(&stream, &decompressed);
        if (!_hasData(&stream)) {
            // The compressed data ended with an arbitrary run.
            break;
        }
        _decodeIdenticalWords(&stream, &decompressed);
    }
    return _extractData(&decompressed);
}

// Returns true iff there is at least one more byte to read from `str`.
bool RunLengthEncoding::_hasData(istringstream *str) {
    // peek() sets the eof flag when no further byte is available.
    str->peek();
    return !str->eof();
}

// Loads the `size` bytes at `data` into `result` as the content to be read from it.
void RunLengthEncoding::_parseData(const uint8_t *data, size_t size, istringstream *result) {
    result->str(string(reinterpret_cast<const char*>(data), size));
}

void RunLengthEncoding::_decodeArbitraryWords(istringstream *stream, ostringstream *decompressed) { uint16_t size; stream->read((char*)&size, sizeof(uint16_t)); ASSERT(stream->good(), "Premature end of stream"); Data run(size); stream->read((char*)run.data(), size); ASSERT(stream->good(), "Premature end of stream"); decompressed->write((const char*)run.data(), run.size()); } void RunLengthEncoding::_decodeIdenticalWords(istringstream *stream, ostringstream *decompressed) { uint16_t size; stream->read((char*)&size, sizeof(uint16_t)); ASSERT(stream->good(), "Premature end of stream"); uint8_t value; stream->read((char*)&value, 1); ASSERT(stream->good(), "Premature end of stream"); Data run(size); std::memset(run.data(), value, run.size()); decompressed->write((const char*)run.data(), run.size()); } } }