diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bd7b29..c5a890c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,4 +5,6 @@ add_executable(main bitstream.h huffman_table.cpp huffman_table.h + MashZip.cpp + MashZip.h main.cpp) \ No newline at end of file diff --git a/MashZip.h b/MashZip.h index 4fd7d9b..a087e19 100644 --- a/MashZip.h +++ b/MashZip.h @@ -1,4 +1,14 @@ +#include "huffman_table.h" + +#include +#include + +#define MAGIC "MASH" class MashZip { +public: + MashZip() {}; + void unmashzip_stream(std::basic_istream &is, std::basic_ostream &os); + void mashzip_file(std::basic_istream &cs, std::basic_istream &is, std::basic_ostream &os); }; \ No newline at end of file diff --git a/bitstream.cpp b/bitstream.cpp index 8a059c7..53aa835 100644 --- a/bitstream.cpp +++ b/bitstream.cpp @@ -6,11 +6,16 @@ // i.e when stream is 0b12345678 0bABCDEFGH and command is // to read 12 bits: -// out = 00000000 00000000 0000EFGH 12345678 +// out = 00000000 00000000 0000EFGH 12345678 -- wrong // to read 2 bits: // out = 00000000 00000000 00000000 00000078 +// TODO: +// to read 12 bits: +// out = 00000000 00000000 00008765 4321HGFE +// to read 2 bits: +// out = 00000000 00000000 00000000 00000087 int ibitstream::getbits(size_t n) { - std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl; + // std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl; int out = 0, read = 0, to_read; if (n > 32 || n < 0) { @@ -25,16 +30,29 @@ int ibitstream::getbits(size_t n) { while (n > 0) { to_read = min(n, 8 - this->count); - std::cerr << "Iter n: " << n << " count: " << this->count << " " - << this->cache << " to_read: " << to_read << " already read: " << read << std::endl; + // std::cerr << "Iter n: " << n << " count: " << this->count << " " + // << this->cache << " to_read: " << to_read << " already read: " << read << std::endl; // cache & 0b11111000 if count = 3; // cache & 0b10000000 if count = 7; // cache & 0b11111111 if count = 0, etc; uint8_t mask = (((1 << to_read) - 1) << this->count); - out |= ((cache & mask) >> this->count) << read; + uint8_t chunk = ((cache & mask) >> this->count); - std::cerr << "Read result: " << std::bitset<8>((cache & mask) >> this->count) << " " << std::bitset<32>(((cache & mask) >> this->count) << read) << std::endl; + // todo inverse chunk + uint8_t inv = 0; + for (size_t i = 0; i < to_read; i++) + { + inv |= ((chunk >> i) & 1) << (to_read - i - 1); + } + + out <<= to_read; // shift by length of chunk + out |= inv; // concat with chunk + + // out |= inv << read; + + // std::cerr << "Mask " << std::bitset<8>(mask) << " chunk " << std::bitset<8>(chunk) << + // " inv " << std::bitset<8>(inv) << " out " << std::bitset<32>(out) << std::endl; this->count += to_read; read += to_read; @@ -51,13 +69,33 @@ int ibitstream::getbits(size_t n) { } void obitstream::writebits(short bits, size_t n) { - std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl; + // std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl; int written = 0, to_write; while (n > 0) { to_write = min(n, 8 - this->count); + + // mask: + // ------- n ----------------------------- + // 00000..00000 1...1111111111 000000..000 == bits + // - written - | - to_write - | - offset - + // so offset = n - to_write + // because n = n - to_write at every step, so written is already present - uint8_t chunk = (bits & (((1 << to_write) - 1) << written)) >> written; - this->cache |= (chunk << this->count); + uint16_t mask = (((1 << to_write) - 1) << (n - to_write)); + uint8_t chunk = (bits & mask) >> (n - to_write); + + // todo inverse bits in chunk + uint8_t inv = 0; + for (size_t i = 0; i < to_write; i++) + { + inv |= ((chunk >> i) & 1) << (to_write - i - 1); + } + + // std::cerr << "Chunk " << std::bitset<8>(chunk) << " inv " << std::bitset<8>(inv) << " to_write " + // << to_write << " written " << written << " n " << n << " mask " << std::bitset<16>(mask) + // << " offset " << (n - to_write) << std::endl; + + this->cache |= (inv << this->count); this->count += to_write; written += to_write; @@ -65,7 +103,7 @@ void obitstream::writebits(short bits, size_t n) { if (this->count == 8){ // flush chunk - std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl; + // std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl; os << this->cache; os.flush(); @@ -78,7 +116,7 @@ void obitstream::writebits(short bits, size_t n) { } void obitstream::flush() { - std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl; + // std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl; os << this->cache; os.flush(); diff --git a/huffman_table.cpp b/huffman_table.cpp index 5eeef55..30ea03d 100644 --- a/huffman_table.cpp +++ b/huffman_table.cpp @@ -6,22 +6,32 @@ #include #include -#define HEADER_SIZE 128 - -void initialize_table(const std::map > &huffmanLengths, - std::unordered_map > &table) +void initialize_table(const int sumLen, + const std::map > &huffmanLengths, + std::unordered_map > &codingTable, + std::vector &symbols, + std::array &counts) { - int nextbl = 0; + int nextbl = 0, offset = 0; // offset = total offset to symbols of current len short code = 0; + symbols.resize(sumLen); + counts.fill(0); + + // std::cerr << "Sum len " << sumLen << std::endl; + for (auto lenCodePairIt = huffmanLengths.begin(); lenCodePairIt != huffmanLengths.end(); lenCodePairIt++) { + int cnt = 0; // counter of symbols of current code length auto lenCodePair = *lenCodePairIt; + counts[lenCodePair.first] = lenCodePair.second.size(); + // std::cerr << "Counts[" << lenCodePair.first << "] " << counts[lenCodePair.first] << std::endl; + for (auto it = lenCodePair.second.begin(); it != lenCodePair.second.end(); it++) { - table[*it].first = lenCodePair.first; // save current bit length for code - table[*it].second = code; + codingTable[*it].first = lenCodePair.first; // save current bit length for code + codingTable[*it].second = code; // code := (code + 1) << ((bit length of the next symbol) − (current bit length)) // code++; @@ -33,15 +43,20 @@ void initialize_table(const std::map > &huffmanLengths, nextbl = lenCodePair.first; } + // std::cerr << "symbols[" << offset + cnt << "] =" << (*it) << " code " << std::bitset<16>(code) << std::endl; + symbols[offset + cnt] = *it; + code = (code + 1) << (nextbl - lenCodePair.first); + cnt++; } + offset += cnt; // code <<= 1; } } -HuffmanTable::HuffmanTable(uint8_t *header) { - int cnt1, cnt2; +HuffmanTable::HuffmanTable(const char *header) { + int cnt1, cnt2, total_cnt = 0; std::map > huffmanLengths; for (int i = 0; i < HEADER_SIZE; i++) { @@ -49,13 +64,15 @@ HuffmanTable::HuffmanTable(uint8_t *header) { cnt2 = (header[i] & 0b1111); if (cnt1 != 0) huffmanLengths[cnt1].insert((char)(i * 2)); if (cnt2 != 0) huffmanLengths[cnt2].insert((char)(i * 2 + 1)); + + total_cnt += cnt1 + cnt2; } // build up codes - initialize_table(huffmanLengths, this->huffmanCodes); + initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts); } -void get_lengths(Node* root, int len, +void get_lengths(Node* root, int len, int &cnt, std::map > &huffmanLengths) { if (!root) @@ -65,11 +82,12 @@ void get_lengths(Node* root, int len, if (root->isLeaf()) { // huffmanCode[root->ch] = str; // std::cerr << "Got leaf: " << root->getChar() << std::endl; + cnt++; huffmanLengths[len].insert(root->getChar()); } - get_lengths(root->getLeft(), len + 1, huffmanLengths); - get_lengths(root->getRight(), len + 1, huffmanLengths); + get_lengths(root->getLeft(), len + 1, cnt, huffmanLengths); + get_lengths(root->getRight(), len + 1, cnt, huffmanLengths); } HuffmanTable::HuffmanTable(std::basic_istream &is) { @@ -81,7 +99,7 @@ HuffmanTable::HuffmanTable(std::basic_istream &is) { freq[ch]++; } - std::cerr << "Calculated freqs" << std::endl; + // std::cerr << "Calculated freqs" << std::endl; // Create a priority queue to store live nodes of // Huffman tree; @@ -94,7 +112,7 @@ HuffmanTable::HuffmanTable(std::basic_istream &is) { pq.push(new_node); } - std::cerr << "Filled PQ: " << pq.size() << std::endl; + // std::cerr << "Filled PQ: " << pq.size() << std::endl; // do till there is more than one node in the queue while (pq.size() != 1) @@ -119,11 +137,20 @@ HuffmanTable::HuffmanTable(std::basic_istream &is) { Node* root = pq.top(); std::map > huffmanLengths; - get_lengths(root, 0, huffmanLengths); + int total_cnt = 0; + get_lengths(root, 0, total_cnt, huffmanLengths); // std::cerr << "Got lengths: " << huffmanLengths.size() << std::endl; - initialize_table(huffmanLengths, this->huffmanCodes); + initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts); + + // for (auto s : this->symbols) { + // std::cerr << "Symbol " << s << std::endl; + // } + + // for (int i = 0; i < this->counts.size(); i++) { + // std::cerr << "Count for len " << i << " " << this->counts[i] << std::endl; + // } } std::pair HuffmanTable::operator[](const char &c) { @@ -133,11 +160,45 @@ std::pair HuffmanTable::operator[](const char &c) { void HuffmanTable::write_symbol(obitstream &os, const char &c) { if (huffmanCodes.find(c) == huffmanCodes.end()) throw std::runtime_error("No code in table for char!"); + std::cerr << "Write code for " << c << " " << (int)c << " : " << std::bitset<16>(huffmanCodes[c].second) << " " << " len " << huffmanCodes[c].first << std::endl; + os.writebits(huffmanCodes[c].second, huffmanCodes[c].first); } -uint8_t *HuffmanTable::to_header() { - uint8_t *header = new uint8_t[HEADER_SIZE]; +int HuffmanTable::decode_one_symbol(ibitstream &bs) +{ + uint16_t code = 0; + int len = 1, first = 0, index = 0; + + while (len <= MAX_LEN) { + // read one bit + uint16_t bit = (uint16_t) bs.getbits(1); + + code |= bit; + + + int count = this->counts[len]; + + // std::cerr << "Read bit " << bit << " code " << std::bitset<16>(code) << " len " << len << + // " first " << std::bitset<16>(first) << " index " << index << " count " << count << std::endl; + + + if (code < first + count) { + return this->symbols[index + (code - first)]; + } + + index += count; + first += count; + first <<= 1; + code <<= 1; + len++; + } + + return -1; +} + +char *HuffmanTable::to_header() { + char *header = new char[HEADER_SIZE]; for (size_t i = 0; i < HEADER_SIZE; i++) { diff --git a/huffman_table.h b/huffman_table.h index 40f9431..5b798a5 100644 --- a/huffman_table.h +++ b/huffman_table.h @@ -3,6 +3,9 @@ #include "bitstream.h" +#define HEADER_SIZE 128 +#define MAX_LEN 16 + #ifndef HUFFMAN_TABLE #define HUFFMAN_TABLE @@ -10,16 +13,18 @@ class HuffmanTable { private: std::unordered_map > huffmanCodes; + std::vector symbols; + std::array counts; public: // Given the list of code lengths length[0..n-1] representing a canonical // Huffman code for n symbols, construct the tables required to decode those // codes. - HuffmanTable(uint8_t *header); + HuffmanTable(const char *header); // Build from input stream HuffmanTable(std::basic_istream &is); - uint8_t *to_header(); + char *to_header(); std::pair operator[](const char &c); diff --git a/main.cpp b/main.cpp index 0752d33..e346a38 100644 --- a/main.cpp +++ b/main.cpp @@ -9,6 +9,7 @@ #include "bitstream.h" #include "huffman_table.h" +#include "MashZip.h" using namespace std; @@ -201,18 +202,18 @@ int main(int argc, char **argv) // // 10111010 11110101 // // ^^ // int a = ibs.getbits(2); -// cout << bitset<2>(a) << endl; +// cout << bitset<2>(a) << endl; // 01 // // 10111010 11110101 // // ^^^^ // int b = ibs.getbits(4); -// cout << bitset<4>(b) << endl; +// cout << bitset<4>(b) << endl; // 0111 // // 10111010 11110101 // // ^^ ^^^^^^^ // b = ibs.getbits(9); -// cout << bitset<9>(b) << endl; +// cout << bitset<9>(b) << endl; // 011010111 // // 10111010 11110101 // // ^ + overflow @@ -230,11 +231,13 @@ int main(int argc, char **argv) // cout << "After 3 bits: " << so.str() << endl; // // cache here: 00000111 -// obs.writebits(0xf5, 7); -// // here: Flush: 10101111 + +// // de = 11011110 +// obs.writebits(0xde, 7); +// // here: Flush: 11101111 // cout << "After 7 bits: " << so.str() << endl; // obs.flush(); -// // here: Flush: 00000011 +// // here: Flush: 00000001 // cout << "After flush: " << so.str() << endl; @@ -245,34 +248,59 @@ int main(int argc, char **argv) // cout << "After flush: " << so.str() << endl; - string s = "Some long text!!!!\x01\x02\x03\x04"; + // string s = ; - stringstream ss1(s); + stringstream ss1("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!"); + stringstream ss2("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!"); + stringstream so1; + stringstream so2; - HuffmanTable ht(ss1); + MashZip mz; - uint8_t *header = ht.to_header(); - // for (size_t i = 0; i < 128; i++) - // { - // cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl; - // } + mz.mashzip_file(ss1, ss2, so1); + + std::cout << "After mashzip: " << so1.str(); + + mz.unmashzip_stream(so1, so2); + + std::cout << "After unmashzip: " << so2.str(); + + // HuffmanTable ht(ss1); + + // char *header = ht.to_header(); + // // for (size_t i = 0; i < 128; i++) + // // { + // // cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl; + // // } - ostringstream test; + // stringstream test; - test << "MASH"; // magic + // // test << "MASH"; // magic - for (size_t i = 0; i < 128; i++) { - test << header[i]; - } + // // for (size_t i = 0; i < 128; i++) { + // // test << header[i]; + // // } - obitstream some_stream(test); + // obitstream some_stream(test); - for (char c : s) { - ht.write_symbol(some_stream, c); - } - some_stream.flush(); + // for (char c : s) { + // ht.write_symbol(some_stream, c); + // } + // some_stream.flush(); - std::cout << test.str(); + // std::cout << test.str() << endl; + + // ibitstream some_instream(test); + + // while(true) { + // char sym = (char) ht.decode_one_symbol(some_instream); + // if (sym < 0) + // { + // std::cout << "Read all" << std::endl; + // break; + // } + // std::cerr << "Sym: " << sym << std::endl; + // } return 0; } \ No newline at end of file