This commit is contained in:
Andrey Gumirov
2024-01-11 02:44:53 +07:00
parent 8780822f95
commit b6a619303d
6 changed files with 201 additions and 57 deletions

View File

@ -5,4 +5,6 @@ add_executable(main
bitstream.h bitstream.h
huffman_table.cpp huffman_table.cpp
huffman_table.h huffman_table.h
MashZip.cpp
MashZip.h
main.cpp) main.cpp)

View File

@ -1,4 +1,14 @@
#include "huffman_table.h"
#include <istream>
#include <fstream>
#define MAGIC "MASH"
// Entry points of the MashZip tool, operating on standard character streams.
// Only declarations are given here; the definitions are not part of this
// header (presumably they live in MashZip.cpp -- confirm).
class MashZip {
public:
    // The class declares no data members, so the compiler-generated
    // constructor is sufficient.
    MashZip() = default;

    // Reads a mashzip stream from `is` and writes the result to `os`.
    // NOTE(review): judging by the name and by main(), this is the inverse of
    // mashzip_file(); the exact stream format is defined by the implementation.
    void unmashzip_stream(std::basic_istream<char> &is, std::basic_ostream<char> &os);

    // Reads from `cs` and `is` and writes the result to `os`.
    // NOTE(review): main() passes two separate streams with identical content
    // as `cs` and `is`; presumably `cs` is a first pass used to build the code
    // table and `is` is the pass that gets encoded -- confirm against the
    // implementation before relying on this.
    void mashzip_file(std::basic_istream<char> &cs, std::basic_istream<char> &is, std::basic_ostream<char> &os);
};

View File

@ -6,11 +6,16 @@
// i.e. when the stream is 0b12345678 0bABCDEFGH and the command is
// to read 12 bits:
//     out = 00000000 00000000 0000EFGH 12345678 -- wrong
// to read 2 bits:
//     out = 00000000 00000000 00000000 00000078
// TODO (target layout: bits are taken from the least-significant end of each
// byte, and bits taken earlier end up more significant in `out`):
// to read 12 bits:
//     out = 00000000 00000000 00008765 4321HGFE
// to read 2 bits:
//     out = 00000000 00000000 00000000 00000087
int ibitstream::getbits(size_t n) { int ibitstream::getbits(size_t n) {
std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl; // std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl;
int out = 0, read = 0, to_read; int out = 0, read = 0, to_read;
if (n > 32 || n < 0) { if (n > 32 || n < 0) {
@ -25,16 +30,29 @@ int ibitstream::getbits(size_t n) {
while (n > 0) { while (n > 0) {
to_read = min(n, 8 - this->count); to_read = min(n, 8 - this->count);
std::cerr << "Iter n: " << n << " count: " << this->count << " " // std::cerr << "Iter n: " << n << " count: " << this->count << " "
<< this->cache << " to_read: " << to_read << " already read: " << read << std::endl; // << this->cache << " to_read: " << to_read << " already read: " << read << std::endl;
// cache & 0b11111000 if count = 3; // cache & 0b11111000 if count = 3;
// cache & 0b10000000 if count = 7; // cache & 0b10000000 if count = 7;
// cache & 0b11111111 if count = 0, etc; // cache & 0b11111111 if count = 0, etc;
uint8_t mask = (((1 << to_read) - 1) << this->count); uint8_t mask = (((1 << to_read) - 1) << this->count);
out |= ((cache & mask) >> this->count) << read; uint8_t chunk = ((cache & mask) >> this->count);
std::cerr << "Read result: " << std::bitset<8>((cache & mask) >> this->count) << " " << std::bitset<32>(((cache & mask) >> this->count) << read) << std::endl; // todo inverse chunk
uint8_t inv = 0;
for (size_t i = 0; i < to_read; i++)
{
inv |= ((chunk >> i) & 1) << (to_read - i - 1);
}
out <<= to_read; // shift by length of chunk
out |= inv; // concat with chunk
// out |= inv << read;
// std::cerr << "Mask " << std::bitset<8>(mask) << " chunk " << std::bitset<8>(chunk) <<
// " inv " << std::bitset<8>(inv) << " out " << std::bitset<32>(out) << std::endl;
this->count += to_read; this->count += to_read;
read += to_read; read += to_read;
@ -51,13 +69,33 @@ int ibitstream::getbits(size_t n) {
} }
void obitstream::writebits(short bits, size_t n) { void obitstream::writebits(short bits, size_t n) {
std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl; // std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl;
int written = 0, to_write; int written = 0, to_write;
while (n > 0) { while (n > 0) {
to_write = min(n, 8 - this->count); to_write = min(n, 8 - this->count);
uint8_t chunk = (bits & (((1 << to_write) - 1) << written)) >> written; // mask:
this->cache |= (chunk << this->count); // ------- n -----------------------------
// 00000..00000 1...1111111111 000000..000 == bits
// - written - | - to_write - | - offset -
// so offset = n - to_write
// because n = n - to_write at every step, so written is already present
uint16_t mask = (((1 << to_write) - 1) << (n - to_write));
uint8_t chunk = (bits & mask) >> (n - to_write);
// todo inverse bits in chunk
uint8_t inv = 0;
for (size_t i = 0; i < to_write; i++)
{
inv |= ((chunk >> i) & 1) << (to_write - i - 1);
}
// std::cerr << "Chunk " << std::bitset<8>(chunk) << " inv " << std::bitset<8>(inv) << " to_write "
// << to_write << " written " << written << " n " << n << " mask " << std::bitset<16>(mask)
// << " offset " << (n - to_write) << std::endl;
this->cache |= (inv << this->count);
this->count += to_write; this->count += to_write;
written += to_write; written += to_write;
@ -65,7 +103,7 @@ void obitstream::writebits(short bits, size_t n) {
if (this->count == 8){ if (this->count == 8){
// flush chunk // flush chunk
std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl; // std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
os << this->cache; os << this->cache;
os.flush(); os.flush();
@ -78,7 +116,7 @@ void obitstream::writebits(short bits, size_t n) {
} }
void obitstream::flush() { void obitstream::flush() {
std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl; // std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
os << this->cache; os << this->cache;
os.flush(); os.flush();

View File

@ -6,22 +6,32 @@
#include <queue> #include <queue>
#include <iostream> #include <iostream>
#define HEADER_SIZE 128 void initialize_table(const int sumLen,
const std::map<int, std::set<char> > &huffmanLengths,
void initialize_table(const std::map<int, std::set<char> > &huffmanLengths, std::unordered_map<char, std::pair<int, short> > &codingTable,
std::unordered_map<char, std::pair<int, short> > &table) std::vector<char> &symbols,
std::array<int, MAX_LEN> &counts)
{ {
int nextbl = 0; int nextbl = 0, offset = 0; // offset = total offset to symbols of current len
short code = 0; short code = 0;
symbols.resize(sumLen);
counts.fill(0);
// std::cerr << "Sum len " << sumLen << std::endl;
for (auto lenCodePairIt = huffmanLengths.begin(); lenCodePairIt != huffmanLengths.end(); lenCodePairIt++) for (auto lenCodePairIt = huffmanLengths.begin(); lenCodePairIt != huffmanLengths.end(); lenCodePairIt++)
{ {
int cnt = 0; // counter of symbols of current code length
auto lenCodePair = *lenCodePairIt; auto lenCodePair = *lenCodePairIt;
counts[lenCodePair.first] = lenCodePair.second.size();
// std::cerr << "Counts[" << lenCodePair.first << "] " << counts[lenCodePair.first] << std::endl;
for (auto it = lenCodePair.second.begin(); it != lenCodePair.second.end(); it++) for (auto it = lenCodePair.second.begin(); it != lenCodePair.second.end(); it++)
{ {
table[*it].first = lenCodePair.first; // save current bit length for code codingTable[*it].first = lenCodePair.first; // save current bit length for code
table[*it].second = code; codingTable[*it].second = code;
// code := (code + 1) << ((bit length of the next symbol) (current bit length)) // code := (code + 1) << ((bit length of the next symbol) (current bit length))
// code++; // code++;
@ -33,15 +43,20 @@ void initialize_table(const std::map<int, std::set<char> > &huffmanLengths,
nextbl = lenCodePair.first; nextbl = lenCodePair.first;
} }
// std::cerr << "symbols[" << offset + cnt << "] =" << (*it) << " code " << std::bitset<16>(code) << std::endl;
symbols[offset + cnt] = *it;
code = (code + 1) << (nextbl - lenCodePair.first); code = (code + 1) << (nextbl - lenCodePair.first);
cnt++;
} }
offset += cnt;
// code <<= 1; // code <<= 1;
} }
} }
HuffmanTable::HuffmanTable(uint8_t *header) { HuffmanTable::HuffmanTable(const char *header) {
int cnt1, cnt2; int cnt1, cnt2, total_cnt = 0;
std::map<int, std::set<char> > huffmanLengths; std::map<int, std::set<char> > huffmanLengths;
for (int i = 0; i < HEADER_SIZE; i++) { for (int i = 0; i < HEADER_SIZE; i++) {
@ -49,13 +64,15 @@ HuffmanTable::HuffmanTable(uint8_t *header) {
cnt2 = (header[i] & 0b1111); cnt2 = (header[i] & 0b1111);
if (cnt1 != 0) huffmanLengths[cnt1].insert((char)(i * 2)); if (cnt1 != 0) huffmanLengths[cnt1].insert((char)(i * 2));
if (cnt2 != 0) huffmanLengths[cnt2].insert((char)(i * 2 + 1)); if (cnt2 != 0) huffmanLengths[cnt2].insert((char)(i * 2 + 1));
total_cnt += cnt1 + cnt2;
} }
// build up codes // build up codes
initialize_table(huffmanLengths, this->huffmanCodes); initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts);
} }
void get_lengths(Node* root, int len, void get_lengths(Node* root, int len, int &cnt,
std::map<int, std::set<char> > &huffmanLengths) std::map<int, std::set<char> > &huffmanLengths)
{ {
if (!root) if (!root)
@ -65,11 +82,12 @@ void get_lengths(Node* root, int len,
if (root->isLeaf()) { if (root->isLeaf()) {
// huffmanCode[root->ch] = str; // huffmanCode[root->ch] = str;
// std::cerr << "Got leaf: " << root->getChar() << std::endl; // std::cerr << "Got leaf: " << root->getChar() << std::endl;
cnt++;
huffmanLengths[len].insert(root->getChar()); huffmanLengths[len].insert(root->getChar());
} }
get_lengths(root->getLeft(), len + 1, huffmanLengths); get_lengths(root->getLeft(), len + 1, cnt, huffmanLengths);
get_lengths(root->getRight(), len + 1, huffmanLengths); get_lengths(root->getRight(), len + 1, cnt, huffmanLengths);
} }
HuffmanTable::HuffmanTable(std::basic_istream<char> &is) { HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
@ -81,7 +99,7 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
freq[ch]++; freq[ch]++;
} }
std::cerr << "Calculated freqs" << std::endl; // std::cerr << "Calculated freqs" << std::endl;
// Create a priority queue to store live nodes of // Create a priority queue to store live nodes of
// Huffman tree; // Huffman tree;
@ -94,7 +112,7 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
pq.push(new_node); pq.push(new_node);
} }
std::cerr << "Filled PQ: " << pq.size() << std::endl; // std::cerr << "Filled PQ: " << pq.size() << std::endl;
// do till there is more than one node in the queue // do till there is more than one node in the queue
while (pq.size() != 1) while (pq.size() != 1)
@ -119,11 +137,20 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
Node* root = pq.top(); Node* root = pq.top();
std::map<int, std::set<char> > huffmanLengths; std::map<int, std::set<char> > huffmanLengths;
get_lengths(root, 0, huffmanLengths); int total_cnt = 0;
get_lengths(root, 0, total_cnt, huffmanLengths);
// std::cerr << "Got lengths: " << huffmanLengths.size() << std::endl; // std::cerr << "Got lengths: " << huffmanLengths.size() << std::endl;
initialize_table(huffmanLengths, this->huffmanCodes); initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts);
// for (auto s : this->symbols) {
// std::cerr << "Symbol " << s << std::endl;
// }
// for (int i = 0; i < this->counts.size(); i++) {
// std::cerr << "Count for len " << i << " " << this->counts[i] << std::endl;
// }
} }
std::pair<int, short> HuffmanTable::operator[](const char &c) { std::pair<int, short> HuffmanTable::operator[](const char &c) {
@ -133,11 +160,45 @@ std::pair<int, short> HuffmanTable::operator[](const char &c) {
void HuffmanTable::write_symbol(obitstream &os, const char &c) { void HuffmanTable::write_symbol(obitstream &os, const char &c) {
if (huffmanCodes.find(c) == huffmanCodes.end()) throw std::runtime_error("No code in table for char!"); if (huffmanCodes.find(c) == huffmanCodes.end()) throw std::runtime_error("No code in table for char!");
std::cerr << "Write code for " << c << " " << (int)c << " : " << std::bitset<16>(huffmanCodes[c].second) << " " << " len " << huffmanCodes[c].first << std::endl;
os.writebits(huffmanCodes[c].second, huffmanCodes[c].first); os.writebits(huffmanCodes[c].second, huffmanCodes[c].first);
} }
uint8_t *HuffmanTable::to_header() { int HuffmanTable::decode_one_symbol(ibitstream &bs)
uint8_t *header = new uint8_t[HEADER_SIZE]; {
uint16_t code = 0;
int len = 1, first = 0, index = 0;
while (len <= MAX_LEN) {
// read one bit
uint16_t bit = (uint16_t) bs.getbits(1);
code |= bit;
int count = this->counts[len];
// std::cerr << "Read bit " << bit << " code " << std::bitset<16>(code) << " len " << len <<
// " first " << std::bitset<16>(first) << " index " << index << " count " << count << std::endl;
if (code < first + count) {
return this->symbols[index + (code - first)];
}
index += count;
first += count;
first <<= 1;
code <<= 1;
len++;
}
return -1;
}
char *HuffmanTable::to_header() {
char *header = new char[HEADER_SIZE];
for (size_t i = 0; i < HEADER_SIZE; i++) for (size_t i = 0; i < HEADER_SIZE; i++)
{ {

View File

@ -3,6 +3,9 @@
#include "bitstream.h" #include "bitstream.h"
#define HEADER_SIZE 128
#define MAX_LEN 16
#ifndef HUFFMAN_TABLE #ifndef HUFFMAN_TABLE
#define HUFFMAN_TABLE #define HUFFMAN_TABLE
@ -10,16 +13,18 @@ class HuffmanTable
{ {
private: private:
std::unordered_map<char, std::pair<int, short> > huffmanCodes; std::unordered_map<char, std::pair<int, short> > huffmanCodes;
std::vector<char> symbols;
std::array<int, 16> counts;
public: public:
// Given the list of code lengths length[0..n-1] representing a canonical // Given the list of code lengths length[0..n-1] representing a canonical
// Huffman code for n symbols, construct the tables required to decode those // Huffman code for n symbols, construct the tables required to decode those
// codes. // codes.
HuffmanTable(uint8_t *header); HuffmanTable(const char *header);
// Build from input stream // Build from input stream
HuffmanTable(std::basic_istream<char> &is); HuffmanTable(std::basic_istream<char> &is);
uint8_t *to_header(); char *to_header();
std::pair<int, short> operator[](const char &c); std::pair<int, short> operator[](const char &c);

View File

@ -9,6 +9,7 @@
#include "bitstream.h" #include "bitstream.h"
#include "huffman_table.h" #include "huffman_table.h"
#include "MashZip.h"
using namespace std; using namespace std;
@ -201,18 +202,18 @@ int main(int argc, char **argv)
// // 10111010 11110101 // // 10111010 11110101
// // ^^ // // ^^
// int a = ibs.getbits(2); // int a = ibs.getbits(2);
// cout << bitset<2>(a) << endl; // cout << bitset<2>(a) << endl; // 01
// // 10111010 11110101 // // 10111010 11110101
// // ^^^^ // // ^^^^
// int b = ibs.getbits(4); // int b = ibs.getbits(4);
// cout << bitset<4>(b) << endl; // cout << bitset<4>(b) << endl; // 0111
// // 10111010 11110101 // // 10111010 11110101
// // ^^ ^^^^^^^ // // ^^ ^^^^^^^
// b = ibs.getbits(9); // b = ibs.getbits(9);
// cout << bitset<9>(b) << endl; // cout << bitset<9>(b) << endl; // 011010111
// // 10111010 11110101 // // 10111010 11110101
// // ^ + overflow // // ^ + overflow
@ -230,11 +231,13 @@ int main(int argc, char **argv)
// cout << "After 3 bits: " << so.str() << endl; // cout << "After 3 bits: " << so.str() << endl;
// // cache here: 00000111 // // cache here: 00000111
// obs.writebits(0xf5, 7);
// // here: Flush: 10101111 // // de = 11011110
// obs.writebits(0xde, 7);
// // here: Flush: 11101111
// cout << "After 7 bits: " << so.str() << endl; // cout << "After 7 bits: " << so.str() << endl;
// obs.flush(); // obs.flush();
// // here: Flush: 00000011 // // here: Flush: 00000001
// cout << "After flush: " << so.str() << endl; // cout << "After flush: " << so.str() << endl;
@ -245,34 +248,59 @@ int main(int argc, char **argv)
// cout << "After flush: " << so.str() << endl; // cout << "After flush: " << so.str() << endl;
string s = "Some long text!!!!\x01\x02\x03\x04"; // string s = ;
stringstream ss1(s); stringstream ss1("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!");
stringstream ss2("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!");
stringstream so1;
stringstream so2;
HuffmanTable ht(ss1); MashZip mz;
uint8_t *header = ht.to_header(); mz.mashzip_file(ss1, ss2, so1);
// for (size_t i = 0; i < 128; i++)
// { std::cout << "After mashzip: " << so1.str();
// cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl;
mz.unmashzip_stream(so1, so2);
std::cout << "After unmashzip: " << so2.str();
// HuffmanTable ht(ss1);
// char *header = ht.to_header();
// // for (size_t i = 0; i < 128; i++)
// // {
// // cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl;
// // }
// stringstream test;
// // test << "MASH"; // magic
// // for (size_t i = 0; i < 128; i++) {
// // test << header[i];
// // }
// obitstream some_stream(test);
// for (char c : s) {
// ht.write_symbol(some_stream, c);
// } // }
// some_stream.flush();
ostringstream test; // std::cout << test.str() << endl;
test << "MASH"; // magic // ibitstream some_instream(test);
for (size_t i = 0; i < 128; i++) { // while(true) {
test << header[i]; // char sym = (char) ht.decode_one_symbol(some_instream);
} // if (sym < 0)
// {
obitstream some_stream(test); // std::cout << "Read all" << std::endl;
// break;
for (char c : s) { // }
ht.write_symbol(some_stream, c); // std::cerr << "Sym: " << sym << std::endl;
} // }
some_stream.flush();
std::cout << test.str();
return 0; return 0;
} }